Diffstat (limited to 'REORG.TODO/sysdeps/powerpc/powerpc64/power7')
38 files changed, 6793 insertions, 0 deletions
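The first file in this diff, add_n.S, provides the GMP-style multi-precision primitives __mpn_add_n and __mpn_sub_n: one carry-chained adde/subfe per 64-bit limb, unrolled four limbs per iteration, with the final carry recovered by the subfe r3,r0,r0 epilogue. A minimal C sketch of the semantics the assembly implements, assuming GMP limb conventions (least-significant limb first); the name ref_mpn_add_n and the explicit carry bookkeeping are illustrative only, not glibc code:

    #include <stddef.h>
    #include <stdint.h>

    typedef uint64_t mp_limb_t;

    /* rp[0..n-1] = up[] + vp[]; returns the final carry (0 or 1).
       The POWER7 code keeps this carry in the XER CA bit across its
       adde chain instead of recomputing it per limb.  */
    static mp_limb_t
    ref_mpn_add_n (mp_limb_t *rp, const mp_limb_t *up,
                   const mp_limb_t *vp, size_t n)
    {
      mp_limb_t carry = 0;
      for (size_t i = 0; i < n; i++)
        {
          mp_limb_t sum = up[i] + vp[i];
          mp_limb_t c = sum < up[i];     /* carry out of up[i] + vp[i] */
          rp[i] = sum + carry;
          carry = c | (rp[i] < sum);     /* at most one of the two carries */
        }
      return carry;
    }

The USE_AS_SUB build computes up[i] - vp[i] - borrow instead and returns the final borrow, which is what the neg r3,r3 in the epilogue produces.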
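The fpu/s_finite.S, fpu/s_isinf.S, and fpu/s_isnan.S files below share one trick: a single ftdiv against 1.0 sets flag bits in cr7 only for the special encodings (NaN, +/-INF, denormal), so the common case exits immediately through bflr and only the special encodings fall through to integer classification of the raw IEEE-754 bits. A sketch of that fallback classification in C (the abs_bits/ref_* names are illustrative, not glibc interfaces):

    #include <stdint.h>
    #include <string.h>

    /* Everything reduces to comparing |x|'s bit pattern against
       0x7ff0000000000000, the +INF encoding (exponent all ones,
       mantissa zero).  */
    static uint64_t
    abs_bits (double x)
    {
      uint64_t u;
      memcpy (&u, &x, sizeof u);          /* like the stfd/ld pair */
      return u & 0x7fffffffffffffffULL;   /* clrldi r4,r4,1: drop the sign */
    }

    static int ref_finite (double x) { return abs_bits (x) < 0x7ff0000000000000ULL; }
    static int ref_isnan (double x)  { return abs_bits (x) > 0x7ff0000000000000ULL; }
    static int ref_isinf (double x)
    {
      if (abs_bits (x) != 0x7ff0000000000000ULL)
        return 0;
      return x > 0.0 ? 1 : -1;            /* s_isinf.S: EQ, then sign picks 1/-1 */
    }

s_finite.S gets away with comparing only the upper 16 bits (lhz ... HISHORT against 0x7ff0) because finiteness depends solely on the exponent field.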
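Further down, memmove.S decides between its embedded forward memcpy and the backwards copy with two instructions: subf r9,r4,r3 / cmpld cr7,r9,r5 / blt cr7,L(memmove_bwd). A hedged sketch of that dispatch (must_copy_backward is an illustrative name):

    #include <stddef.h>
    #include <stdint.h>

    /* A forward copy is unsafe only when dest lands inside
       [src, src + len), i.e. when the unsigned difference dest - src
       is smaller than len.  dest < src wraps to a huge value, so the
       single unsigned compare also covers that (always safe) case.  */
    static int
    must_copy_backward (const void *dest, const void *src, size_t len)
    {
      return (uintptr_t) dest - (uintptr_t) src < (uintptr_t) len;
    }

This is why memmove can reuse the memcpy body unchanged whenever the test fails.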
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/Implies b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/Implies
new file mode 100644
index 0000000000..9d68f39d22
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/Implies
@@ -0,0 +1,2 @@
+powerpc/powerpc64/power6/fpu
+powerpc/powerpc64/power6
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/Makefile b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/Makefile
new file mode 100644
index 0000000000..89a2296085
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/Makefile
@@ -0,0 +1,11 @@
+ifeq ($(subdir),elf)
+# Prevent the use of VSX registers and insns in _dl_start, which under -O3
+# optimization may require a TOC reference before relocations are resolved.
+CFLAGS-rtld.c += -mno-vsx
+endif
+
+ifeq ($(subdir),string)
+sysdep_routines += strstr-ppc64
+CFLAGS-strncase.c += -funroll-loops
+CFLAGS-strncase_l.c += -funroll-loops
+endif
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/add_n.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/add_n.S
new file mode 100644
index 0000000000..6425afbc9f
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/add_n.S
@@ -0,0 +1,98 @@
+/* PowerPC64 mpn_add_n/mpn_sub_n -- mpn addition and
+   subtraction.
+   Copyright (C) 2003-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* cycles/limb
+ * POWER7	2.18
+ */
+
+#ifdef USE_AS_SUB
+# define FUNC		__mpn_sub_n
+# define ADDSUBC	subfe
+#else
+# define FUNC		__mpn_add_n
+# define ADDSUBC	adde
+#endif
+
+#define RP	r3
+#define UP	r4
+#define VP	r5
+#define N	r6
+
+EALIGN(FUNC, 5, 0)
+#ifdef USE_AS_SUB
+	addic	r0, r1, -1	/* Set the CA bit (initial borrow).  */
+#else
+	addic	r0, r0, 0	/* Clear the CA bit (no initial carry).  */
+#endif
+	andi.	r7, N, 1
+	beq	L(bx0)
+
+	ld	r7, 0(UP)
+	ld	r9, 0(VP)
+	ADDSUBC	r11, r9, r7
+	std	r11, 0(RP)
+	cmpldi	N, 1
+	beq	L(end)
+	addi	UP, UP, 8
+	addi	VP, VP, 8
+	addi	RP, RP, 8
+
+L(bx0):	addi	r0, N, 2
+	srdi	r0, r0, 2
+	mtctr	r0
+
+	andi.	r7, N, 2
+	bne	L(mid)
+
+	addi	UP, UP, 16
+	addi	VP, VP, 16
+	addi	RP, RP, 16
+
+	.align	5
+L(top):	ld	r6, -16(UP)
+	ld	r7, -8(UP)
+	ld	r8, -16(VP)
+	ld	r9, -8(VP)
+	ADDSUBC	r10, r8, r6
+	ADDSUBC	r11, r9, r7
+	std	r10, -16(RP)
+	std	r11, -8(RP)
+L(mid):	ld	r6, 0(UP)
+	ld	r7, 8(UP)
+	ld	r8, 0(VP)
+	ld	r9, 8(VP)
+	ADDSUBC	r10, r8, r6
+	ADDSUBC	r11, r9, r7
+	std	r10, 0(RP)
+	std	r11, 8(RP)
+	addi	UP, UP, 32
+	addi	VP, VP, 32
+	addi	RP, RP, 32
+	bdnz	L(top)
+
+L(end):	subfe	r3, r0, r0
+#ifdef USE_AS_SUB
+	neg	r3, r3
+#else
+	addi	r3, r3, 1
+#endif
+	blr
+END(FUNC)
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/bcopy.c b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/bcopy.c
new file mode 100644
index 0000000000..4a6a400e7a
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/bcopy.c
@@ -0,0 +1 @@
+/* Implemented in memmove.S.  */
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/Implies b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/Implies
new file mode 100644
index 0000000000..30fa17646e
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/Implies
@@ -0,0 +1 @@
+powerpc/powerpc64/power6/fpu
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/multiarch/Implies b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/multiarch/Implies
new file mode 100644
index 0000000000..410d289a6d
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/multiarch/Implies
@@ -0,0 +1 @@
+powerpc/powerpc64/power6/fpu/multiarch
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S
new file mode 100644
index 0000000000..9ccc758c9e
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S
@@ -0,0 +1,70 @@
+/* finite().  PowerPC64/POWER7 version.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Luis Machado <luisgpm@br.ibm.com>.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <math_ldbl_opt.h>
+
+/* int __finite(x)  */
+	.section	".toc","aw"
+.LC0:	/* 1.0 */
+	.tc	FD_ONE[TC],0x3ff0000000000000
+	.section	".text"
+	.type	__finite, @function
+	.machine	power7
+EALIGN (__finite, 4, 0)
+	CALL_MCOUNT 0
+	lfd	fp0,.LC0@toc(r2)
+	ftdiv	cr7,fp1,fp0
+	li	r3,1
+	bflr	30
+
+	/* If we are here, we either have +/-INF,
+	   NaN or denormal.  */
+
+	stfd	fp1,-16(r1)	/* Transfer FP to GPR's.  */
+	ori	2,2,0		/* Force a new dispatch group.  */
+	lhz	r4,-16+HISHORT(r1)  /* Fetch the upper 16 bits of the FP value
+				       (biased exponent and sign bit).  */
+	clrlwi	r4,r4,17	/* r4 = abs(r4).  */
+	cmpwi	cr7,r4,0x7ff0	/* r4 == 0x7ff0?  */
+	bltlr	cr7		/* LT means finite, otherwise non-finite.  */
+	li	r3,0
+	blr
+	END (__finite)
+
+hidden_def (__finite)
+weak_alias (__finite, finite)
+
+/* It turns out that the 'double' version will also always work for
+   single-precision.
*/ +strong_alias (__finite, __finitef) +hidden_def (__finitef) +weak_alias (__finitef, finitef) + +#if IS_IN (libm) +# if LONG_DOUBLE_COMPAT (libm, GLIBC_2_0) +compat_symbol (libm, __finite, __finitel, GLIBC_2_0) +compat_symbol (libm, finite, finitel, GLIBC_2_0) +# endif +#else +# if LONG_DOUBLE_COMPAT (libc, GLIBC_2_0) +compat_symbol (libc, __finite, __finitel, GLIBC_2_0); +compat_symbol (libc, finite, finitel, GLIBC_2_0); +# endif +#endif diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_finitef.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_finitef.S new file mode 100644 index 0000000000..54bd94176d --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_finitef.S @@ -0,0 +1 @@ +/* This function uses the same code as s_finite.S. */ diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S new file mode 100644 index 0000000000..4482cddcfa --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S @@ -0,0 +1,69 @@ +/* isinf(). PowerPC64/POWER7 version. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Luis Machado <luisgpm@br.ibm.com>. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <math_ldbl_opt.h> + +/* int __isinf(x) */ + .section ".toc","aw" +.LC0: /* 1.0 */ + .tc FD_ONE[TC],0x3ff0000000000000 + .section ".text" + .type __isinf, @function + .machine power7 +EALIGN (__isinf, 4, 0) + CALL_MCOUNT 0 + lfd fp0,.LC0@toc(r2) + ftdiv cr7,fp1,fp0 + li r3,0 + bflr 29 /* If not INF, return. */ + + /* Either we have -INF/+INF or a denormal. */ + + stfd fp1,-16(r1) /* Transfer FP to GPR's. */ + ori 2,2,0 /* Force a new dispatch group. */ + lhz r4,-16+HISHORT(r1) /* Fetch the upper 16 bits of the FP value + (biased exponent and sign bit). */ + cmpwi cr7,r4,0x7ff0 /* r4 == 0x7ff0? */ + li r3,1 + beqlr cr7 /* EQ means INF, otherwise -INF. */ + li r3,-1 + blr + END (__isinf) + +hidden_def (__isinf) +weak_alias (__isinf, isinf) + +/* It turns out that the 'double' version will also always work for + single-precision. */ +strong_alias (__isinf, __isinff) +hidden_def (__isinff) +weak_alias (__isinff, isinff) + +#ifdef NO_LONG_DOUBLE +strong_alias (__isinf, __isinfl) +weak_alias (__isinf, isinfl) +#endif + +#if !IS_IN (libm) +# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0) +compat_symbol (libc, __isinf, __isinfl, GLIBC_2_0); +compat_symbol (libc, isinf, isinfl, GLIBC_2_0); +# endif +#endif diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_isinff.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_isinff.S new file mode 100644 index 0000000000..be759e091e --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_isinff.S @@ -0,0 +1 @@ +/* This function uses the same code as s_isinf.S. 
*/ diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S new file mode 100644 index 0000000000..46b08a0d37 --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S @@ -0,0 +1,68 @@ +/* isnan(). PowerPC64/POWER7 version. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Luis Machado <luisgpm@br.ibm.com>. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <math_ldbl_opt.h> + +/* int __isnan(x) */ + .section ".toc","aw" +.LC0: /* 1.0 */ + .tc FD_ONE[TC],0x3ff0000000000000 + .section ".text" + .type __isnan, @function + .machine power7 +EALIGN (__isnan, 4, 0) + CALL_MCOUNT 0 + lfd fp0,.LC0@toc(r2) + ftdiv cr7,fp1,fp0 + li r3,0 + bflr 30 /* If not NaN, finish. */ + + stfd fp1,-16(r1) /* Transfer FP to GPR's. */ + ori 2,2,0 /* Force a new dispatch group. */ + ld r4,-16(r1) /* Load FP into GPR. */ + lis r0,0x7ff0 + sldi r0,r0,32 /* const long r0 0x7ff00000 00000000. */ + clrldi r4,r4,1 /* x = fabs(x) */ + cmpd cr7,r4,r0 /* if (fabs(x) <= inf) */ + blelr cr7 /* LE means not NaN. */ + li r3,1 /* else return 1 */ + blr + END (__isnan) + +hidden_def (__isnan) +weak_alias (__isnan, isnan) + +/* It turns out that the 'double' version will also always work for + single-precision. */ +strong_alias (__isnan, __isnanf) +hidden_def (__isnanf) +weak_alias (__isnanf, isnanf) + +#ifdef NO_LONG_DOUBLE +strong_alias (__isnan, __isnanl) +weak_alias (__isnan, isnanl) +#endif + +#if !IS_IN (libm) +# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0) +compat_symbol (libc, __isnan, __isnanl, GLIBC_2_0); +compat_symbol (libc, isnan, isnanl, GLIBC_2_0); +# endif +#endif diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_isnanf.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_isnanf.S new file mode 100644 index 0000000000..b48c85e0d3 --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_isnanf.S @@ -0,0 +1 @@ +/* This function uses the same code as s_isnan.S. 
*/ diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_logb.c b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_logb.c new file mode 100644 index 0000000000..2599c771d9 --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_logb.c @@ -0,0 +1 @@ +#include <sysdeps/powerpc/power7/fpu/s_logb.c> diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_logbf.c b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_logbf.c new file mode 100644 index 0000000000..7a5a8032e0 --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_logbf.c @@ -0,0 +1 @@ +#include <sysdeps/powerpc/power7/fpu/s_logbf.c> diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_logbl.c b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_logbl.c new file mode 100644 index 0000000000..524ae2c78d --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_logbl.c @@ -0,0 +1 @@ +#include <sysdeps/powerpc/power7/fpu/s_logbl.c> diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/memchr.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/memchr.S new file mode 100644 index 0000000000..5e9707aa02 --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/memchr.S @@ -0,0 +1,199 @@ +/* Optimized memchr implementation for PowerPC64/POWER7 using cmpb insn. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Luis Machado <luisgpm@br.ibm.com>. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +/* int [r3] memchr (char *s [r3], int byte [r4], int size [r5]) */ + +#ifndef MEMCHR +# define MEMCHR __memchr +#endif + .machine power7 +ENTRY (MEMCHR) + CALL_MCOUNT 3 + dcbt 0,r3 + clrrdi r8,r3,3 + insrdi r4,r4,8,48 + + /* Calculate the last acceptable address and check for possible + addition overflow by using satured math: + r7 = r3 + r5 + r7 |= -(r7 < x) */ + add r7,r3,r5 + subfc r6,r3,r7 + subfe r9,r9,r9 + extsw r6,r9 + or r7,r7,r6 + + insrdi r4,r4,16,32 + cmpldi r5,32 + li r9, -1 + rlwinm r6,r3,3,26,28 /* Calculate padding. */ + insrdi r4,r4,32,0 + addi r7,r7,-1 +#ifdef __LITTLE_ENDIAN__ + sld r9,r9,r6 +#else + srd r9,r9,r6 +#endif + ble L(small_range) + + ld r12,0(r8) /* Load doubleword from memory. */ + cmpb r3,r12,r4 /* Check for BYTEs in DWORD1. */ + and r3,r3,r9 + clrldi r5,r7,61 /* Byte count - 1 in last dword. */ + clrrdi r7,r7,3 /* Address of last doubleword. */ + cmpldi cr7,r3,0 /* Does r3 indicate we got a hit? */ + bne cr7,L(done) + + mtcrf 0x01,r8 + /* Are we now aligned to a quadword boundary? If so, skip to + the main loop. Otherwise, go through the alignment code. */ + bt 28,L(loop_setup) + + /* Handle DWORD2 of pair. */ + ldu r12,8(r8) + cmpb r3,r12,r4 + cmpldi cr7,r3,0 + bne cr7,L(done) + +L(loop_setup): + /* The last dword we want to read in the loop below is the one + containing the last byte of the string, ie. 
the dword at + (s + size - 1) & ~7, or r7. The first dword read is at + r8 + 8, we read 2 * cnt dwords, so the last dword read will + be at r8 + 8 + 16 * cnt - 8. Solving for cnt gives + cnt = (r7 - r8) / 16 */ + sub r6,r7,r8 + srdi r6,r6,4 /* Number of loop iterations. */ + mtctr r6 /* Setup the counter. */ + + /* Main loop to look for BYTE in the string. Since + it's a small loop (8 instructions), align it to 32-bytes. */ + .align 5 +L(loop): + /* Load two doublewords, compare and merge in a + single register for speed. This is an attempt + to speed up the byte-checking process for bigger strings. */ + ld r12,8(r8) + ldu r11,16(r8) + cmpb r3,r12,r4 + cmpb r9,r11,r4 + or r6,r9,r3 /* Merge everything in one doubleword. */ + cmpldi cr7,r6,0 + bne cr7,L(found) + bdnz L(loop) + + /* We may have one more dword to read. */ + cmpld r8,r7 + beqlr + + ldu r12,8(r8) + cmpb r3,r12,r4 + cmpldi cr6,r3,0 + bne cr6,L(done) + blr + + .align 4 +L(found): + /* OK, one (or both) of the doublewords contains BYTE. Check + the first doubleword and decrement the address in case the first + doubleword really contains BYTE. */ + cmpldi cr6,r3,0 + addi r8,r8,-8 + bne cr6,L(done) + + /* BYTE must be in the second doubleword. Adjust the address + again and move the result of cmpb to r3 so we can calculate the + pointer. */ + + mr r3,r9 + addi r8,r8,8 + + /* r3 has the output of the cmpb instruction, that is, it contains + 0xff in the same position as BYTE in the original + doubleword from the string. Use that to calculate the pointer. + We need to make sure BYTE is *before* the end of the range. */ +L(done): +#ifdef __LITTLE_ENDIAN__ + addi r0,r3,-1 + andc r0,r0,r3 + popcntd r0,r0 /* Count trailing zeros. */ +#else + cntlzd r0,r3 /* Count leading zeros before the match. */ +#endif + cmpld r8,r7 /* Are we on the last dword? */ + srdi r0,r0,3 /* Convert leading/trailing zeros to bytes. */ + add r3,r8,r0 + cmpld cr7,r0,r5 /* If on the last dword, check byte offset. */ + bnelr + blelr cr7 + li r3,0 + blr + + .align 4 +L(null): + li r3,0 + blr + +/* Deals with size <= 32. */ + .align 4 +L(small_range): + cmpldi r5,0 + beq L(null) + ld r12,0(r8) /* Load word from memory. */ + cmpb r3,r12,r4 /* Check for BYTE in DWORD1. */ + and r3,r3,r9 + cmpldi cr7,r3,0 + clrldi r5,r7,61 /* Byte count - 1 in last dword. */ + clrrdi r7,r7,3 /* Address of last doubleword. */ + cmpld r8,r7 /* Are we done already? */ + bne cr7,L(done) + beqlr + + ldu r12,8(r8) + cmpb r3,r12,r4 + cmpldi cr6,r3,0 + cmpld r8,r7 + bne cr6,L(done) /* Found something. */ + beqlr /* Hit end of string (length). */ + + ldu r12,8(r8) + cmpb r3,r12,r4 + cmpldi cr6,r3,0 + cmpld r8,r7 + bne cr6,L(done) + beqlr + + ldu r12,8(r8) + cmpb r3,r12,r4 + cmpldi cr6,r3,0 + cmpld r8,r7 + bne cr6,L(done) + beqlr + + ldu r12,8(r8) + cmpb r3,r12,r4 + cmpldi cr6,r3,0 + bne cr6,L(done) + blr + +END (MEMCHR) +weak_alias (__memchr, memchr) +libc_hidden_builtin_def (memchr) diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/memcmp.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/memcmp.S new file mode 100644 index 0000000000..96ce8cee25 --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/memcmp.S @@ -0,0 +1,1061 @@ +/* Optimized memcmp implementation for POWER7/PowerPC64. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +/* int [r3] memcmp (const char *s1 [r3], + const char *s2 [r4], + size_t size [r5]) */ +#ifndef MEMCMP +# define MEMCMP memcmp +#endif + .machine power7 +EALIGN (MEMCMP, 4, 0) + CALL_MCOUNT 3 + +#define rRTN r3 +#define rSTR1 r3 /* first string arg */ +#define rSTR2 r4 /* second string arg */ +#define rN r5 /* max string length */ +#define rWORD1 r6 /* current word in s1 */ +#define rWORD2 r7 /* current word in s2 */ +#define rWORD3 r8 /* next word in s1 */ +#define rWORD4 r9 /* next word in s2 */ +#define rWORD5 r10 /* next word in s1 */ +#define rWORD6 r11 /* next word in s2 */ + +#define rOFF8 r20 /* 8 bytes offset. */ +#define rOFF16 r21 /* 16 bytes offset. */ +#define rOFF24 r22 /* 24 bytes offset. */ +#define rOFF32 r23 /* 24 bytes offset. */ +#define rWORD6_SHIFT r24 /* Left rotation temp for rWORD8. */ +#define rWORD4_SHIFT r25 /* Left rotation temp for rWORD6. */ +#define rWORD2_SHIFT r26 /* Left rotation temp for rWORD4. */ +#define rWORD8_SHIFT r27 /* Left rotation temp for rWORD2. */ +#define rSHR r28 /* Unaligned shift right count. */ +#define rSHL r29 /* Unaligned shift left count. */ +#define rWORD7 r30 /* next word in s1 */ +#define rWORD8 r31 /* next word in s2 */ + +#define rWORD8SAVE (-8) +#define rWORD7SAVE (-16) +#define rOFF8SAVE (-24) +#define rOFF16SAVE (-32) +#define rOFF24SAVE (-40) +#define rOFF32SAVE (-48) +#define rSHRSAVE (-56) +#define rSHLSAVE (-64) +#define rWORD8SHIFTSAVE (-72) +#define rWORD2SHIFTSAVE (-80) +#define rWORD4SHIFTSAVE (-88) +#define rWORD6SHIFTSAVE (-96) + +#ifdef __LITTLE_ENDIAN__ +# define LD ldbrx +#else +# define LD ldx +#endif + + xor r0, rSTR2, rSTR1 + cmpldi cr6, rN, 0 + cmpldi cr1, rN, 12 + clrldi. r0, r0, 61 + clrldi r12, rSTR1, 61 + cmpldi cr5, r12, 0 + beq- cr6, L(zeroLength) + dcbt 0, rSTR1 + dcbt 0, rSTR2 +/* If less than 8 bytes or not aligned, use the unaligned + byte loop. */ + blt cr1, L(bytealigned) + std rWORD8, rWORD8SAVE(r1) + std rWORD7, rWORD7SAVE(r1) + std rOFF8, rOFF8SAVE(r1) + std rOFF16, rOFF16SAVE(r1) + std rOFF24, rOFF24SAVE(r1) + std rOFF32, rOFF32SAVE(r1) + cfi_offset(rWORD8, rWORD8SAVE) + cfi_offset(rWORD7, rWORD7SAVE) + cfi_offset(rOFF8, rOFF8SAVE) + cfi_offset(rOFF16, rOFF16SAVE) + cfi_offset(rOFF24, rOFF24SAVE) + cfi_offset(rOFF32, rOFF32SAVE) + + li rOFF8,8 + li rOFF16,16 + li rOFF24,24 + li rOFF32,32 + + bne L(unaligned) +/* At this point we know both strings have the same alignment and the + compare length is at least 8 bytes. r12 contains the low order + 3 bits of rSTR1 and cr5 contains the result of the logical compare + of r12 to 0. If r12 == 0 then we are already double word + aligned and can perform the DW aligned loop. + + Otherwise we know the two strings have the same alignment (but not + yet DW). 
So we force the string addresses to the next lower DW + boundary and special case this first DW using shift left to + eliminate bits preceding the first byte. Since we want to join the + normal (DW aligned) compare loop, starting at the second double word, + we need to adjust the length (rN) and special case the loop + versioning for the first DW. This ensures that the loop count is + correct and the first DW (shifted) is in the expected register pair. */ + .align 4 +L(samealignment): + clrrdi rSTR1, rSTR1, 3 + clrrdi rSTR2, rSTR2, 3 + beq cr5, L(DWaligned) + add rN, rN, r12 + sldi rWORD6, r12, 3 + srdi r0, rN, 5 /* Divide by 32 */ + andi. r12, rN, 24 /* Get the DW remainder */ + LD rWORD1, 0, rSTR1 + LD rWORD2, 0, rSTR2 + cmpldi cr1, r12, 16 + cmpldi cr7, rN, 32 + clrldi rN, rN, 61 + beq L(dPs4) + mtctr r0 + bgt cr1, L(dPs3) + beq cr1, L(dPs2) + +/* Remainder is 8 */ + .align 3 +L(dsP1): + sld rWORD5, rWORD1, rWORD6 + sld rWORD6, rWORD2, rWORD6 + cmpld cr5, rWORD5, rWORD6 + blt cr7, L(dP1x) +/* Do something useful in this cycle since we have to branch anyway. */ + LD rWORD1, rOFF8, rSTR1 + LD rWORD2, rOFF8, rSTR2 + cmpld cr7, rWORD1, rWORD2 + b L(dP1e) +/* Remainder is 16 */ + .align 4 +L(dPs2): + sld rWORD5, rWORD1, rWORD6 + sld rWORD6, rWORD2, rWORD6 + cmpld cr6, rWORD5, rWORD6 + blt cr7, L(dP2x) +/* Do something useful in this cycle since we have to branch anyway. */ + LD rWORD7, rOFF8, rSTR1 + LD rWORD8, rOFF8, rSTR2 + cmpld cr5, rWORD7, rWORD8 + b L(dP2e) +/* Remainder is 24 */ + .align 4 +L(dPs3): + sld rWORD3, rWORD1, rWORD6 + sld rWORD4, rWORD2, rWORD6 + cmpld cr1, rWORD3, rWORD4 + b L(dP3e) +/* Count is a multiple of 32, remainder is 0 */ + .align 4 +L(dPs4): + mtctr r0 + sld rWORD1, rWORD1, rWORD6 + sld rWORD2, rWORD2, rWORD6 + cmpld cr7, rWORD1, rWORD2 + b L(dP4e) + +/* At this point we know both strings are double word aligned and the + compare length is at least 8 bytes. */ + .align 4 +L(DWaligned): + andi. r12, rN, 24 /* Get the DW remainder */ + srdi r0, rN, 5 /* Divide by 32 */ + cmpldi cr1, r12, 16 + cmpldi cr7, rN, 32 + clrldi rN, rN, 61 + beq L(dP4) + bgt cr1, L(dP3) + beq cr1, L(dP2) + +/* Remainder is 8 */ + .align 4 +L(dP1): + mtctr r0 +/* Normally we'd use rWORD7/rWORD8 here, but since we might exit early + (8-15 byte compare), we want to use only volatile registers. This + means we can avoid restoring non-volatile registers since we did not + change any on the early exit path. The key here is the non-early + exit path only cares about the condition code (cr5), not about which + register pair was used. */ + LD rWORD5, 0, rSTR1 + LD rWORD6, 0, rSTR2 + cmpld cr5, rWORD5, rWORD6 + blt cr7, L(dP1x) + LD rWORD1, rOFF8, rSTR1 + LD rWORD2, rOFF8, rSTR2 + cmpld cr7, rWORD1, rWORD2 +L(dP1e): + LD rWORD3, rOFF16, rSTR1 + LD rWORD4, rOFF16, rSTR2 + cmpld cr1, rWORD3, rWORD4 + LD rWORD5, rOFF24, rSTR1 + LD rWORD6, rOFF24, rSTR2 + cmpld cr6, rWORD5, rWORD6 + bne cr5, L(dLcr5x) + bne cr7, L(dLcr7x) + + LD rWORD7, rOFF32, rSTR1 + LD rWORD8, rOFF32, rSTR2 + addi rSTR1, rSTR1, 32 + addi rSTR2, rSTR2, 32 + bne cr1, L(dLcr1) + cmpld cr5, rWORD7, rWORD8 + bdnz L(dLoop) + bne cr6, L(dLcr6) + ld rWORD8, rWORD8SAVE(r1) + ld rWORD7, rWORD7SAVE(r1) + .align 3 +L(dP1x): + sldi. r12, rN, 3 + bne cr5, L(dLcr5x) + subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). 
*/ + bne L(d00) + ld rOFF8, rOFF8SAVE(r1) + ld rOFF16, rOFF16SAVE(r1) + ld rOFF24, rOFF24SAVE(r1) + ld rOFF32, rOFF32SAVE(r1) + li rRTN, 0 + blr + +/* Remainder is 16 */ + .align 4 +L(dP2): + mtctr r0 + LD rWORD5, 0, rSTR1 + LD rWORD6, 0, rSTR2 + cmpld cr6, rWORD5, rWORD6 + blt cr7, L(dP2x) + LD rWORD7, rOFF8, rSTR1 + LD rWORD8, rOFF8, rSTR2 + cmpld cr5, rWORD7, rWORD8 +L(dP2e): + LD rWORD1, rOFF16, rSTR1 + LD rWORD2, rOFF16, rSTR2 + cmpld cr7, rWORD1, rWORD2 + LD rWORD3, rOFF24, rSTR1 + LD rWORD4, rOFF24, rSTR2 + cmpld cr1, rWORD3, rWORD4 + addi rSTR1, rSTR1, 8 + addi rSTR2, rSTR2, 8 + bne cr6, L(dLcr6) + bne cr5, L(dLcr5) + b L(dLoop2) + .align 4 +L(dP2x): + LD rWORD3, rOFF8, rSTR1 + LD rWORD4, rOFF8, rSTR2 + cmpld cr1, rWORD3, rWORD4 + sldi. r12, rN, 3 + bne cr6, L(dLcr6x) + addi rSTR1, rSTR1, 8 + addi rSTR2, rSTR2, 8 + bne cr1, L(dLcr1x) + subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */ + bne L(d00) + ld rOFF8, rOFF8SAVE(r1) + ld rOFF16, rOFF16SAVE(r1) + ld rOFF24, rOFF24SAVE(r1) + ld rOFF32, rOFF32SAVE(r1) + li rRTN, 0 + blr + +/* Remainder is 24 */ + .align 4 +L(dP3): + mtctr r0 + LD rWORD3, 0, rSTR1 + LD rWORD4, 0, rSTR2 + cmpld cr1, rWORD3, rWORD4 +L(dP3e): + LD rWORD5, rOFF8, rSTR1 + LD rWORD6, rOFF8, rSTR2 + cmpld cr6, rWORD5, rWORD6 + blt cr7, L(dP3x) + LD rWORD7, rOFF16, rSTR1 + LD rWORD8, rOFF16, rSTR2 + cmpld cr5, rWORD7, rWORD8 + LD rWORD1, rOFF24, rSTR1 + LD rWORD2, rOFF24, rSTR2 + cmpld cr7, rWORD1, rWORD2 + addi rSTR1, rSTR1, 16 + addi rSTR2, rSTR2, 16 + bne cr1, L(dLcr1) + bne cr6, L(dLcr6) + b L(dLoop1) +/* Again we are on a early exit path (24-31 byte compare), we want to + only use volatile registers and avoid restoring non-volatile + registers. */ + .align 4 +L(dP3x): + LD rWORD1, rOFF16, rSTR1 + LD rWORD2, rOFF16, rSTR2 + cmpld cr7, rWORD1, rWORD2 + sldi. r12, rN, 3 + bne cr1, L(dLcr1x) + addi rSTR1, rSTR1, 16 + addi rSTR2, rSTR2, 16 + bne cr6, L(dLcr6x) + subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */ + bne cr7, L(dLcr7x) + bne L(d00) + ld rOFF8, rOFF8SAVE(r1) + ld rOFF16, rOFF16SAVE(r1) + ld rOFF24, rOFF24SAVE(r1) + ld rOFF32, rOFF32SAVE(r1) + li rRTN, 0 + blr + +/* Count is a multiple of 32, remainder is 0 */ + .align 4 +L(dP4): + mtctr r0 + LD rWORD1, 0, rSTR1 + LD rWORD2, 0, rSTR2 + cmpld cr7, rWORD1, rWORD2 +L(dP4e): + LD rWORD3, rOFF8, rSTR1 + LD rWORD4, rOFF8, rSTR2 + cmpld cr1, rWORD3, rWORD4 + LD rWORD5, rOFF16, rSTR1 + LD rWORD6, rOFF16, rSTR2 + cmpld cr6, rWORD5, rWORD6 + LD rWORD7, rOFF24, rSTR1 + LD rWORD8, rOFF24, rSTR2 + addi rSTR1, rSTR1, 24 + addi rSTR2, rSTR2, 24 + cmpld cr5, rWORD7, rWORD8 + bne cr7, L(dLcr7) + bne cr1, L(dLcr1) + bdz- L(d24) /* Adjust CTR as we start with +4 */ +/* This is the primary loop */ + .align 4 +L(dLoop): + LD rWORD1, rOFF8, rSTR1 + LD rWORD2, rOFF8, rSTR2 + cmpld cr1, rWORD3, rWORD4 + bne cr6, L(dLcr6) +L(dLoop1): + LD rWORD3, rOFF16, rSTR1 + LD rWORD4, rOFF16, rSTR2 + cmpld cr6, rWORD5, rWORD6 + bne cr5, L(dLcr5) +L(dLoop2): + LD rWORD5, rOFF24, rSTR1 + LD rWORD6, rOFF24, rSTR2 + cmpld cr5, rWORD7, rWORD8 + bne cr7, L(dLcr7) +L(dLoop3): + LD rWORD7, rOFF32, rSTR1 + LD rWORD8, rOFF32, rSTR2 + addi rSTR1, rSTR1, 32 + addi rSTR2, rSTR2, 32 + bne cr1, L(dLcr1) + cmpld cr7, rWORD1, rWORD2 + bdnz L(dLoop) + +L(dL4): + cmpld cr1, rWORD3, rWORD4 + bne cr6, L(dLcr6) + cmpld cr6, rWORD5, rWORD6 + bne cr5, L(dLcr5) + cmpld cr5, rWORD7, rWORD8 +L(d44): + bne cr7, L(dLcr7) +L(d34): + bne cr1, L(dLcr1) +L(d24): + bne cr6, L(dLcr6) +L(d14): + sldi. 
r12, rN, 3 + bne cr5, L(dLcr5) +L(d04): + ld rWORD8, rWORD8SAVE(r1) + ld rWORD7, rWORD7SAVE(r1) + subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */ + beq L(duzeroLength) +/* At this point we have a remainder of 1 to 7 bytes to compare. Since + we are aligned it is safe to load the whole double word, and use + shift right double to eliminate bits beyond the compare length. */ +L(d00): + LD rWORD1, rOFF8, rSTR1 + LD rWORD2, rOFF8, rSTR2 + srd rWORD1, rWORD1, rN + srd rWORD2, rWORD2, rN + cmpld cr7, rWORD1, rWORD2 + bne cr7, L(dLcr7x) + ld rOFF8, rOFF8SAVE(r1) + ld rOFF16, rOFF16SAVE(r1) + ld rOFF24, rOFF24SAVE(r1) + ld rOFF32, rOFF32SAVE(r1) + li rRTN, 0 + blr + + .align 4 +L(dLcr7): + ld rWORD8, rWORD8SAVE(r1) + ld rWORD7, rWORD7SAVE(r1) +L(dLcr7x): + ld rOFF8, rOFF8SAVE(r1) + ld rOFF16, rOFF16SAVE(r1) + ld rOFF24, rOFF24SAVE(r1) + ld rOFF32, rOFF32SAVE(r1) + li rRTN, 1 + bgtlr cr7 + li rRTN, -1 + blr + .align 4 +L(dLcr1): + ld rWORD8, rWORD8SAVE(r1) + ld rWORD7, rWORD7SAVE(r1) +L(dLcr1x): + ld rOFF8, rOFF8SAVE(r1) + ld rOFF16, rOFF16SAVE(r1) + ld rOFF24, rOFF24SAVE(r1) + ld rOFF32, rOFF32SAVE(r1) + li rRTN, 1 + bgtlr cr1 + li rRTN, -1 + blr + .align 4 +L(dLcr6): + ld rWORD8, rWORD8SAVE(r1) + ld rWORD7, rWORD7SAVE(r1) +L(dLcr6x): + ld rOFF8, rOFF8SAVE(r1) + ld rOFF16, rOFF16SAVE(r1) + ld rOFF24, rOFF24SAVE(r1) + ld rOFF32, rOFF32SAVE(r1) + li rRTN, 1 + bgtlr cr6 + li rRTN, -1 + blr + .align 4 +L(dLcr5): + ld rWORD8, rWORD8SAVE(r1) + ld rWORD7, rWORD7SAVE(r1) +L(dLcr5x): + ld rOFF8, rOFF8SAVE(r1) + ld rOFF16, rOFF16SAVE(r1) + ld rOFF24, rOFF24SAVE(r1) + ld rOFF32, rOFF32SAVE(r1) + li rRTN, 1 + bgtlr cr5 + li rRTN, -1 + blr + + .align 4 +L(bytealigned): + mtctr rN + +/* We need to prime this loop. This loop is swing modulo scheduled + to avoid pipe delays. The dependent instruction latencies (load to + compare to conditional branch) is 2 to 3 cycles. In this loop each + dispatch group ends in a branch and takes 1 cycle. Effectively + the first iteration of the loop only serves to load operands and + branches based on compares are delayed until the next loop. + + So we must precondition some registers and condition codes so that + we don't exit the loop early on the first iteration. */ + + lbz rWORD1, 0(rSTR1) + lbz rWORD2, 0(rSTR2) + bdz L(b11) + cmpld cr7, rWORD1, rWORD2 + lbz rWORD3, 1(rSTR1) + lbz rWORD4, 1(rSTR2) + bdz L(b12) + cmpld cr1, rWORD3, rWORD4 + lbzu rWORD5, 2(rSTR1) + lbzu rWORD6, 2(rSTR2) + bdz L(b13) + .align 4 +L(bLoop): + lbzu rWORD1, 1(rSTR1) + lbzu rWORD2, 1(rSTR2) + bne cr7, L(bLcr7) + + cmpld cr6, rWORD5, rWORD6 + bdz L(b3i) + + lbzu rWORD3, 1(rSTR1) + lbzu rWORD4, 1(rSTR2) + bne cr1, L(bLcr1) + + cmpld cr7, rWORD1, rWORD2 + bdz L(b2i) + + lbzu rWORD5, 1(rSTR1) + lbzu rWORD6, 1(rSTR2) + bne cr6, L(bLcr6) + + cmpld cr1, rWORD3, rWORD4 + bdnz L(bLoop) + +/* We speculatively loading bytes before we have tested the previous + bytes. But we must avoid overrunning the length (in the ctr) to + prevent these speculative loads from causing a segfault. In this + case the loop will exit early (before the all pending bytes are + tested. In this case we must complete the pending operations + before returning. 
*/ +L(b1i): + bne cr7, L(bLcr7) + bne cr1, L(bLcr1) + b L(bx56) + .align 4 +L(b2i): + bne cr6, L(bLcr6) + bne cr7, L(bLcr7) + b L(bx34) + .align 4 +L(b3i): + bne cr1, L(bLcr1) + bne cr6, L(bLcr6) + b L(bx12) + .align 4 +L(bLcr7): + li rRTN, 1 + bgtlr cr7 + li rRTN, -1 + blr +L(bLcr1): + li rRTN, 1 + bgtlr cr1 + li rRTN, -1 + blr +L(bLcr6): + li rRTN, 1 + bgtlr cr6 + li rRTN, -1 + blr + +L(b13): + bne cr7, L(bx12) + bne cr1, L(bx34) +L(bx56): + sub rRTN, rWORD5, rWORD6 + blr + nop +L(b12): + bne cr7, L(bx12) +L(bx34): + sub rRTN, rWORD3, rWORD4 + blr +L(b11): +L(bx12): + sub rRTN, rWORD1, rWORD2 + blr + + .align 4 +L(zeroLength): + li rRTN, 0 + blr + + .align 4 +/* At this point we know the strings have different alignment and the + compare length is at least 8 bytes. r12 contains the low order + 3 bits of rSTR1 and cr5 contains the result of the logical compare + of r12 to 0. If r12 == 0 then rStr1 is double word + aligned and can perform the DWunaligned loop. + + Otherwise we know that rSTR1 is not already DW aligned yet. + So we can force the string addresses to the next lower DW + boundary and special case this first DW using shift left to + eliminate bits preceding the first byte. Since we want to join the + normal (DWaligned) compare loop, starting at the second double word, + we need to adjust the length (rN) and special case the loop + versioning for the first DW. This ensures that the loop count is + correct and the first DW (shifted) is in the expected resister pair. */ +L(unaligned): + std rSHL, rSHLSAVE(r1) + cfi_offset(rSHL, rSHLSAVE) + clrldi rSHL, rSTR2, 61 + beq cr6, L(duzeroLength) + std rSHR, rSHRSAVE(r1) + cfi_offset(rSHR, rSHRSAVE) + beq cr5, L(DWunaligned) + std rWORD8_SHIFT, rWORD8SHIFTSAVE(r1) + cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE) +/* Adjust the logical start of rSTR2 to compensate for the extra bits + in the 1st rSTR1 DW. */ + sub rWORD8_SHIFT, rSTR2, r12 +/* But do not attempt to address the DW before that DW that contains + the actual start of rSTR2. */ + clrrdi rSTR2, rSTR2, 3 + std rWORD2_SHIFT, rWORD2SHIFTSAVE(r1) +/* Compute the left/right shift counts for the unaligned rSTR2, + compensating for the logical (DW aligned) start of rSTR1. */ + clrldi rSHL, rWORD8_SHIFT, 61 + clrrdi rSTR1, rSTR1, 3 + std rWORD4_SHIFT, rWORD4SHIFTSAVE(r1) + sldi rSHL, rSHL, 3 + cmpld cr5, rWORD8_SHIFT, rSTR2 + add rN, rN, r12 + sldi rWORD6, r12, 3 + std rWORD6_SHIFT, rWORD6SHIFTSAVE(r1) + cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE) + cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE) + cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE) + subfic rSHR, rSHL, 64 + srdi r0, rN, 5 /* Divide by 32 */ + andi. r12, rN, 24 /* Get the DW remainder */ +/* We normally need to load 2 DWs to start the unaligned rSTR2, but in + this special case those bits may be discarded anyway. Also we + must avoid loading a DW where none of the bits are part of rSTR2 as + this may cross a page boundary and cause a page fault. */ + li rWORD8, 0 + blt cr5, L(dus0) + LD rWORD8, 0, rSTR2 + addi rSTR2, rSTR2, 8 + sld rWORD8, rWORD8, rSHL + +L(dus0): + LD rWORD1, 0, rSTR1 + LD rWORD2, 0, rSTR2 + cmpldi cr1, r12, 16 + cmpldi cr7, rN, 32 + srd r12, rWORD2, rSHR + clrldi rN, rN, 61 + beq L(duPs4) + mtctr r0 + or rWORD8, r12, rWORD8 + bgt cr1, L(duPs3) + beq cr1, L(duPs2) + +/* Remainder is 8 */ + .align 4 +L(dusP1): + sld rWORD8_SHIFT, rWORD2, rSHL + sld rWORD7, rWORD1, rWORD6 + sld rWORD8, rWORD8, rWORD6 + bge cr7, L(duP1e) +/* At this point we exit early with the first double word compare + complete and remainder of 0 to 7 bytes. 
See L(du14) for details on + how we handle the remaining bytes. */ + cmpld cr5, rWORD7, rWORD8 + sldi. rN, rN, 3 + bne cr5, L(duLcr5) + cmpld cr7, rN, rSHR + beq L(duZeroReturn) + li r0, 0 + ble cr7, L(dutrim) + LD rWORD2, rOFF8, rSTR2 + srd r0, rWORD2, rSHR + b L(dutrim) +/* Remainder is 16 */ + .align 4 +L(duPs2): + sld rWORD6_SHIFT, rWORD2, rSHL + sld rWORD5, rWORD1, rWORD6 + sld rWORD6, rWORD8, rWORD6 + b L(duP2e) +/* Remainder is 24 */ + .align 4 +L(duPs3): + sld rWORD4_SHIFT, rWORD2, rSHL + sld rWORD3, rWORD1, rWORD6 + sld rWORD4, rWORD8, rWORD6 + b L(duP3e) +/* Count is a multiple of 32, remainder is 0 */ + .align 4 +L(duPs4): + mtctr r0 + or rWORD8, r12, rWORD8 + sld rWORD2_SHIFT, rWORD2, rSHL + sld rWORD1, rWORD1, rWORD6 + sld rWORD2, rWORD8, rWORD6 + b L(duP4e) + +/* At this point we know rSTR1 is double word aligned and the + compare length is at least 8 bytes. */ + .align 4 +L(DWunaligned): + std rWORD8_SHIFT, rWORD8SHIFTSAVE(r1) + clrrdi rSTR2, rSTR2, 3 + std rWORD2_SHIFT, rWORD2SHIFTSAVE(r1) + srdi r0, rN, 5 /* Divide by 32 */ + std rWORD4_SHIFT, rWORD4SHIFTSAVE(r1) + andi. r12, rN, 24 /* Get the DW remainder */ + std rWORD6_SHIFT, rWORD6SHIFTSAVE(r1) + cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE) + cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE) + cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE) + cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE) + sldi rSHL, rSHL, 3 + LD rWORD6, 0, rSTR2 + LD rWORD8, rOFF8, rSTR2 + addi rSTR2, rSTR2, 8 + cmpldi cr1, r12, 16 + cmpldi cr7, rN, 32 + clrldi rN, rN, 61 + subfic rSHR, rSHL, 64 + sld rWORD6_SHIFT, rWORD6, rSHL + beq L(duP4) + mtctr r0 + bgt cr1, L(duP3) + beq cr1, L(duP2) + +/* Remainder is 8 */ + .align 4 +L(duP1): + srd r12, rWORD8, rSHR + LD rWORD7, 0, rSTR1 + sld rWORD8_SHIFT, rWORD8, rSHL + or rWORD8, r12, rWORD6_SHIFT + blt cr7, L(duP1x) +L(duP1e): + LD rWORD1, rOFF8, rSTR1 + LD rWORD2, rOFF8, rSTR2 + cmpld cr5, rWORD7, rWORD8 + srd r0, rWORD2, rSHR + sld rWORD2_SHIFT, rWORD2, rSHL + or rWORD2, r0, rWORD8_SHIFT + LD rWORD3, rOFF16, rSTR1 + LD rWORD4, rOFF16, rSTR2 + cmpld cr7, rWORD1, rWORD2 + srd r12, rWORD4, rSHR + sld rWORD4_SHIFT, rWORD4, rSHL + bne cr5, L(duLcr5) + or rWORD4, r12, rWORD2_SHIFT + LD rWORD5, rOFF24, rSTR1 + LD rWORD6, rOFF24, rSTR2 + cmpld cr1, rWORD3, rWORD4 + srd r0, rWORD6, rSHR + sld rWORD6_SHIFT, rWORD6, rSHL + bne cr7, L(duLcr7) + or rWORD6, r0, rWORD4_SHIFT + cmpld cr6, rWORD5, rWORD6 + b L(duLoop3) + .align 4 +/* At this point we exit early with the first double word compare + complete and remainder of 0 to 7 bytes. See L(du14) for details on + how we handle the remaining bytes. */ +L(duP1x): + cmpld cr5, rWORD7, rWORD8 + sldi. 
rN, rN, 3 + bne cr5, L(duLcr5) + cmpld cr7, rN, rSHR + beq L(duZeroReturn) + li r0, 0 + ble cr7, L(dutrim) + LD rWORD2, rOFF8, rSTR2 + srd r0, rWORD2, rSHR + b L(dutrim) +/* Remainder is 16 */ + .align 4 +L(duP2): + srd r0, rWORD8, rSHR + LD rWORD5, 0, rSTR1 + or rWORD6, r0, rWORD6_SHIFT + sld rWORD6_SHIFT, rWORD8, rSHL +L(duP2e): + LD rWORD7, rOFF8, rSTR1 + LD rWORD8, rOFF8, rSTR2 + cmpld cr6, rWORD5, rWORD6 + srd r12, rWORD8, rSHR + sld rWORD8_SHIFT, rWORD8, rSHL + or rWORD8, r12, rWORD6_SHIFT + blt cr7, L(duP2x) + LD rWORD1, rOFF16, rSTR1 + LD rWORD2, rOFF16, rSTR2 + cmpld cr5, rWORD7, rWORD8 + bne cr6, L(duLcr6) + srd r0, rWORD2, rSHR + sld rWORD2_SHIFT, rWORD2, rSHL + or rWORD2, r0, rWORD8_SHIFT + LD rWORD3, rOFF24, rSTR1 + LD rWORD4, rOFF24, rSTR2 + cmpld cr7, rWORD1, rWORD2 + bne cr5, L(duLcr5) + srd r12, rWORD4, rSHR + sld rWORD4_SHIFT, rWORD4, rSHL + or rWORD4, r12, rWORD2_SHIFT + addi rSTR1, rSTR1, 8 + addi rSTR2, rSTR2, 8 + cmpld cr1, rWORD3, rWORD4 + b L(duLoop2) + .align 4 +L(duP2x): + cmpld cr5, rWORD7, rWORD8 + addi rSTR1, rSTR1, 8 + addi rSTR2, rSTR2, 8 + bne cr6, L(duLcr6) + sldi. rN, rN, 3 + bne cr5, L(duLcr5) + cmpld cr7, rN, rSHR + beq L(duZeroReturn) + li r0, 0 + ble cr7, L(dutrim) + LD rWORD2, rOFF8, rSTR2 + srd r0, rWORD2, rSHR + b L(dutrim) + +/* Remainder is 24 */ + .align 4 +L(duP3): + srd r12, rWORD8, rSHR + LD rWORD3, 0, rSTR1 + sld rWORD4_SHIFT, rWORD8, rSHL + or rWORD4, r12, rWORD6_SHIFT +L(duP3e): + LD rWORD5, rOFF8, rSTR1 + LD rWORD6, rOFF8, rSTR2 + cmpld cr1, rWORD3, rWORD4 + srd r0, rWORD6, rSHR + sld rWORD6_SHIFT, rWORD6, rSHL + or rWORD6, r0, rWORD4_SHIFT + LD rWORD7, rOFF16, rSTR1 + LD rWORD8, rOFF16, rSTR2 + cmpld cr6, rWORD5, rWORD6 + bne cr1, L(duLcr1) + srd r12, rWORD8, rSHR + sld rWORD8_SHIFT, rWORD8, rSHL + or rWORD8, r12, rWORD6_SHIFT + blt cr7, L(duP3x) + LD rWORD1, rOFF24, rSTR1 + LD rWORD2, rOFF24, rSTR2 + cmpld cr5, rWORD7, rWORD8 + bne cr6, L(duLcr6) + srd r0, rWORD2, rSHR + sld rWORD2_SHIFT, rWORD2, rSHL + or rWORD2, r0, rWORD8_SHIFT + addi rSTR1, rSTR1, 16 + addi rSTR2, rSTR2, 16 + cmpld cr7, rWORD1, rWORD2 + b L(duLoop1) + .align 4 +L(duP3x): + addi rSTR1, rSTR1, 16 + addi rSTR2, rSTR2, 16 + cmpld cr5, rWORD7, rWORD8 + bne cr6, L(duLcr6) + sldi. 
rN, rN, 3 + bne cr5, L(duLcr5) + cmpld cr7, rN, rSHR + beq L(duZeroReturn) + li r0, 0 + ble cr7, L(dutrim) + LD rWORD2, rOFF8, rSTR2 + srd r0, rWORD2, rSHR + b L(dutrim) + +/* Count is a multiple of 32, remainder is 0 */ + .align 4 +L(duP4): + mtctr r0 + srd r0, rWORD8, rSHR + LD rWORD1, 0, rSTR1 + sld rWORD2_SHIFT, rWORD8, rSHL + or rWORD2, r0, rWORD6_SHIFT +L(duP4e): + LD rWORD3, rOFF8, rSTR1 + LD rWORD4, rOFF8, rSTR2 + cmpld cr7, rWORD1, rWORD2 + srd r12, rWORD4, rSHR + sld rWORD4_SHIFT, rWORD4, rSHL + or rWORD4, r12, rWORD2_SHIFT + LD rWORD5, rOFF16, rSTR1 + LD rWORD6, rOFF16, rSTR2 + cmpld cr1, rWORD3, rWORD4 + bne cr7, L(duLcr7) + srd r0, rWORD6, rSHR + sld rWORD6_SHIFT, rWORD6, rSHL + or rWORD6, r0, rWORD4_SHIFT + LD rWORD7, rOFF24, rSTR1 + LD rWORD8, rOFF24, rSTR2 + addi rSTR1, rSTR1, 24 + addi rSTR2, rSTR2, 24 + cmpld cr6, rWORD5, rWORD6 + bne cr1, L(duLcr1) + srd r12, rWORD8, rSHR + sld rWORD8_SHIFT, rWORD8, rSHL + or rWORD8, r12, rWORD6_SHIFT + cmpld cr5, rWORD7, rWORD8 + bdz L(du24) /* Adjust CTR as we start with +4 */ +/* This is the primary loop */ + .align 4 +L(duLoop): + LD rWORD1, rOFF8, rSTR1 + LD rWORD2, rOFF8, rSTR2 + cmpld cr1, rWORD3, rWORD4 + bne cr6, L(duLcr6) + srd r0, rWORD2, rSHR + sld rWORD2_SHIFT, rWORD2, rSHL + or rWORD2, r0, rWORD8_SHIFT +L(duLoop1): + LD rWORD3, rOFF16, rSTR1 + LD rWORD4, rOFF16, rSTR2 + cmpld cr6, rWORD5, rWORD6 + bne cr5, L(duLcr5) + srd r12, rWORD4, rSHR + sld rWORD4_SHIFT, rWORD4, rSHL + or rWORD4, r12, rWORD2_SHIFT +L(duLoop2): + LD rWORD5, rOFF24, rSTR1 + LD rWORD6, rOFF24, rSTR2 + cmpld cr5, rWORD7, rWORD8 + bne cr7, L(duLcr7) + srd r0, rWORD6, rSHR + sld rWORD6_SHIFT, rWORD6, rSHL + or rWORD6, r0, rWORD4_SHIFT +L(duLoop3): + LD rWORD7, rOFF32, rSTR1 + LD rWORD8, rOFF32, rSTR2 + addi rSTR1, rSTR1, 32 + addi rSTR2, rSTR2, 32 + cmpld cr7, rWORD1, rWORD2 + bne cr1, L(duLcr1) + srd r12, rWORD8, rSHR + sld rWORD8_SHIFT, rWORD8, rSHL + or rWORD8, r12, rWORD6_SHIFT + bdnz L(duLoop) + +L(duL4): + cmpld cr1, rWORD3, rWORD4 + bne cr6, L(duLcr6) + cmpld cr6, rWORD5, rWORD6 + bne cr5, L(duLcr5) + cmpld cr5, rWORD7, rWORD8 +L(du44): + bne cr7, L(duLcr7) +L(du34): + bne cr1, L(duLcr1) +L(du24): + bne cr6, L(duLcr6) +L(du14): + sldi. rN, rN, 3 + bne cr5, L(duLcr5) +/* At this point we have a remainder of 1 to 7 bytes to compare. We use + shift right double to eliminate bits beyond the compare length. + + However it may not be safe to load rWORD2 which may be beyond the + string length. So we compare the bit length of the remainder to + the right shift count (rSHR). If the bit count is less than or equal + we do not need to load rWORD2 (all significant bits are already in + rWORD8_SHIFT). */ + cmpld cr7, rN, rSHR + beq L(duZeroReturn) + li r0, 0 + ble cr7, L(dutrim) + LD rWORD2, rOFF8, rSTR2 + srd r0, rWORD2, rSHR + .align 4 +L(dutrim): + LD rWORD1, rOFF8, rSTR1 + ld rWORD8, -8(r1) + subfic rN, rN, 64 /* Shift count is 64 - (rN * 8). 
*/ + or rWORD2, r0, rWORD8_SHIFT + ld rWORD7, rWORD7SAVE(r1) + ld rSHL, rSHLSAVE(r1) + srd rWORD1, rWORD1, rN + srd rWORD2, rWORD2, rN + ld rSHR, rSHRSAVE(r1) + ld rWORD8_SHIFT, rWORD8SHIFTSAVE(r1) + li rRTN, 0 + cmpld cr7, rWORD1, rWORD2 + ld rWORD2_SHIFT, rWORD2SHIFTSAVE(r1) + ld rWORD4_SHIFT, rWORD4SHIFTSAVE(r1) + beq cr7, L(dureturn24) + li rRTN, 1 + ld rWORD6_SHIFT, rWORD6SHIFTSAVE(r1) + ld rOFF8, rOFF8SAVE(r1) + ld rOFF16, rOFF16SAVE(r1) + ld rOFF24, rOFF24SAVE(r1) + ld rOFF32, rOFF32SAVE(r1) + bgtlr cr7 + li rRTN, -1 + blr + .align 4 +L(duLcr7): + ld rWORD8, rWORD8SAVE(r1) + ld rWORD7, rWORD7SAVE(r1) + li rRTN, 1 + bgt cr7, L(dureturn29) + ld rSHL, rSHLSAVE(r1) + ld rSHR, rSHRSAVE(r1) + li rRTN, -1 + b L(dureturn27) + .align 4 +L(duLcr1): + ld rWORD8, rWORD8SAVE(r1) + ld rWORD7, rWORD7SAVE(r1) + li rRTN, 1 + bgt cr1, L(dureturn29) + ld rSHL, rSHLSAVE(r1) + ld rSHR, rSHRSAVE(r1) + li rRTN, -1 + b L(dureturn27) + .align 4 +L(duLcr6): + ld rWORD8, rWORD8SAVE(r1) + ld rWORD7, rWORD7SAVE(r1) + li rRTN, 1 + bgt cr6, L(dureturn29) + ld rSHL, rSHLSAVE(r1) + ld rSHR, rSHRSAVE(r1) + li rRTN, -1 + b L(dureturn27) + .align 4 +L(duLcr5): + ld rWORD8, rWORD8SAVE(r1) + ld rWORD7, rWORD7SAVE(r1) + li rRTN, 1 + bgt cr5, L(dureturn29) + ld rSHL, rSHLSAVE(r1) + ld rSHR, rSHRSAVE(r1) + li rRTN, -1 + b L(dureturn27) + + .align 3 +L(duZeroReturn): + li rRTN, 0 + .align 4 +L(dureturn): + ld rWORD8, rWORD8SAVE(r1) + ld rWORD7, rWORD7SAVE(r1) +L(dureturn29): + ld rSHL, rSHLSAVE(r1) + ld rSHR, rSHRSAVE(r1) +L(dureturn27): + ld rWORD8_SHIFT, rWORD8SHIFTSAVE(r1) + ld rWORD2_SHIFT, rWORD2SHIFTSAVE(r1) + ld rWORD4_SHIFT, rWORD4SHIFTSAVE(r1) +L(dureturn24): + ld rWORD6_SHIFT, rWORD6SHIFTSAVE(r1) + ld rOFF8, rOFF8SAVE(r1) + ld rOFF16, rOFF16SAVE(r1) + ld rOFF24, rOFF24SAVE(r1) + ld rOFF32, rOFF32SAVE(r1) + blr + +L(duzeroLength): + ld rOFF8, rOFF8SAVE(r1) + ld rOFF16, rOFF16SAVE(r1) + ld rOFF24, rOFF24SAVE(r1) + ld rOFF32, rOFF32SAVE(r1) + li rRTN, 0 + blr + +END (MEMCMP) +libc_hidden_builtin_def (memcmp) +weak_alias (memcmp, bcmp) diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/memcpy.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/memcpy.S new file mode 100644 index 0000000000..e08993cbc3 --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/memcpy.S @@ -0,0 +1,430 @@ +/* Optimized memcpy implementation for PowerPC64/POWER7. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Luis Machado <luisgpm@br.ibm.com>. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + + +/* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]); + Returns 'dst'. */ + +#ifndef MEMCPY +# define MEMCPY memcpy +#endif + +#define dst 11 /* Use r11 so r3 kept unchanged. 
*/ +#define src 4 +#define cnt 5 + + .machine power7 +EALIGN (MEMCPY, 5, 0) + CALL_MCOUNT 3 + + cmpldi cr1,cnt,31 + neg 0,3 + ble cr1, L(copy_LT_32) /* If move < 32 bytes use short move + code. */ + +/* Align copies using VSX instructions to quadword. It is to avoid alignment + traps when memcpy is used on non-cacheable memory (for instance, memory + mapped I/O). */ + andi. 10,3,15 + clrldi 11,4,60 + cmpld cr6,10,11 /* SRC and DST alignments match? */ + + mr dst,3 + bne cr6,L(copy_GE_32_unaligned) + beq L(aligned_copy) + + mtocrf 0x01,0 + clrldi 0,0,60 + +/* Get the DST and SRC aligned to 16 bytes. */ +1: + bf 31,2f + lbz 6,0(src) + addi src,src,1 + stb 6,0(dst) + addi dst,dst,1 +2: + bf 30,4f + lhz 6,0(src) + addi src,src,2 + sth 6,0(dst) + addi dst,dst,2 +4: + bf 29,8f + lwz 6,0(src) + addi src,src,4 + stw 6,0(dst) + addi dst,dst,4 +8: + bf 28,16f + ld 6,0(src) + addi src,src,8 + std 6,0(dst) + addi dst,dst,8 +16: + subf cnt,0,cnt + +/* Main aligned copy loop. Copies 128 bytes at a time. */ +L(aligned_copy): + li 6,16 + li 7,32 + li 8,48 + mtocrf 0x02,cnt + srdi 12,cnt,7 + cmpdi 12,0 + beq L(aligned_tail) + lxvd2x 6,0,src + lxvd2x 7,src,6 + mtctr 12 + b L(aligned_128loop) + + .align 4 +L(aligned_128head): + /* for the 2nd + iteration of this loop. */ + lxvd2x 6,0,src + lxvd2x 7,src,6 +L(aligned_128loop): + lxvd2x 8,src,7 + lxvd2x 9,src,8 + stxvd2x 6,0,dst + addi src,src,64 + stxvd2x 7,dst,6 + stxvd2x 8,dst,7 + stxvd2x 9,dst,8 + lxvd2x 6,0,src + lxvd2x 7,src,6 + addi dst,dst,64 + lxvd2x 8,src,7 + lxvd2x 9,src,8 + addi src,src,64 + stxvd2x 6,0,dst + stxvd2x 7,dst,6 + stxvd2x 8,dst,7 + stxvd2x 9,dst,8 + addi dst,dst,64 + bdnz L(aligned_128head) + +L(aligned_tail): + mtocrf 0x01,cnt + bf 25,32f + lxvd2x 6,0,src + lxvd2x 7,src,6 + lxvd2x 8,src,7 + lxvd2x 9,src,8 + addi src,src,64 + stxvd2x 6,0,dst + stxvd2x 7,dst,6 + stxvd2x 8,dst,7 + stxvd2x 9,dst,8 + addi dst,dst,64 +32: + bf 26,16f + lxvd2x 6,0,src + lxvd2x 7,src,6 + addi src,src,32 + stxvd2x 6,0,dst + stxvd2x 7,dst,6 + addi dst,dst,32 +16: + bf 27,8f + lxvd2x 6,0,src + addi src,src,16 + stxvd2x 6,0,dst + addi dst,dst,16 +8: + bf 28,4f + ld 6,0(src) + addi src,src,8 + std 6,0(dst) + addi dst,dst,8 +4: /* Copies 4~7 bytes. */ + bf 29,L(tail2) + lwz 6,0(src) + stw 6,0(dst) + bf 30,L(tail5) + lhz 7,4(src) + sth 7,4(dst) + bflr 31 + lbz 8,6(src) + stb 8,6(dst) + /* Return original DST pointer. */ + blr + + +/* Handle copies of 0~31 bytes. */ + .align 4 +L(copy_LT_32): + mr dst,3 + cmpldi cr6,cnt,8 + mtocrf 0x01,cnt + ble cr6,L(copy_LE_8) + + /* At least 9 bytes to go. */ + neg 8,4 + andi. 0,8,3 + cmpldi cr1,cnt,16 + beq L(copy_LT_32_aligned) + + /* Force 4-byte alignment for SRC. */ + mtocrf 0x01,0 + subf cnt,0,cnt +2: + bf 30,1f + lhz 6,0(src) + addi src,src,2 + sth 6,0(dst) + addi dst,dst,2 +1: + bf 31,L(end_4bytes_alignment) + lbz 6,0(src) + addi src,src,1 + stb 6,0(dst) + addi dst,dst,1 + + .align 4 +L(end_4bytes_alignment): + cmpldi cr1,cnt,16 + mtocrf 0x01,cnt + +L(copy_LT_32_aligned): + /* At least 6 bytes to go, and SRC is word-aligned. */ + blt cr1,8f + + /* Copy 16 bytes. */ + lwz 6,0(src) + lwz 7,4(src) + stw 6,0(dst) + lwz 8,8(src) + stw 7,4(dst) + lwz 6,12(src) + addi src,src,16 + stw 8,8(dst) + stw 6,12(dst) + addi dst,dst,16 +8: /* Copy 8 bytes. */ + bf 28,L(tail4) + lwz 6,0(src) + lwz 7,4(src) + addi src,src,8 + stw 6,0(dst) + stw 7,4(dst) + addi dst,dst,8 + + .align 4 +/* Copies 4~7 bytes. 
*/ +L(tail4): + bf 29,L(tail2) + lwz 6,0(src) + stw 6,0(dst) + bf 30,L(tail5) + lhz 7,4(src) + sth 7,4(dst) + bflr 31 + lbz 8,6(src) + stb 8,6(dst) + /* Return original DST pointer. */ + blr + + .align 4 +/* Copies 2~3 bytes. */ +L(tail2): + bf 30,1f + lhz 6,0(src) + sth 6,0(dst) + bflr 31 + lbz 7,2(src) + stb 7,2(dst) + blr + + .align 4 +L(tail5): + bflr 31 + lbz 6,4(src) + stb 6,4(dst) + blr + + .align 4 +1: + bflr 31 + lbz 6,0(src) + stb 6,0(dst) + /* Return original DST pointer. */ + blr + + +/* Handles copies of 0~8 bytes. */ + .align 4 +L(copy_LE_8): + bne cr6,L(tail4) + + /* Though we could've used ld/std here, they are still + slow for unaligned cases. */ + + lwz 6,0(src) + lwz 7,4(src) + stw 6,0(dst) + stw 7,4(dst) + blr + + +/* Handle copies of 32+ bytes where DST is aligned (to quadword) but + SRC is not. Use aligned quadword loads from SRC, shifted to realign + the data, allowing for aligned DST stores. */ + .align 4 +L(copy_GE_32_unaligned): + clrldi 0,0,60 /* Number of bytes until the 1st dst quadword. */ + srdi 9,cnt,4 /* Number of full quadwords remaining. */ + + beq L(copy_GE_32_unaligned_cont) + + /* DST is not quadword aligned, get it aligned. */ + + mtocrf 0x01,0 + subf cnt,0,cnt + + /* Vector instructions work best when proper alignment (16-bytes) + is present. Move 0~15 bytes as needed to get DST quadword-aligned. */ +1: + bf 31,2f + lbz 6,0(src) + addi src,src,1 + stb 6,0(dst) + addi dst,dst,1 +2: + bf 30,4f + lhz 6,0(src) + addi src,src,2 + sth 6,0(dst) + addi dst,dst,2 +4: + bf 29,8f + lwz 6,0(src) + addi src,src,4 + stw 6,0(dst) + addi dst,dst,4 +8: + bf 28,0f + ld 6,0(src) + addi src,src,8 + std 6,0(dst) + addi dst,dst,8 +0: + srdi 9,cnt,4 /* Number of full quadwords remaining. */ + + /* The proper alignment is present, it is OK to copy the bytes now. */ +L(copy_GE_32_unaligned_cont): + + /* Setup two indexes to speed up the indexed vector operations. */ + clrldi 10,cnt,60 + li 6,16 /* Index for 16-bytes offsets. */ + li 7,32 /* Index for 32-bytes offsets. */ + cmpldi cr1,10,0 + srdi 8,cnt,5 /* Setup the loop counter. */ + mtocrf 0x01,9 + cmpldi cr6,9,1 +#ifdef __LITTLE_ENDIAN__ + lvsr 5,0,src +#else + lvsl 5,0,src +#endif + lvx 3,0,src + li 0,0 + bf 31,L(setup_unaligned_loop) + + /* Copy another 16 bytes to align to 32-bytes due to the loop. */ + lvx 4,src,6 +#ifdef __LITTLE_ENDIAN__ + vperm 6,4,3,5 +#else + vperm 6,3,4,5 +#endif + addi src,src,16 + stvx 6,0,dst + addi dst,dst,16 + vor 3,4,4 + clrrdi 0,src,60 + +L(setup_unaligned_loop): + mtctr 8 + ble cr6,L(end_unaligned_loop) + + /* Copy 32 bytes at a time using vector instructions. */ + .align 4 +L(unaligned_loop): + + /* Note: vr6/vr10 may contain data that was already copied, + but in order to get proper alignment, we may have to copy + some portions again. This is faster than having unaligned + vector instructions though. */ + + lvx 4,src,6 +#ifdef __LITTLE_ENDIAN__ + vperm 6,4,3,5 +#else + vperm 6,3,4,5 +#endif + lvx 3,src,7 +#ifdef __LITTLE_ENDIAN__ + vperm 10,3,4,5 +#else + vperm 10,4,3,5 +#endif + addi src,src,32 + stvx 6,0,dst + stvx 10,dst,6 + addi dst,dst,32 + bdnz L(unaligned_loop) + + clrrdi 0,src,60 + + .align 4 +L(end_unaligned_loop): + + /* Check for tail bytes. */ + mtocrf 0x01,cnt + beqlr cr1 + + add src,src,0 + + /* We have 1~15 tail bytes to copy, and DST is quadword aligned. */ + /* Copy 8 bytes. */ + bf 28,4f + lwz 6,0(src) + lwz 7,4(src) + addi src,src,8 + stw 6,0(dst) + stw 7,4(dst) + addi dst,dst,8 +4: /* Copy 4~7 bytes. 
*/ + bf 29,L(tail2) + lwz 6,0(src) + stw 6,0(dst) + bf 30,L(tail5) + lhz 7,4(src) + sth 7,4(dst) + bflr 31 + lbz 8,6(src) + stb 8,6(dst) + /* Return original DST pointer. */ + blr + +END_GEN_TB (MEMCPY,TB_TOCLESS) +libc_hidden_builtin_def (memcpy) diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/memmove.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/memmove.S new file mode 100644 index 0000000000..4c0f7c3571 --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/memmove.S @@ -0,0 +1,835 @@ +/* Optimized memmove implementation for PowerPC64/POWER7. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + + +/* void* [r3] memmove (void *dest [r3], const void *src [r4], size_t len [r5]) + + This optimization check if memory 'dest' overlaps with 'src'. If it does + not then it calls an optimized memcpy call (similar to memcpy for POWER7, + embedded here to gain some cycles). + If source and destiny overlaps, a optimized backwards memcpy is used + instead. */ + +#ifndef MEMMOVE +# define MEMMOVE memmove +#endif + .machine power7 +EALIGN (MEMMOVE, 5, 0) + CALL_MCOUNT 3 + +L(_memmove): + subf r9,r4,r3 + cmpld cr7,r9,r5 + blt cr7,L(memmove_bwd) + + cmpldi cr1,r5,31 + neg 0,3 + ble cr1, L(copy_LT_32) /* If move < 32 bytes use short move + code. */ + + andi. 10,3,15 + clrldi 11,4,60 + cmpld cr6,10,11 /* SRC and DST alignments match? */ + + mr r11,3 + bne cr6,L(copy_GE_32_unaligned) + beq L(aligned_copy) + + mtocrf 0x01,0 + clrldi 0,0,60 + +/* Get the DST and SRC aligned to 8 bytes (16 for little-endian). */ +1: + bf 31,2f + lbz 6,0(r4) + addi r4,r4,1 + stb 6,0(r11) + addi r11,r11,1 +2: + bf 30,4f + lhz 6,0(r4) + addi r4,r4,2 + sth 6,0(r11) + addi r11,r11,2 +4: + bf 29,8f + lwz 6,0(r4) + addi r4,r4,4 + stw 6,0(r11) + addi r11,r11,4 +8: + bf 28,16f + ld 6,0(r4) + addi r4,r4,8 + std 6,0(r11) + addi r11,r11,8 +16: + subf r5,0,r5 + +/* Main aligned copy loop. Copies 128 bytes at a time. */ +L(aligned_copy): + li 6,16 + li 7,32 + li 8,48 + mtocrf 0x02,r5 + srdi 12,r5,7 + cmpdi 12,0 + beq L(aligned_tail) + lxvd2x 6,0,r4 + lxvd2x 7,r4,6 + mtctr 12 + b L(aligned_128loop) + + .align 4 +L(aligned_128head): + /* for the 2nd + iteration of this loop. 
*/ + lxvd2x 6,0,r4 + lxvd2x 7,r4,6 +L(aligned_128loop): + lxvd2x 8,r4,7 + lxvd2x 9,r4,8 + stxvd2x 6,0,r11 + addi r4,r4,64 + stxvd2x 7,r11,6 + stxvd2x 8,r11,7 + stxvd2x 9,r11,8 + lxvd2x 6,0,r4 + lxvd2x 7,r4,6 + addi r11,r11,64 + lxvd2x 8,r4,7 + lxvd2x 9,r4,8 + addi r4,r4,64 + stxvd2x 6,0,r11 + stxvd2x 7,r11,6 + stxvd2x 8,r11,7 + stxvd2x 9,r11,8 + addi r11,r11,64 + bdnz L(aligned_128head) + +L(aligned_tail): + mtocrf 0x01,r5 + bf 25,32f + lxvd2x 6,0,r4 + lxvd2x 7,r4,6 + lxvd2x 8,r4,7 + lxvd2x 9,r4,8 + addi r4,r4,64 + stxvd2x 6,0,r11 + stxvd2x 7,r11,6 + stxvd2x 8,r11,7 + stxvd2x 9,r11,8 + addi r11,r11,64 +32: + bf 26,16f + lxvd2x 6,0,r4 + lxvd2x 7,r4,6 + addi r4,r4,32 + stxvd2x 6,0,r11 + stxvd2x 7,r11,6 + addi r11,r11,32 +16: + bf 27,8f + lxvd2x 6,0,r4 + addi r4,r4,16 + stxvd2x 6,0,r11 + addi r11,r11,16 +8: + bf 28,4f + ld 6,0(r4) + addi r4,r4,8 + std 6,0(r11) + addi r11,r11,8 +4: /* Copies 4~7 bytes. */ + bf 29,L(tail2) + lwz 6,0(r4) + stw 6,0(r11) + bf 30,L(tail5) + lhz 7,4(r4) + sth 7,4(r11) + bflr 31 + lbz 8,6(r4) + stb 8,6(r11) + /* Return original DST pointer. */ + blr + +/* Handle copies of 0~31 bytes. */ + .align 4 +L(copy_LT_32): + mr r11,3 + cmpldi cr6,r5,8 + mtocrf 0x01,r5 + ble cr6,L(copy_LE_8) + + /* At least 9 bytes to go. */ + neg 8,4 + andi. 0,8,3 + cmpldi cr1,r5,16 + beq L(copy_LT_32_aligned) + + /* Force 4-byte alignment for SRC. */ + mtocrf 0x01,0 + subf r5,0,r5 +2: + bf 30,1f + lhz 6,0(r4) + addi r4,r4,2 + sth 6,0(r11) + addi r11,r11,2 +1: + bf 31,L(end_4bytes_alignment) + lbz 6,0(r4) + addi r4,r4,1 + stb 6,0(r11) + addi r11,r11,1 + + .align 4 +L(end_4bytes_alignment): + cmpldi cr1,r5,16 + mtocrf 0x01,r5 + +L(copy_LT_32_aligned): + /* At least 6 bytes to go, and SRC is word-aligned. */ + blt cr1,8f + + /* Copy 16 bytes. */ + lwz 6,0(r4) + lwz 7,4(r4) + stw 6,0(r11) + lwz 8,8(r4) + stw 7,4(r11) + lwz 6,12(r4) + addi r4,r4,16 + stw 8,8(r11) + stw 6,12(r11) + addi r11,r11,16 +8: /* Copy 8 bytes. */ + bf 28,L(tail4) + lwz 6,0(r4) + lwz 7,4(r4) + addi r4,r4,8 + stw 6,0(r11) + stw 7,4(r11) + addi r11,r11,8 + + .align 4 +/* Copies 4~7 bytes. */ +L(tail4): + bf 29,L(tail2) + lwz 6,0(r4) + stw 6,0(r11) + bf 30,L(tail5) + lhz 7,4(r4) + sth 7,4(r11) + bflr 31 + lbz 8,6(r4) + stb 8,6(r11) + /* Return original DST pointer. */ + blr + + .align 4 +/* Copies 2~3 bytes. */ +L(tail2): + bf 30,1f + lhz 6,0(r4) + sth 6,0(r11) + bflr 31 + lbz 7,2(r4) + stb 7,2(r11) + blr + + .align 4 +L(tail5): + bflr 31 + lbz 6,4(r4) + stb 6,4(r11) + blr + + .align 4 +1: + bflr 31 + lbz 6,0(r4) + stb 6,0(r11) + /* Return original DST pointer. */ + blr + +/* Handles copies of 0~8 bytes. */ + .align 4 +L(copy_LE_8): + bne cr6,L(tail4) + + /* Though we could've used ld/std here, they are still + slow for unaligned cases. */ + + lwz 6,0(r4) + lwz 7,4(r4) + stw 6,0(r11) + stw 7,4(r11) + blr + + +/* Handle copies of 32+ bytes where DST is aligned (to quadword) but + SRC is not. Use aligned quadword loads from SRC, shifted to realign + the data, allowing for aligned DST stores. */ + .align 4 +L(copy_GE_32_unaligned): + clrldi 0,0,60 /* Number of bytes until the 1st r11 quadword. */ + srdi 9,r5,4 /* Number of full quadwords remaining. */ + + beq L(copy_GE_32_unaligned_cont) + + /* DST is not quadword aligned, get it aligned. */ + + mtocrf 0x01,0 + subf r5,0,r5 + + /* Vector instructions work best when proper alignment (16-bytes) + is present. Move 0~15 bytes as needed to get DST quadword-aligned. 
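+      In C terms this prologue does roughly the following (a sketch,
+      not the exact register flow; the assembly keeps PAD in CR bits
+      set up by mtocrf):
+
+	#include <string.h>
+	#include <stdint.h>
+
+	static void
+	align_dst_16 (char **dst, const char **src, size_t *len)
+	{
+	  size_t pad = (-(uintptr_t) *dst) & 15;	// bytes to the next
+							// 16-byte boundary
+	  *len -= pad;
+	  if (pad & 1) { **dst = **src; *dst += 1; *src += 1; }
+	  if (pad & 2) { memcpy (*dst, *src, 2); *dst += 2; *src += 2; }
+	  if (pad & 4) { memcpy (*dst, *src, 4); *dst += 4; *src += 4; }
+	  if (pad & 8) { memcpy (*dst, *src, 8); *dst += 8; *src += 8; }
+	}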
*/ +1: + bf 31,2f + lbz 6,0(r4) + addi r4,r4,1 + stb 6,0(r11) + addi r11,r11,1 +2: + bf 30,4f + lhz 6,0(r4) + addi r4,r4,2 + sth 6,0(r11) + addi r11,r11,2 +4: + bf 29,8f + lwz 6,0(r4) + addi r4,r4,4 + stw 6,0(r11) + addi r11,r11,4 +8: + bf 28,0f + ld 6,0(r4) + addi r4,r4,8 + std 6,0(r11) + addi r11,r11,8 +0: + srdi 9,r5,4 /* Number of full quadwords remaining. */ + + /* The proper alignment is present, it is OK to copy the bytes now. */ +L(copy_GE_32_unaligned_cont): + + /* Setup two indexes to speed up the indexed vector operations. */ + clrldi 10,r5,60 + li 6,16 /* Index for 16-bytes offsets. */ + li 7,32 /* Index for 32-bytes offsets. */ + cmpldi cr1,10,0 + srdi 8,r5,5 /* Setup the loop counter. */ + mtocrf 0x01,9 + cmpldi cr6,9,1 +#ifdef __LITTLE_ENDIAN__ + lvsr 5,0,r4 +#else + lvsl 5,0,r4 +#endif + lvx 3,0,r4 + li 0,0 + bf 31,L(setup_unaligned_loop) + + /* Copy another 16 bytes to align to 32-bytes due to the loop. */ + lvx 4,r4,6 +#ifdef __LITTLE_ENDIAN__ + vperm 6,4,3,5 +#else + vperm 6,3,4,5 +#endif + addi r4,r4,16 + stvx 6,0,r11 + addi r11,r11,16 + vor 3,4,4 + clrrdi 0,r4,60 + +L(setup_unaligned_loop): + mtctr 8 + ble cr6,L(end_unaligned_loop) + + /* Copy 32 bytes at a time using vector instructions. */ + .align 4 +L(unaligned_loop): + + /* Note: vr6/vr10 may contain data that was already copied, + but in order to get proper alignment, we may have to copy + some portions again. This is faster than having unaligned + vector instructions though. */ + + lvx 4,r4,6 +#ifdef __LITTLE_ENDIAN__ + vperm 6,4,3,5 +#else + vperm 6,3,4,5 +#endif + lvx 3,r4,7 +#ifdef __LITTLE_ENDIAN__ + vperm 10,3,4,5 +#else + vperm 10,4,3,5 +#endif + addi r4,r4,32 + stvx 6,0,r11 + stvx 10,r11,6 + addi r11,r11,32 + bdnz L(unaligned_loop) + + clrrdi 0,r4,60 + + .align 4 +L(end_unaligned_loop): + + /* Check for tail bytes. */ + mtocrf 0x01,r5 + beqlr cr1 + + add r4,r4,0 + + /* We have 1~15 tail bytes to copy, and DST is quadword aligned. */ + /* Copy 8 bytes. */ + bf 28,4f + lwz 6,0(r4) + lwz 7,4(r4) + addi r4,r4,8 + stw 6,0(r11) + stw 7,4(r11) + addi r11,r11,8 +4: /* Copy 4~7 bytes. */ + bf 29,L(tail2) + lwz 6,0(r4) + stw 6,0(r11) + bf 30,L(tail5) + lhz 7,4(r4) + sth 7,4(r11) + bflr 31 + lbz 8,6(r4) + stb 8,6(r11) + /* Return original DST pointer. */ + blr + + /* Start to memcpy backward implementation: the algorith first check if + src and dest have the same alignment and if it does align both to 16 + bytes and copy using VSX instructions. + If does not, align dest to 16 bytes and use VMX (altivec) instruction + to read two 16 bytes at time, shift/permute the bytes read and write + aligned to dest. */ +L(memmove_bwd): + cmpldi cr1,r5,31 + /* Copy is done backwards: update the pointers and check alignment. */ + add r11,r3,r5 + add r4,r4,r5 + mr r0,r11 + ble cr1, L(copy_LT_32_bwd) /* If move < 32 bytes use short move + code. */ + + andi. r10,r11,15 /* Check if r11 is aligned to 16 bytes */ + clrldi r9,r4,60 /* Check if r4 is aligned to 16 bytes */ + cmpld cr6,r10,r9 /* SRC and DST alignments match? */ + + bne cr6,L(copy_GE_32_unaligned_bwd) + beq L(aligned_copy_bwd) + + mtocrf 0x01,r0 + clrldi r0,r0,60 + +/* Get the DST and SRC aligned to 16 bytes. 
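+   This mirrors the forward prologue, but steps the one-past-the-end
+   pointers downwards.  A C sketch under the same caveats as before:
+
+	#include <string.h>
+	#include <stdint.h>
+
+	static void
+	align_dst_16_bwd (char **dst_end, const char **src_end, size_t *len)
+	{
+	  size_t pad = (uintptr_t) *dst_end & 15;	// bytes above the
+							// 16-byte boundary
+	  *len -= pad;
+	  if (pad & 1) { *dst_end -= 1; *src_end -= 1; **dst_end = **src_end; }
+	  if (pad & 2) { *dst_end -= 2; *src_end -= 2; memcpy (*dst_end, *src_end, 2); }
+	  if (pad & 4) { *dst_end -= 4; *src_end -= 4; memcpy (*dst_end, *src_end, 4); }
+	  if (pad & 8) { *dst_end -= 8; *src_end -= 8; memcpy (*dst_end, *src_end, 8); }
+	}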
*/ +1: + bf 31,2f + lbz r6,-1(r4) + subi r4,r4,1 + stb r6,-1(r11) + subi r11,r11,1 +2: + bf 30,4f + lhz r6,-2(r4) + subi r4,r4,2 + sth r6,-2(r11) + subi r11,r11,2 +4: + bf 29,8f + lwz r6,-4(r4) + subi r4,r4,4 + stw r6,-4(r11) + subi r11,r11,4 +8: + bf 28,16f + ld r6,-8(r4) + subi r4,r4,8 + std r6,-8(r11) + subi r11,r11,8 +16: + subf r5,0,r5 + +/* Main aligned copy loop. Copies 128 bytes at a time. */ +L(aligned_copy_bwd): + li r6,-16 + li r7,-32 + li r8,-48 + li r9,-64 + mtocrf 0x02,r5 + srdi r12,r5,7 + cmpdi r12,0 + beq L(aligned_tail_bwd) + lxvd2x v6,r4,r6 + lxvd2x v7,r4,r7 + mtctr 12 + b L(aligned_128loop_bwd) + + .align 4 +L(aligned_128head_bwd): + /* for the 2nd + iteration of this loop. */ + lxvd2x v6,r4,r6 + lxvd2x v7,r4,r7 +L(aligned_128loop_bwd): + lxvd2x v8,r4,r8 + lxvd2x v9,r4,r9 + stxvd2x v6,r11,r6 + subi r4,r4,64 + stxvd2x v7,r11,r7 + stxvd2x v8,r11,r8 + stxvd2x v9,r11,r9 + lxvd2x v6,r4,r6 + lxvd2x v7,r4,7 + subi r11,r11,64 + lxvd2x v8,r4,r8 + lxvd2x v9,r4,r9 + subi r4,r4,64 + stxvd2x v6,r11,r6 + stxvd2x v7,r11,r7 + stxvd2x v8,r11,r8 + stxvd2x v9,r11,r9 + subi r11,r11,64 + bdnz L(aligned_128head_bwd) + +L(aligned_tail_bwd): + mtocrf 0x01,r5 + bf 25,32f + lxvd2x v6,r4,r6 + lxvd2x v7,r4,r7 + lxvd2x v8,r4,r8 + lxvd2x v9,r4,r9 + subi r4,r4,64 + stxvd2x v6,r11,r6 + stxvd2x v7,r11,r7 + stxvd2x v8,r11,r8 + stxvd2x v9,r11,r9 + subi r11,r11,64 +32: + bf 26,16f + lxvd2x v6,r4,r6 + lxvd2x v7,r4,r7 + subi r4,r4,32 + stxvd2x v6,r11,r6 + stxvd2x v7,r11,r7 + subi r11,r11,32 +16: + bf 27,8f + lxvd2x v6,r4,r6 + subi r4,r4,16 + stxvd2x v6,r11,r6 + subi r11,r11,16 +8: + bf 28,4f + ld r6,-8(r4) + subi r4,r4,8 + std r6,-8(r11) + subi r11,r11,8 +4: /* Copies 4~7 bytes. */ + bf 29,L(tail2_bwd) + lwz r6,-4(r4) + stw r6,-4(r11) + bf 30,L(tail5_bwd) + lhz r7,-6(r4) + sth r7,-6(r11) + bflr 31 + lbz r8,-7(r4) + stb r8,-7(r11) + /* Return original DST pointer. */ + blr + +/* Handle copies of 0~31 bytes. */ + .align 4 +L(copy_LT_32_bwd): + cmpldi cr6,r5,8 + mtocrf 0x01,r5 + ble cr6,L(copy_LE_8_bwd) + + /* At least 9 bytes to go. */ + neg r8,r4 + andi. r0,r8,3 + cmpldi cr1,r5,16 + beq L(copy_LT_32_aligned_bwd) + + /* Force 4-byte alignment for SRC. */ + mtocrf 0x01,0 + subf r5,0,r5 +2: + bf 30,1f + lhz r6,-2(r4) + subi r4,r4,2 + sth r6,-2(r11) + subi r11,r11,2 +1: + bf 31,L(end_4bytes_alignment_bwd) + lbz 6,-1(r4) + subi r4,r4,1 + stb 6,-1(r11) + subi r11,r11,1 + + .align 4 +L(end_4bytes_alignment_bwd): + cmpldi cr1,r5,16 + mtocrf 0x01,r5 + +L(copy_LT_32_aligned_bwd): + /* At least 6 bytes to go, and SRC is word-aligned. */ + blt cr1,8f + + /* Copy 16 bytes. */ + lwz r6,-4(r4) + lwz r7,-8(r4) + stw r6,-4(r11) + lwz r8,-12(r4) + stw r7,-8(r11) + lwz r6,-16(r4) + subi r4,r4,16 + stw r8,-12(r11) + stw r6,-16(r11) + subi r11,r11,16 +8: /* Copy 8 bytes. */ + bf 28,L(tail4_bwd) + lwz r6,-4(r4) + lwz r7,-8(r4) + subi r4,r4,8 + stw r6,-4(r11) + stw r7,-8(r11) + subi r11,r11,8 + + .align 4 +/* Copies 4~7 bytes. */ +L(tail4_bwd): + bf 29,L(tail2_bwd) + lwz 6,-4(r4) + stw 6,-4(r11) + bf 30,L(tail5_bwd) + lhz 7,-6(r4) + sth 7,-6(r11) + bflr 31 + lbz 8,-7(r4) + stb 8,-7(r11) + /* Return original DST pointer. */ + blr + + .align 4 +/* Copies 2~3 bytes. */ +L(tail2_bwd): + bf 30,1f + lhz 6,-2(r4) + sth 6,-2(r11) + bflr 31 + lbz 7,-3(r4) + stb 7,-3(r11) + blr + + .align 4 +L(tail5_bwd): + bflr 31 + lbz 6,-5(r4) + stb 6,-5(r11) + blr + + .align 4 +1: + bflr 31 + lbz 6,-1(r4) + stb 6,-1(r11) + /* Return original DST pointer. */ + blr + + +/* Handles copies of 0~8 bytes. 
*/
+	.align	4
+L(copy_LE_8_bwd):
+	bne	cr6,L(tail4_bwd)
+
+	/* Though we could've used ld/std here, they are still
+	   slow for unaligned cases.  */
+	lwz	6,-8(r4)
+	lwz	7,-4(r4)
+	stw	6,-8(r11)
+	stw	7,-4(r11)
+	blr
+
+
+/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
+   SRC is not.  Use aligned quadword loads from SRC, shifted to realign
+   the data, allowing for aligned DST stores.  */
+	.align	4
+L(copy_GE_32_unaligned_bwd):
+	andi.	r10,r11,15	/* Check alignment of DST against 16 bytes.  */
+	srdi	r9,r5,4		/* Number of full quadwords remaining.  */
+
+	beq	L(copy_GE_32_unaligned_cont_bwd)
+
+	/* DST is not quadword aligned and r10 holds the address masked to
+	   compare alignments.  */
+	mtocrf	0x01,r10
+	subf	r5,r10,r5
+
+	/* Vector instructions work best when proper alignment (16-bytes)
+	   is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
+1:
+	bf	31,2f
+	lbz	r6,-1(r4)
+	subi	r4,r4,1
+	stb	r6,-1(r11)
+	subi	r11,r11,1
+2:
+	bf	30,4f
+	lhz	r6,-2(r4)
+	subi	r4,r4,2
+	sth	r6,-2(r11)
+	subi	r11,r11,2
+4:
+	bf	29,8f
+	lwz	r6,-4(r4)
+	subi	r4,r4,4
+	stw	r6,-4(r11)
+	subi	r11,r11,4
+8:
+	bf	28,0f
+	ld	r6,-8(r4)
+	subi	r4,r4,8
+	std	r6,-8(r11)
+	subi	r11,r11,8
+0:
+	srdi	r9,r5,4	      /* Number of full quadwords remaining.  */
+
+	/* The proper alignment is present, it is OK to copy the bytes now.  */
+L(copy_GE_32_unaligned_cont_bwd):
+
+	/* Setup two indexes to speed up the indexed vector operations.  */
+	clrldi	r10,r5,60
+	li	r6,-16	      /* Index for 16-byte offsets.  */
+	li	r7,-32	      /* Index for 32-byte offsets.  */
+	cmpldi	cr1,10,0
+	srdi	r8,r5,5	      /* Setup the loop counter.  */
+	mtocrf	0x01,9
+	cmpldi	cr6,r9,1
+#ifdef __LITTLE_ENDIAN__
+	lvsr	v5,r0,r4
+#else
+	lvsl	v5,r0,r4
+#endif
+	lvx	v3,0,r4
+	li	r0,0
+	bf	31,L(setup_unaligned_loop_bwd)
+
+	/* Copy another 16 bytes to align to 32-bytes due to the loop.  */
+	lvx	v4,r4,r6
+#ifdef __LITTLE_ENDIAN__
+	vperm	v6,v3,v4,v5
+#else
+	vperm	v6,v4,v3,v5
+#endif
+	subi	r4,r4,16
+	stvx	v6,r11,r6
+	subi	r11,r11,16
+	vor	v3,v4,v4
+	clrrdi	r0,r4,60
+
+L(setup_unaligned_loop_bwd):
+	mtctr	r8
+	ble	cr6,L(end_unaligned_loop_bwd)
+
+	/* Copy 32 bytes at a time using vector instructions.  */
+	.align	4
+L(unaligned_loop_bwd):
+
+	/* Note: vr6/vr10 may contain data that was already copied,
+	   but in order to get proper alignment, we may have to copy
+	   some portions again.  This is faster than having unaligned
+	   vector instructions though.  */
+
+	lvx	v4,r4,r6
+#ifdef __LITTLE_ENDIAN__
+	vperm	v6,v3,v4,v5
+#else
+	vperm	v6,v4,v3,v5
+#endif
+	lvx	v3,r4,r7
+#ifdef __LITTLE_ENDIAN__
+	vperm	v10,v4,v3,v5
+#else
+	vperm	v10,v3,v4,v5
+#endif
+	subi	r4,r4,32
+	stvx	v6,r11,r6
+	stvx	v10,r11,r7
+	subi	r11,r11,32
+	bdnz	L(unaligned_loop_bwd)
+
+	clrrdi	r0,r4,60
+
+	.align	4
+L(end_unaligned_loop_bwd):
+
+	/* Check for tail bytes.  */
+	mtocrf	0x01,r5
+	beqlr	cr1
+
+	add	r4,r4,0
+
+	/* We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
+	/* Copy 8 bytes.  */
+	bf	28,4f
+	lwz	r6,-4(r4)
+	lwz	r7,-8(r4)
+	subi	r4,r4,8
+	stw	r6,-4(r11)
+	stw	r7,-8(r11)
+	subi	r11,r11,8
+4:	/* Copy 4~7 bytes.  */
+	bf	29,L(tail2_bwd)
+	lwz	r6,-4(r4)
+	stw	r6,-4(r11)
+	bf	30,L(tail5_bwd)
+	lhz	r7,-6(r4)
+	sth	r7,-6(r11)
+	bflr	31
+	lbz	r8,-7(r4)
+	stb	r8,-7(r11)
+	/* Return original DST pointer.  */
+	blr
+END_GEN_TB (MEMMOVE, TB_TOCLESS)
+libc_hidden_builtin_def (memmove)
+
+
+/* void bcopy (const void *src [r3], void *dest [r4], size_t n [r5])
+   Implemented in this file to avoid the linker creating a stub
+   function call for the branch to '_memmove'.
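+
+   In C terms the wrapper is simply the following (sketch; note the
+   swapped argument order relative to memmove):
+
+	#include <string.h>
+
+	void
+	bcopy_sketch (const void *src, void *dest, size_t n)
+	{
+	  memmove (dest, src, n);	// swap src/dest, then do a memmove
+	}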
*/ +ENTRY (__bcopy) + mr r6,r3 + mr r3,r4 + mr r4,r6 + b L(_memmove) +END (__bcopy) +weak_alias (__bcopy, bcopy) diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/mempcpy.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/mempcpy.S new file mode 100644 index 0000000000..4e15d1e40c --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/mempcpy.S @@ -0,0 +1,472 @@ +/* Optimized mempcpy implementation for POWER7. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Luis Machado <luisgpm@br.ibm.com>. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + + +/* __ptr_t [r3] __mempcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]); + Returns 'dst' + 'len'. */ + +#ifndef MEMPCPY +# define MEMPCPY __mempcpy +#endif + .machine power7 +EALIGN (MEMPCPY, 5, 0) + CALL_MCOUNT 3 + + cmpldi cr1,5,31 + neg 0,3 + std 3,-16(1) + std 31,-8(1) + cfi_offset(31,-8) + ble cr1,L(copy_LT_32) /* If move < 32 bytes use short move + code. */ + + andi. 11,3,7 /* Check alignment of DST. */ + + + clrldi 10,4,61 /* Check alignment of SRC. */ + cmpld cr6,10,11 /* SRC and DST alignments match? */ + mr 12,4 + mr 31,5 + bne cr6,L(copy_GE_32_unaligned) + + srdi 9,5,3 /* Number of full quadwords remaining. */ + + beq L(copy_GE_32_aligned_cont) + + clrldi 0,0,61 + mtcrf 0x01,0 + subf 31,0,5 + + /* Get the SRC aligned to 8 bytes. */ + +1: bf 31,2f + lbz 6,0(12) + addi 12,12,1 + stb 6,0(3) + addi 3,3,1 +2: bf 30,4f + lhz 6,0(12) + addi 12,12,2 + sth 6,0(3) + addi 3,3,2 +4: bf 29,0f + lwz 6,0(12) + addi 12,12,4 + stw 6,0(3) + addi 3,3,4 +0: + clrldi 10,12,61 /* Check alignment of SRC again. */ + srdi 9,31,3 /* Number of full doublewords remaining. */ + +L(copy_GE_32_aligned_cont): + + clrldi 11,31,61 + mtcrf 0x01,9 + + srdi 8,31,5 + cmpldi cr1,9,4 + cmpldi cr6,11,0 + mr 11,12 + + /* Copy 1~3 doublewords so the main loop starts + at a multiple of 32 bytes. */ + + bf 30,1f + ld 6,0(12) + ld 7,8(12) + addi 11,12,16 + mtctr 8 + std 6,0(3) + std 7,8(3) + addi 10,3,16 + bf 31,4f + ld 0,16(12) + std 0,16(3) + blt cr1,3f + addi 11,12,24 + addi 10,3,24 + b 4f + + .align 4 +1: /* Copy 1 doubleword and set the counter. */ + mr 10,3 + mtctr 8 + bf 31,4f + ld 6,0(12) + addi 11,12,8 + std 6,0(3) + addi 10,3,8 + + /* Main aligned copy loop. Copies 32-bytes at a time. */ + .align 4 +4: + ld 6,0(11) + ld 7,8(11) + ld 8,16(11) + ld 0,24(11) + addi 11,11,32 + + std 6,0(10) + std 7,8(10) + std 8,16(10) + std 0,24(10) + addi 10,10,32 + bdnz 4b +3: + + /* Check for tail bytes. */ + rldicr 0,31,0,60 + mtcrf 0x01,31 + beq cr6,0f + +.L9: + add 3,3,0 + add 12,12,0 + + /* At this point we have a tail of 0-7 bytes and we know that the + destination is doubleword-aligned. */ +4: /* Copy 4 bytes. */ + bf 29,2f + + lwz 6,0(12) + addi 12,12,4 + stw 6,0(3) + addi 3,3,4 +2: /* Copy 2 bytes. 
*/ + bf 30,1f + + lhz 6,0(12) + addi 12,12,2 + sth 6,0(3) + addi 3,3,2 +1: /* Copy 1 byte. */ + bf 31,0f + + lbz 6,0(12) + stb 6,0(3) +0: /* Return DST + LEN pointer. */ + ld 31,-8(1) + ld 3,-16(1) + add 3,3,5 + blr + + /* Handle copies of 0~31 bytes. */ + .align 4 +L(copy_LT_32): + cmpldi cr6,5,8 + mr 12,4 + mtcrf 0x01,5 + ble cr6,L(copy_LE_8) + + /* At least 9 bytes to go. */ + neg 8,4 + clrrdi 11,4,2 + andi. 0,8,3 + cmpldi cr1,5,16 + mr 10,5 + beq L(copy_LT_32_aligned) + + /* Force 4-bytes alignment for SRC. */ + mtocrf 0x01,0 + subf 10,0,5 +2: bf 30,1f + + lhz 6,0(12) + addi 12,12,2 + sth 6,0(3) + addi 3,3,2 +1: bf 31,L(end_4bytes_alignment) + + lbz 6,0(12) + addi 12,12,1 + stb 6,0(3) + addi 3,3,1 + + .align 4 +L(end_4bytes_alignment): + cmpldi cr1,10,16 + mtcrf 0x01,10 + +L(copy_LT_32_aligned): + /* At least 6 bytes to go, and SRC is word-aligned. */ + blt cr1,8f + + /* Copy 16 bytes. */ + lwz 6,0(12) + lwz 7,4(12) + stw 6,0(3) + lwz 8,8(12) + stw 7,4(3) + lwz 6,12(12) + addi 12,12,16 + stw 8,8(3) + stw 6,12(3) + addi 3,3,16 +8: /* Copy 8 bytes. */ + bf 28,4f + + lwz 6,0(12) + lwz 7,4(12) + addi 12,12,8 + stw 6,0(3) + stw 7,4(3) + addi 3,3,8 +4: /* Copy 4 bytes. */ + bf 29,2f + + lwz 6,0(12) + addi 12,12,4 + stw 6,0(3) + addi 3,3,4 +2: /* Copy 2-3 bytes. */ + bf 30,1f + + lhz 6,0(12) + sth 6,0(3) + bf 31,0f + lbz 7,2(12) + stb 7,2(3) + ld 3,-16(1) + add 3,3,5 + blr + + .align 4 +1: /* Copy 1 byte. */ + bf 31,0f + + lbz 6,0(12) + stb 6,0(3) +0: /* Return DST + LEN pointer. */ + ld 3,-16(1) + add 3,3,5 + blr + + /* Handles copies of 0~8 bytes. */ + .align 4 +L(copy_LE_8): + bne cr6,4f + + /* Though we could've used ld/std here, they are still + slow for unaligned cases. */ + + lwz 6,0(4) + lwz 7,4(4) + stw 6,0(3) + stw 7,4(3) + ld 3,-16(1) /* Return DST + LEN pointer. */ + add 3,3,5 + blr + + .align 4 +4: /* Copies 4~7 bytes. */ + bf 29,2b + + lwz 6,0(4) + stw 6,0(3) + bf 30,5f + lhz 7,4(4) + sth 7,4(3) + bf 31,0f + lbz 8,6(4) + stb 8,6(3) + ld 3,-16(1) + add 3,3,5 + blr + + .align 4 +5: /* Copy 1 byte. */ + bf 31,0f + + lbz 6,4(4) + stb 6,4(3) + +0: /* Return DST + LEN pointer. */ + ld 3,-16(1) + add 3,3,5 + blr + + /* Handle copies of 32+ bytes where DST is aligned (to quadword) but + SRC is not. Use aligned quadword loads from SRC, shifted to realign + the data, allowing for aligned DST stores. */ + .align 4 +L(copy_GE_32_unaligned): + clrldi 0,0,60 /* Number of bytes until the 1st + quadword. */ + andi. 11,3,15 /* Check alignment of DST (against + quadwords). */ + srdi 9,5,4 /* Number of full quadwords remaining. */ + + beq L(copy_GE_32_unaligned_cont) + + /* SRC is not quadword aligned, get it aligned. */ + + mtcrf 0x01,0 + subf 31,0,5 + + /* Vector instructions work best when proper alignment (16-bytes) + is present. Move 0~15 bytes as needed to get DST quadword-aligned. */ +1: /* Copy 1 byte. */ + bf 31,2f + + lbz 6,0(12) + addi 12,12,1 + stb 6,0(3) + addi 3,3,1 +2: /* Copy 2 bytes. */ + bf 30,4f + + lhz 6,0(12) + addi 12,12,2 + sth 6,0(3) + addi 3,3,2 +4: /* Copy 4 bytes. */ + bf 29,8f + + lwz 6,0(12) + addi 12,12,4 + stw 6,0(3) + addi 3,3,4 +8: /* Copy 8 bytes. */ + bf 28,0f + + ld 6,0(12) + addi 12,12,8 + std 6,0(3) + addi 3,3,8 +0: + clrldi 10,12,60 /* Check alignment of SRC. */ + srdi 9,31,4 /* Number of full quadwords remaining. */ + + /* The proper alignment is present, it is OK to copy the bytes now. */ +L(copy_GE_32_unaligned_cont): + + /* Setup two indexes to speed up the indexed vector operations. */ + clrldi 11,31,60 + li 6,16 /* Index for 16-bytes offsets. 
*/ + li 7,32 /* Index for 32-bytes offsets. */ + cmpldi cr1,11,0 + srdi 8,31,5 /* Setup the loop counter. */ + mr 10,3 + mr 11,12 + mtcrf 0x01,9 + cmpldi cr6,9,1 +#ifdef __LITTLE_ENDIAN__ + lvsr 5,0,12 +#else + lvsl 5,0,12 +#endif + lvx 3,0,12 + bf 31,L(setup_unaligned_loop) + + /* Copy another 16 bytes to align to 32-bytes due to the loop . */ + lvx 4,12,6 +#ifdef __LITTLE_ENDIAN__ + vperm 6,4,3,5 +#else + vperm 6,3,4,5 +#endif + addi 11,12,16 + addi 10,3,16 + stvx 6,0,3 + vor 3,4,4 + +L(setup_unaligned_loop): + mtctr 8 + ble cr6,L(end_unaligned_loop) + + /* Copy 32 bytes at a time using vector instructions. */ + .align 4 +L(unaligned_loop): + + /* Note: vr6/vr10 may contain data that was already copied, + but in order to get proper alignment, we may have to copy + some portions again. This is faster than having unaligned + vector instructions though. */ + + lvx 4,11,6 /* vr4 = r11+16. */ +#ifdef __LITTLE_ENDIAN__ + vperm 6,4,3,5 +#else + vperm 6,3,4,5 +#endif + lvx 3,11,7 /* vr3 = r11+32. */ +#ifdef __LITTLE_ENDIAN__ + vperm 10,3,4,5 +#else + vperm 10,4,3,5 +#endif + addi 11,11,32 + stvx 6,0,10 + stvx 10,10,6 + addi 10,10,32 + + bdnz L(unaligned_loop) + + .align 4 +L(end_unaligned_loop): + + /* Check for tail bytes. */ + rldicr 0,31,0,59 + mtcrf 0x01,31 + beq cr1,0f + + add 3,3,0 + add 12,12,0 + + /* We have 1~15 tail bytes to copy, and DST is quadword aligned. */ +8: /* Copy 8 bytes. */ + bf 28,4f + + lwz 6,0(12) + lwz 7,4(12) + addi 12,12,8 + stw 6,0(3) + stw 7,4(3) + addi 3,3,8 +4: /* Copy 4 bytes. */ + bf 29,2f + + lwz 6,0(12) + addi 12,12,4 + stw 6,0(3) + addi 3,3,4 +2: /* Copy 2~3 bytes. */ + bf 30,1f + + lhz 6,0(12) + addi 12,12,2 + sth 6,0(3) + addi 3,3,2 +1: /* Copy 1 byte. */ + bf 31,0f + + lbz 6,0(12) + stb 6,0(3) +0: /* Return DST + LEN pointer. */ + ld 31,-8(1) + ld 3,-16(1) + add 3,3,5 + blr + +END_GEN_TB (MEMPCPY,TB_TOCLESS) +libc_hidden_def (__mempcpy) +weak_alias (__mempcpy, mempcpy) +libc_hidden_builtin_def (mempcpy) diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/memrchr.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/memrchr.S new file mode 100644 index 0000000000..4276768915 --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/memrchr.S @@ -0,0 +1,201 @@ +/* Optimized memrchr implementation for PowerPC64/POWER7 using cmpb insn. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Luis Machado <luisgpm@br.ibm.com>. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +/* int [r3] memrchr (char *s [r3], int byte [r4], int size [r5]) */ + +#ifndef MEMRCHR +# define MEMRCHR __memrchr +#endif + .machine power7 +ENTRY (MEMRCHR) + CALL_MCOUNT 3 + add r7,r3,r5 /* Calculate the last acceptable address. */ + neg r0,r7 + addi r7,r7,-1 + mr r10,r3 + clrrdi r6,r7,7 + li r9,3<<5 + dcbt r9,r6,8 /* Stream hint, decreasing addresses. 
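+   From here the function scans backwards one doubleword at a time,
+   using cmpb to flag every byte equal to BYTE.  A portable C sketch of
+   the idea (byte_match stands in for cmpb, flagging equal bytes with
+   0x80 rather than 0xff, which is enough to locate them; little-endian
+   byte numbering is assumed in the bit scan, matching the
+   __LITTLE_ENDIAN__ paths below):
+
+	#include <stdint.h>
+	#include <stddef.h>
+	#include <string.h>
+
+	static inline uint64_t
+	byte_match (uint64_t w, uint64_t pat)
+	{
+	  uint64_t x = w ^ pat;	// bytes equal to PAT become 0x00
+	  return ~(((x & 0x7f7f7f7f7f7f7f7fULL) + 0x7f7f7f7f7f7f7f7fULL)
+		   | x | 0x7f7f7f7f7f7f7f7fULL);	// 0x80 per 0x00 byte
+	}
+
+	static void *
+	memrchr_sketch (const void *s, int c, size_t n)
+	{
+	  const unsigned char *p = (const unsigned char *) s + n;
+	  uint64_t pat = 0x0101010101010101ULL * (unsigned char) c;
+
+	  while (n > 0 && ((uintptr_t) p & 7) != 0)	// unaligned tail
+	    {
+	      n--;
+	      if (*--p == (unsigned char) c)
+		return (void *) p;
+	    }
+	  while (n >= 8)				// whole doublewords
+	    {
+	      uint64_t w, m;
+	      p -= 8;
+	      n -= 8;
+	      memcpy (&w, p, 8);
+	      m = byte_match (w, pat);
+	      if (m != 0)	// highest flagged byte = last occurrence
+		return (void *) (p + (63 - __builtin_clzll (m)) / 8);
+	    }
+	  while (n-- > 0)				// leftover head bytes
+	    if (*--p == (unsigned char) c)
+	      return (void *) p;
+	  return NULL;
+	}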
*/ + + /* Replicate BYTE to doubleword. */ + insrdi r4,r4,8,48 + insrdi r4,r4,16,32 + insrdi r4,r4,32,0 + li r6,-8 + li r9,-1 + rlwinm r0,r0,3,26,28 /* Calculate padding. */ + clrrdi r8,r7,3 + srd r9,r9,r0 + cmpldi r5,32 + clrrdi r0,r10,3 + ble L(small_range) + +#ifdef __LITTLE_ENDIAN__ + ldx r12,0,r8 +#else + ldbrx r12,0,r8 /* Load reversed doubleword from memory. */ +#endif + cmpb r3,r12,r4 /* Check for BYTE in DWORD1. */ + and r3,r3,r9 + cmpldi cr7,r3,0 /* If r3 == 0, no BYTEs have been found. */ + bne cr7,L(done) + + mtcrf 0x01,r8 + /* Are we now aligned to a quadword boundary? If so, skip to + the main loop. Otherwise, go through the alignment code. */ + bf 28,L(loop_setup) + + /* Handle DWORD2 of pair. */ +#ifdef __LITTLE_ENDIAN__ + ldx r12,r8,r6 +#else + ldbrx r12,r8,r6 +#endif + addi r8,r8,-8 + cmpb r3,r12,r4 + cmpldi cr7,r3,0 + bne cr7,L(done) + +L(loop_setup): + /* The last dword we want to read in the loop below is the one + containing the first byte of the string, ie. the dword at + s & ~7, or r0. The first dword read is at r8 - 8, we + read 2 * cnt dwords, so the last dword read will be at + r8 - 8 - 16 * cnt + 8. Solving for cnt gives + cnt = (r8 - r0) / 16 */ + sub r5,r8,r0 + addi r8,r8,-8 + srdi r9,r5,4 /* Number of loop iterations. */ + mtctr r9 /* Setup the counter. */ + + /* Main loop to look for BYTE backwards in the string. + FIXME: Investigate whether 32 byte align helps with this + 9 instruction loop. */ + .align 5 +L(loop): + /* Load two doublewords, compare and merge in a + single register for speed. This is an attempt + to speed up the byte-checking process for bigger strings. */ + +#ifdef __LITTLE_ENDIAN__ + ldx r12,0,r8 + ldx r11,r8,r6 +#else + ldbrx r12,0,r8 + ldbrx r11,r8,r6 +#endif + cmpb r3,r12,r4 + cmpb r9,r11,r4 + or r5,r9,r3 /* Merge everything in one doubleword. */ + cmpldi cr7,r5,0 + bne cr7,L(found) + addi r8,r8,-16 + bdnz L(loop) + + /* We may have one more word to read. */ + cmpld r8,r0 + bnelr + +#ifdef __LITTLE_ENDIAN__ + ldx r12,0,r8 +#else + ldbrx r12,0,r8 +#endif + cmpb r3,r12,r4 + cmpldi cr7,r3,0 + bne cr7,L(done) + blr + + .align 4 +L(found): + /* OK, one (or both) of the dwords contains BYTE. Check + the first dword. */ + cmpldi cr6,r3,0 + bne cr6,L(done) + + /* BYTE must be in the second word. Adjust the address + again and move the result of cmpb to r3 so we can calculate the + pointer. */ + + mr r3,r9 + addi r8,r8,-8 + + /* r3 has the output of the cmpb instruction, that is, it contains + 0xff in the same position as BYTE in the original + word from the string. Use that to calculate the pointer. + We need to make sure BYTE is *before* the end of the + range. */ +L(done): + cntlzd r9,r3 /* Count leading zeros before the match. */ + cmpld r8,r0 /* Are we on the last word? */ + srdi r6,r9,3 /* Convert leading zeros to bytes. */ + addi r0,r6,-7 + sub r3,r8,r0 + cmpld cr7,r3,r10 + bnelr + bgelr cr7 + li r3,0 + blr + + .align 4 +L(null): + li r3,0 + blr + +/* Deals with size <= 32. */ + .align 4 +L(small_range): + cmpldi r5,0 + beq L(null) + +#ifdef __LITTLE_ENDIAN__ + ldx r12,0,r8 +#else + ldbrx r12,0,r8 /* Load reversed doubleword from memory. */ +#endif + cmpb r3,r12,r4 /* Check for BYTE in DWORD1. */ + and r3,r3,r9 + cmpldi cr7,r3,0 + bne cr7,L(done) + + /* Are we done already? 
*/ + cmpld r8,r0 + addi r8,r8,-8 + beqlr + + .align 5 +L(loop_small): +#ifdef __LITTLE_ENDIAN__ + ldx r12,0,r8 +#else + ldbrx r12,0,r8 +#endif + cmpb r3,r12,r4 + cmpld r8,r0 + cmpldi cr7,r3,0 + bne cr7,L(done) + addi r8,r8,-8 + bne L(loop_small) + blr + +END (MEMRCHR) +weak_alias (__memrchr, memrchr) +libc_hidden_builtin_def (memrchr) diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/memset.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/memset.S new file mode 100644 index 0000000000..21933c0672 --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/memset.S @@ -0,0 +1,399 @@ +/* Optimized memset implementation for PowerPC64/POWER7. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Luis Machado <luisgpm@br.ibm.com>. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5])); + Returns 's'. */ + +#ifndef MEMSET +# define MEMSET memset +#endif + .machine power7 +EALIGN (MEMSET, 5, 0) + CALL_MCOUNT 3 + +L(_memset): + cmpldi cr7,5,31 + cmpldi cr6,5,8 + mr 10,3 + + /* Replicate byte to word. */ + insrdi 4,4,8,48 + insrdi 4,4,16,32 + ble cr6,L(small) /* If length <= 8, use short copy code. */ + + neg 0,3 + ble cr7,L(medium) /* If length < 32, use medium copy code. */ + + andi. 11,10,7 /* Check alignment of SRC. */ + insrdi 4,4,32,0 /* Replicate word to double word. */ + + mr 12,5 + beq L(big_aligned) + + clrldi 0,0,61 + mtocrf 0x01,0 + subf 5,0,5 + + /* Get DST aligned to 8 bytes. */ +1: bf 31,2f + + stb 4,0(10) + addi 10,10,1 +2: bf 30,4f + + sth 4,0(10) + addi 10,10,2 +4: bf 29,L(big_aligned) + + stw 4,0(10) + addi 10,10,4 + + .align 4 +L(big_aligned): + + cmpldi cr5,5,255 + li 0,32 + dcbtst 0,10 + cmpldi cr6,4,0 + srdi 9,5,3 /* Number of full doublewords remaining. */ + crand 27,26,21 + mtocrf 0x01,9 + bt 27,L(huge) + + /* From this point on, we'll copy 32+ bytes and the value + isn't 0 (so we can't use dcbz). */ + + srdi 8,5,5 + clrldi 11,5,61 + cmpldi cr6,11,0 + cmpldi cr1,9,4 + mtctr 8 + + /* Copy 1~3 doublewords so the main loop starts + at a multiple of 32 bytes. */ + + bf 30,1f + + std 4,0(10) + std 4,8(10) + addi 10,10,16 + bf 31,L(big_loop) + + std 4,0(10) + addi 10,10,8 + mr 12,10 + blt cr1,L(tail_bytes) + b L(big_loop) + + .align 4 +1: /* Copy 1 doubleword. */ + bf 31,L(big_loop) + + std 4,0(10) + addi 10,10,8 + + /* Main aligned copy loop. Copies 32-bytes at a time and + ping-pong through r10 and r12 to avoid AGEN delays. */ + .align 4 +L(big_loop): + addi 12,10,32 + std 4,0(10) + std 4,8(10) + std 4,16(10) + std 4,24(10) + bdz L(tail_bytes) + + addi 10,10,64 + std 4,0(12) + std 4,8(12) + std 4,16(12) + std 4,24(12) + bdnz L(big_loop) + + mr 12,10 + b L(tail_bytes) + + .align 4 +L(tail_bytes): + + /* Check for tail bytes. 
*/
+	beqlr	cr6
+
+	clrldi	0,5,61
+	mtocrf	0x01,0
+
+	/* At this point we have a tail of 0-7 bytes and we know that the
+	   destination is doubleword-aligned.  */
+4:	/* Copy 4 bytes.  */
+	bf	29,2f
+
+	stw	4,0(12)
+	addi	12,12,4
+2:	/* Copy 2 bytes.  */
+	bf	30,1f
+
+	sth	4,0(12)
+	addi	12,12,2
+1:	/* Copy 1 byte.  */
+	bflr	31
+
+	stb	4,0(12)
+	blr
+
+	/* Special case when value is 0 and we have a long length to deal
+	   with.  Use dcbz to zero out 128 bytes at a time.  Before using
+	   dcbz though, we need to get the destination 128-byte aligned.  */
+	.align	4
+L(huge):
+	andi.	11,10,127
+	neg	0,10
+	beq	L(huge_aligned)
+
+	clrldi	0,0,57
+	subf	5,0,5
+	srdi	0,0,3
+	mtocrf	0x01,0
+
+	/* Get DST aligned to 128 bytes.  */
+8:	bf	28,4f
+
+	std	4,0(10)
+	std	4,8(10)
+	std	4,16(10)
+	std	4,24(10)
+	std	4,32(10)
+	std	4,40(10)
+	std	4,48(10)
+	std	4,56(10)
+	addi	10,10,64
+	.align	4
+4:	bf	29,2f
+
+	std	4,0(10)
+	std	4,8(10)
+	std	4,16(10)
+	std	4,24(10)
+	addi	10,10,32
+	.align	4
+2:	bf	30,1f
+
+	std	4,0(10)
+	std	4,8(10)
+	addi	10,10,16
+	.align	4
+1:	bf	31,L(huge_aligned)
+
+	std	4,0(10)
+	addi	10,10,8
+
+
+L(huge_aligned):
+	srdi	8,5,7
+	clrldi	11,5,57
+	cmpldi	cr6,11,0
+	mtctr	8
+
+	.align	4
+L(huge_loop):
+	dcbz	0,10
+	addi	10,10,128
+	bdnz	L(huge_loop)
+
+	/* Check how many bytes are still left.  */
+	beqlr	cr6
+
+	subf	9,3,10
+	subf	5,9,12
+	srdi	8,5,3
+	cmpldi	cr6,8,0
+	mtocrf	0x01,8
+
+	/* We have a tail of 1~127 bytes.  Copy up to 15 doublewords for
+	   speed.  We'll handle the resulting tail bytes later.  */
+	beq	cr6,L(tail)
+
+8:	bf	28,4f
+
+	std	4,0(10)
+	std	4,8(10)
+	std	4,16(10)
+	std	4,24(10)
+	std	4,32(10)
+	std	4,40(10)
+	std	4,48(10)
+	std	4,56(10)
+	addi	10,10,64
+	.align	4
+4:	bf	29,2f
+
+	std	4,0(10)
+	std	4,8(10)
+	std	4,16(10)
+	std	4,24(10)
+	addi	10,10,32
+	.align	4
+2:	bf	30,1f
+
+	std	4,0(10)
+	std	4,8(10)
+	addi	10,10,16
+	.align	4
+1:	bf	31,L(tail)
+
+	std	4,0(10)
+	addi	10,10,8
+
+	/* Handle the rest of the tail bytes here.  */
+L(tail):
+	mtocrf	0x01,5
+
+	.align	4
+4:	bf	29,2f
+
+	stw	4,0(10)
+	addi	10,10,4
+	.align	4
+2:	bf	30,1f
+
+	sth	4,0(10)
+	addi	10,10,2
+	.align	4
+1:	bflr	31
+
+	stb	4,0(10)
+	blr
+
+	/* Expanded tree to copy tail bytes without increments.  */
+	.align	4
+L(copy_tail):
+	bf	29,L(FXX)
+
+	stw	4,0(10)
+	bf	30,L(TFX)
+
+	sth	4,4(10)
+	bflr	31
+
+	stb	4,6(10)
+	blr
+
+	.align	4
+L(FXX):	bf	30,L(FFX)
+
+	sth	4,0(10)
+	bflr	31
+
+	stb	4,2(10)
+	blr
+
+	.align	4
+L(TFX):	bflr	31
+
+	stb	4,4(10)
+	blr
+
+	.align	4
+L(FFX):	bflr	31
+
+	stb	4,0(10)
+	blr
+
+	/* Handle copies of 9~31 bytes.  */
+	.align	4
+L(medium):
+	/* At least 9 bytes to go.  */
+	andi.	11,10,3
+	clrldi	0,0,62
+	beq	L(medium_aligned)
+
+	/* Force 4-byte alignment for DST.  */
+	mtocrf	0x01,0
+	subf	5,0,5
+1:	/* Copy 1 byte.  */
+	bf	31,2f
+
+	stb	4,0(10)
+	addi	10,10,1
+2:	/* Copy 2 bytes.  */
+	bf	30,L(medium_aligned)
+
+	sth	4,0(10)
+	addi	10,10,2
+
+	.align	4
+L(medium_aligned):
+	/* At least 6 bytes to go, and DST is word-aligned.  */
+	cmpldi	cr1,5,16
+	mtocrf	0x01,5
+	blt	cr1,8f
+
+	/* Copy 16 bytes.  */
+	stw	4,0(10)
+	stw	4,4(10)
+	stw	4,8(10)
+	stw	4,12(10)
+	addi	10,10,16
+8:	/* Copy 8 bytes.  */
+	bf	28,4f
+
+	stw	4,0(10)
+	stw	4,4(10)
+	addi	10,10,8
+4:	/* Copy 4 bytes.  */
+	bf	29,2f
+
+	stw	4,0(10)
+	addi	10,10,4
+2:	/* Copy 2-3 bytes.  */
+	bf	30,1f
+
+	sth	4,0(10)
+	addi	10,10,2
+1:	/* Copy 1 byte.  */
+	bflr	31
+
+	stb	4,0(10)
+	blr
+
+	/* Handles copies of 0~8 bytes.
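+	   At entry cr6 still holds the LEN == 8 comparison from the
+	   function prologue, and the value has already been replicated
+	   into a 32-bit word.  A C sketch of this path (names
+	   illustrative):
+
+		#include <string.h>
+		#include <stdint.h>
+
+		static void
+		memset_small (unsigned char *p, uint32_t word, size_t len)
+		{
+		  if (len == 8)
+		    {
+		      memcpy (p, &word, 4);
+		      memcpy (p + 4, &word, 4);
+		      return;
+		    }
+		  // The L(copy_tail) tree: 4/2/1-byte stores picked by
+		  // the bits of LEN (all bytes of WORD are equal, so any
+		  // partial store writes the right value).
+		  if (len & 4) { memcpy (p, &word, 4); p += 4; }
+		  if (len & 2) { memcpy (p, &word, 2); p += 2; }
+		  if (len & 1) *p = (unsigned char) word;
+		}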
*/ + .align 4 +L(small): + mtocrf 0x01,5 + bne cr6,L(copy_tail) + + stw 4,0(10) + stw 4,4(10) + blr + +END_GEN_TB (MEMSET,TB_TOCLESS) +libc_hidden_builtin_def (memset) + +/* Copied from bzero.S to prevent the linker from inserting a stub + between bzero and memset. */ +ENTRY (__bzero) + CALL_MCOUNT 3 + mr r5,r4 + li r4,0 + b L(_memset) +END (__bzero) +#ifndef __bzero +weak_alias (__bzero, bzero) +#endif diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/multiarch/Implies b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/multiarch/Implies new file mode 100644 index 0000000000..bf5d6171a5 --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/multiarch/Implies @@ -0,0 +1 @@ +powerpc/powerpc64/power6/multiarch diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/rawmemchr.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/rawmemchr.S new file mode 100644 index 0000000000..48afb75943 --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/rawmemchr.S @@ -0,0 +1,115 @@ +/* Optimized rawmemchr implementation for PowerPC64/POWER7 using cmpb insn. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Luis Machado <luisgpm@br.ibm.com>. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +/* int [r3] rawmemchr (void *s [r3], int c [r4]) */ + +#ifndef RAWMEMCHR +# define RAWMEMCHR __rawmemchr +#endif + .machine power7 +ENTRY (RAWMEMCHR) + CALL_MCOUNT 2 + dcbt 0,r3 + clrrdi r8,r3,3 /* Align the address to doubleword boundary. */ + + /* Replicate byte to doubleword. */ + insrdi r4,r4,8,48 + insrdi r4,r4,16,32 + insrdi r4,r4,32,0 + + /* Now r4 has a doubleword of c bytes. */ + + rlwinm r6,r3,3,26,28 /* Calculate padding. */ + ld r12,0(r8) /* Load doubleword from memory. */ + cmpb r5,r12,r4 /* Compare each byte against c byte. */ +#ifdef __LITTLE_ENDIAN__ + srd r5,r5,r6 + sld r5,r5,r6 +#else + sld r5,r5,r6 /* Move left to discard ignored bits. */ + srd r5,r5,r6 /* Bring the bits back as zeros. */ +#endif + cmpdi cr7,r5,0 /* If r5 == 0, no c bytes have been found. */ + bne cr7,L(done) + + mtcrf 0x01,r8 + + /* Are we now aligned to a quadword boundary? If so, skip to + the main loop. Otherwise, go through the alignment code. */ + + bt 28,L(loop) + + /* Handle DWORD2 of pair. */ + ldu r12,8(r8) + cmpb r5,r12,r4 + cmpdi cr7,r5,0 + bne cr7,L(done) + b L(loop) /* We branch here (rather than falling through) + to skip the nops due to heavy alignment + of the loop below. */ + + /* Main loop to look for the end of the string. Since it's a + small loop (< 8 instructions), align it to 32-bytes. */ + .p2align 5 +L(loop): + /* Load two doublewords, compare and merge in a + single register for speed. This is an attempt + to speed up the byte-checking process for bigger strings. 
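+	   In C the scan is roughly the following sketch (byte_match as
+	   in the memrchr sketch earlier; little-endian bit scan assumed,
+	   matching the __LITTLE_ENDIAN__ path below; the assembly masks
+	   the first, possibly partial, doubleword instead of byte-
+	   stepping to alignment):
+
+		#include <stdint.h>
+		#include <string.h>
+
+		static const void *
+		rawmemchr_sketch (const void *s, int c)
+		{
+		  const unsigned char *p = s;
+		  uint64_t pat = 0x0101010101010101ULL * (unsigned char) c;
+
+		  for (; ((uintptr_t) p & 7) != 0; p++)	// reach alignment
+		    if (*p == (unsigned char) c)
+		      return p;
+		  for (;; p += 8)
+		    {
+		      uint64_t w, m;
+		      memcpy (&w, p, 8);
+		      m = byte_match (w, pat);
+		      if (m != 0)	// lowest flagged byte = first match
+			return p + __builtin_ctzll (m) / 8;
+		    }
+		}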
*/ + ld r12,8(r8) + ldu r11,16(r8) + cmpb r5,r12,r4 + cmpb r6,r11,r4 + or r7,r5,r6 + cmpdi cr7,r7,0 + beq cr7,L(loop) + + /* OK, one (or both) of the doublewords contains a 'c' byte. Check + the first doubleword and decrement the address in case the first + doubleword really contains a c byte. */ + + cmpdi cr6,r5,0 + addi r8,r8,-8 + bne cr6,L(done) + + /* The 'c' byte must be in the second doubleword. Adjust the address + again and move the result of cmpb to r10 so we can calculate the + pointer. */ + mr r5,r6 + addi r8,r8,8 + + /* r5 has the output of the cmpb instruction, that is, it contains + 0xff in the same position as the 'c' byte in the original + doubleword from the string. Use that fact to find out what is + the position of the byte inside the string. */ +L(done): +#ifdef __LITTLE_ENDIAN__ + addi r0,r5,-1 + andc r0,r0,r5 + popcntd r0,r0 /* Count trailing zeros. */ +#else + cntlzd r0,r5 /* Count leading zeros before the match. */ +#endif + srdi r0,r0,3 /* Convert leading zeros to bytes. */ + add r3,r8,r0 /* Return address of the matching char. */ + blr +END (RAWMEMCHR) +weak_alias (__rawmemchr,rawmemchr) +libc_hidden_builtin_def (__rawmemchr) diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/stpncpy.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/stpncpy.S new file mode 100644 index 0000000000..a346dd7e28 --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/stpncpy.S @@ -0,0 +1,24 @@ +/* Optimized stpncpy implementation for PowerPC64/POWER7. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define USE_AS_STPNCPY +#include <sysdeps/powerpc/powerpc64/power7/strncpy.S> + +weak_alias (__stpncpy, stpncpy) +libc_hidden_def (__stpncpy) +libc_hidden_builtin_def (stpncpy) diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strcasecmp.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strcasecmp.S new file mode 100644 index 0000000000..e856b8a593 --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strcasecmp.S @@ -0,0 +1,126 @@ +/* Optimized strcasecmp implementation for PowerPC64. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include <locale-defines.h> + +/* int [r3] strcasecmp (const char *s1 [r3], const char *s2 [r4] ) + + or if defined USE_IN_EXTENDED_LOCALE_MODEL: + + int [r3] strcasecmp_l (const char *s1 [r3], const char *s2 [r4], + __locale_t loc [r5]) */ + +#ifndef STRCMP +# define __STRCMP __strcasecmp +# define STRCMP strcasecmp +#endif + +ENTRY (__STRCMP) +#ifndef USE_IN_EXTENDED_LOCALE_MODEL + CALL_MCOUNT 2 +#else + CALL_MCOUNT 3 +#endif + +#define rRTN r3 /* Return value */ +#define rSTR1 r5 /* 1st string */ +#define rSTR2 r4 /* 2nd string */ +#define rLOCARG r5 /* 3rd argument: locale_t */ +#define rCHAR1 r6 /* Byte read from 1st string */ +#define rCHAR2 r7 /* Byte read from 2nd string */ +#define rADDR1 r8 /* Address of tolower(rCHAR1) */ +#define rADDR2 r12 /* Address of tolower(rCHAR2) */ +#define rLWR1 r8 /* Word tolower(rCHAR1) */ +#define rLWR2 r12 /* Word tolower(rCHAR2) */ +#define rTMP r9 +#define rLOC r11 /* Default locale address */ + + cmpd cr7, r3, r4 +#ifndef USE_IN_EXTENDED_LOCALE_MODEL + ld rTMP, __libc_tsd_LOCALE@got@tprel(r2) + add rLOC, rTMP, __libc_tsd_LOCALE@tls + ld rLOC, 0(rLOC) +#else + mr rLOC, rLOCARG +#endif + ld rLOC, LOCALE_CTYPE_TOLOWER(rLOC) + mr rSTR1, rRTN + li rRTN, 0 + beqlr cr7 + + + /* Unrolling loop for POWER: loads are done with 'lbz' plus + offset and string descriptors are only updated in the end + of loop unrolling. */ + + lbz rCHAR1, 0(rSTR1) /* Load char from s1 */ + lbz rCHAR2, 0(rSTR2) /* Load char from s2 */ +L(loop): + cmpdi rCHAR1, 0 /* *s1 == '\0' ? */ + sldi rADDR1, rCHAR1, 2 /* Calculate address for tolower(*s1) */ + sldi rADDR2, rCHAR2, 2 /* Calculate address for tolower(*s2) */ + lwzx rLWR1, rLOC, rADDR1 /* Load tolower(*s1) */ + lwzx rLWR2, rLOC, rADDR2 /* Load tolower(*s2) */ + cmpw cr1, rLWR1, rLWR2 /* r = tolower(*s1) == tolower(*s2) ? 
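+	   In C terms, each unrolled step is the body of this reference
+	   loop (a default-locale sketch; the assembly indexes the
+	   locale's 32-bit tolower table directly):
+
+		#include <ctype.h>
+
+		static int
+		strcasecmp_sketch (const char *s1, const char *s2)
+		{
+		  for (;;)
+		    {
+		      unsigned char c1 = *s1++, c2 = *s2++;
+		      int l1 = tolower (c1), l2 = tolower (c2);
+		      if (c1 == '\0' || l1 != l2)	// NUL or mismatch ends it
+			return l1 - l2;
+		    }
+		}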
*/ + crorc 4*cr1+eq,eq,4*cr1+eq /* (*s1 != '\0') || (r == 1) */ + beq cr1, L(done) + lbz rCHAR1, 1(rSTR1) + lbz rCHAR2, 1(rSTR2) + cmpdi rCHAR1, 0 + sldi rADDR1, rCHAR1, 2 + sldi rADDR2, rCHAR2, 2 + lwzx rLWR1, rLOC, rADDR1 + lwzx rLWR2, rLOC, rADDR2 + cmpw cr1, rLWR1, rLWR2 + crorc 4*cr1+eq,eq,4*cr1+eq + beq cr1, L(done) + lbz rCHAR1, 2(rSTR1) + lbz rCHAR2, 2(rSTR2) + cmpdi rCHAR1, 0 + sldi rADDR1, rCHAR1, 2 + sldi rADDR2, rCHAR2, 2 + lwzx rLWR1, rLOC, rADDR1 + lwzx rLWR2, rLOC, rADDR2 + cmpw cr1, rLWR1, rLWR2 + crorc 4*cr1+eq,eq,4*cr1+eq + beq cr1, L(done) + lbz rCHAR1, 3(rSTR1) + lbz rCHAR2, 3(rSTR2) + cmpdi rCHAR1, 0 + /* Increment both string descriptors */ + addi rSTR1, rSTR1, 4 + addi rSTR2, rSTR2, 4 + sldi rADDR1, rCHAR1, 2 + sldi rADDR2, rCHAR2, 2 + lwzx rLWR1, rLOC, rADDR1 + lwzx rLWR2, rLOC, rADDR2 + cmpw cr1, rLWR1, rLWR2 + crorc 4*cr1+eq,eq,4*cr1+eq + beq cr1,L(done) + lbz rCHAR1, 0(rSTR1) /* Load char from s1 */ + lbz rCHAR2, 0(rSTR2) /* Load char from s2 */ + b L(loop) +L(done): + subf r0, rLWR2, rLWR1 + extsw rRTN, r0 + blr +END (__STRCMP) + +weak_alias (__STRCMP, STRCMP) +libc_hidden_builtin_def (__STRCMP) diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strcasecmp_l.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strcasecmp_l.S new file mode 100644 index 0000000000..c13c4ebcb8 --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strcasecmp_l.S @@ -0,0 +1,5 @@ +#define USE_IN_EXTENDED_LOCALE_MODEL +#define STRCMP strcasecmp_l +#define __STRCMP __strcasecmp_l + +#include "strcasecmp.S" diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strchr.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strchr.S new file mode 100644 index 0000000000..a18e2e101c --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strchr.S @@ -0,0 +1,230 @@ +/* Optimized strchr implementation for PowerPC64/POWER7 using cmpb insn. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Luis Machado <luisgpm@br.ibm.com>. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#ifndef STRCHR +# define STRCHR strchr +#endif + +/* int [r3] strchr (char *s [r3], int c [r4]) */ + .machine power7 +ENTRY (STRCHR) + CALL_MCOUNT 2 + dcbt 0,r3 + clrrdi r8,r3,3 /* Align the address to doubleword boundary. */ + cmpdi cr7,r4,0 + ld r12,0(r8) /* Load doubleword from memory. */ + li r0,0 /* Doubleword with null chars to use + with cmpb. */ + + rlwinm r6,r3,3,26,28 /* Calculate padding. */ + + beq cr7,L(null_match) + + /* Replicate byte to doubleword. */ + insrdi r4,r4,8,48 + insrdi r4,r4,16,32 + insrdi r4,r4,32,0 + + /* Now r4 has a doubleword of c bytes and r0 has + a doubleword of null bytes. */ + + cmpb r10,r12,r4 /* Compare each byte against c byte. */ + cmpb r11,r12,r0 /* Compare each byte against null byte. 
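+	   Two masks are kept, one for matches of C and one for the
+	   terminator; whichever flags the lower byte decides the result.
+	   A C sketch (byte_match as in the memrchr sketch; little-endian
+	   bit scan assumed):
+
+		#include <stdint.h>
+		#include <string.h>
+		#include <stddef.h>
+
+		static char *
+		strchr_sketch (const char *s, int c)
+		{
+		  const unsigned char *p = (const unsigned char *) s;
+		  uint64_t pat = 0x0101010101010101ULL * (unsigned char) c;
+
+		  for (; ((uintptr_t) p & 7) != 0; p++)
+		    {
+		      if (*p == (unsigned char) c)
+			return (char *) p;
+		      if (*p == '\0')
+			return NULL;
+		    }
+		  for (;; p += 8)
+		    {
+		      uint64_t w, mc, mz;
+		      memcpy (&w, p, 8);
+		      mc = byte_match (w, pat);	// matches of C
+		      mz = byte_match (w, 0);	// matches of NUL
+		      if ((mc | mz) != 0)
+			{
+			  int ic = mc ? __builtin_ctzll (mc) / 8 : 8;
+			  int iz = mz ? __builtin_ctzll (mz) / 8 : 8;
+			  return ic <= iz ? (char *) p + ic : NULL;
+			}
+		    }
+		}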
*/ + + /* Move the doublewords left and right to discard the bits that are + not part of the string and bring them back as zeros. */ +#ifdef __LITTLE_ENDIAN__ + srd r10,r10,r6 + srd r11,r11,r6 + sld r10,r10,r6 + sld r11,r11,r6 +#else + sld r10,r10,r6 + sld r11,r11,r6 + srd r10,r10,r6 + srd r11,r11,r6 +#endif + or r5,r10,r11 /* OR the results to speed things up. */ + cmpdi cr7,r5,0 /* If r5 == 0, no c or null bytes + have been found. */ + bne cr7,L(done) + + mtcrf 0x01,r8 + + /* Are we now aligned to a doubleword boundary? If so, skip to + the main loop. Otherwise, go through the alignment code. */ + + bt 28,L(loop) + + /* Handle WORD2 of pair. */ + ldu r12,8(r8) + cmpb r10,r12,r4 + cmpb r11,r12,r0 + or r5,r10,r11 + cmpdi cr7,r5,0 + bne cr7,L(done) + b L(loop) /* We branch here (rather than falling through) + to skip the nops due to heavy alignment + of the loop below. */ + + .p2align 5 +L(loop): + /* Load two doublewords, compare and merge in a + single register for speed. This is an attempt + to speed up the null-checking process for bigger strings. */ + ld r12,8(r8) + ldu r9,16(r8) + cmpb r10,r12,r4 + cmpb r11,r12,r0 + cmpb r6,r9,r4 + cmpb r7,r9,r0 + or r12,r10,r11 + or r9,r6,r7 + or r5,r12,r9 + cmpdi cr7,r5,0 + beq cr7,L(loop) + + /* OK, one (or both) of the doublewords contains a c/null byte. Check + the first doubleword and decrement the address in case the first + doubleword really contains a c/null byte. */ + + cmpdi cr6,r12,0 + addi r8,r8,-8 + bne cr6,L(done) + + /* The c/null byte must be in the second doubleword. Adjust the + address again and move the result of cmpb to r10 so we can calculate + the pointer. */ + + mr r10,r6 + mr r11,r7 + addi r8,r8,8 + + /* r10/r11 have the output of the cmpb instructions, that is, + 0xff in the same position as the c/null byte in the original + doubleword from the string. Use that to calculate the pointer. */ +L(done): +#ifdef __LITTLE_ENDIAN__ + addi r3,r10,-1 + andc r3,r3,r10 + popcntd r0,r3 + addi r4,r11,-1 + andc r4,r4,r11 + cmpld cr7,r3,r4 + bgt cr7,L(no_match) +#else + cntlzd r0,r10 /* Count leading zeros before c matches. */ + cmpld cr7,r11,r10 + bgt cr7,L(no_match) +#endif + srdi r0,r0,3 /* Convert leading zeros to bytes. */ + add r3,r8,r0 /* Return address of the matching c byte + or null in case c was not found. */ + blr + + .align 4 +L(no_match): + li r3,0 + blr + +/* We are here because strchr was called with a null byte. */ + .align 4 +L(null_match): + /* r0 has a doubleword of null bytes. */ + + cmpb r5,r12,r0 /* Compare each byte against null bytes. */ + + /* Move the doublewords left and right to discard the bits that are + not part of the string and bring them back as zeros. */ +#ifdef __LITTLE_ENDIAN__ + srd r5,r5,r6 + sld r5,r5,r6 +#else + sld r5,r5,r6 + srd r5,r5,r6 +#endif + cmpdi cr7,r5,0 /* If r10 == 0, no c or null bytes + have been found. */ + bne cr7,L(done_null) + + mtcrf 0x01,r8 + + /* Are we now aligned to a quadword boundary? If so, skip to + the main loop. Otherwise, go through the alignment code. */ + + bt 28,L(loop_null) + + /* Handle WORD2 of pair. */ + ldu r12,8(r8) + cmpb r5,r12,r0 + cmpdi cr7,r5,0 + bne cr7,L(done_null) + b L(loop_null) /* We branch here (rather than falling through) + to skip the nops due to heavy alignment + of the loop below. */ + + /* Main loop to look for the end of the string. Since it's a + small loop (< 8 instructions), align it to 32-bytes. */ + .p2align 5 +L(loop_null): + /* Load two doublewords, compare and merge in a + single register for speed. 
This is an attempt + to speed up the null-checking process for bigger strings. */ + ld r12,8(r8) + ldu r11,16(r8) + cmpb r5,r12,r0 + cmpb r10,r11,r0 + or r6,r5,r10 + cmpdi cr7,r6,0 + beq cr7,L(loop_null) + + /* OK, one (or both) of the doublewords contains a null byte. Check + the first doubleword and decrement the address in case the first + doubleword really contains a null byte. */ + + cmpdi cr6,r5,0 + addi r8,r8,-8 + bne cr6,L(done_null) + + /* The null byte must be in the second doubleword. Adjust the address + again and move the result of cmpb to r10 so we can calculate the + pointer. */ + + mr r5,r10 + addi r8,r8,8 + + /* r5 has the output of the cmpb instruction, that is, it contains + 0xff in the same position as the null byte in the original + doubleword from the string. Use that to calculate the pointer. */ +L(done_null): +#ifdef __LITTLE_ENDIAN__ + addi r0,r5,-1 + andc r0,r0,r5 + popcntd r0,r0 +#else + cntlzd r0,r5 /* Count leading zeros before the match. */ +#endif + srdi r0,r0,3 /* Convert leading zeros to bytes. */ + add r3,r8,r0 /* Return address of the matching null byte. */ + blr +END (STRCHR) +weak_alias (strchr, index) +libc_hidden_builtin_def (strchr) diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strchrnul.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strchrnul.S new file mode 100644 index 0000000000..27bc1f0682 --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strchrnul.S @@ -0,0 +1,131 @@ +/* Optimized strchrnul implementation for PowerPC64/POWER7 using cmpb insn. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Luis Machado <luisgpm@br.ibm.com>. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#ifndef STRCHRNUL +# define STRCHRNUL __strchrnul +#endif +/* int [r3] strchrnul (char *s [r3], int c [r4]) */ + .machine power7 +ENTRY (STRCHRNUL) + CALL_MCOUNT 2 + dcbt 0,r3 + clrrdi r8,r3,3 /* Align the address to doubleword boundary. */ + + /* Replicate byte to doubleword. */ + insrdi r4,r4,8,48 + insrdi r4,r4,16,32 + insrdi r4,r4,32,0 + + rlwinm r6,r3,3,26,28 /* Calculate padding. */ + ld r12,0(r8) /* Load doubleword from memory. */ + li r0,0 /* Doubleword with null chars to use + with cmpb. */ + + /* Now r4 has a doubleword of c bytes and r0 has + a doubleword of null bytes. */ + + cmpb r10,r12,r0 /* Compare each byte against c byte. */ + cmpb r9,r12,r4 /* Compare each byte against null byte. */ + + /* Move the doublewords left and right to discard the bits that are + not part of the string and to bring them back as zeros. */ +#ifdef __LITTLE_ENDIAN__ + srd r10,r10,r6 + srd r9,r9,r6 + sld r10,r10,r6 + sld r9,r9,r6 +#else + sld r10,r10,r6 + sld r9,r9,r6 + srd r10,r10,r6 + srd r9,r9,r6 +#endif + or r5,r9,r10 /* OR the results to speed things up. */ + cmpdi cr7,r5,0 /* If r5 == 0, no c or null bytes + have been found. 
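+	   Unlike strchr, a miss on C still stops at the terminator and
+	   returns its address, so a single merged mask is enough.  In C
+	   (byte_match as in the memrchr sketch; little-endian bit scan
+	   assumed):
+
+		#include <stdint.h>
+		#include <string.h>
+
+		static char *
+		strchrnul_sketch (const char *s, int c)
+		{
+		  const unsigned char *p = (const unsigned char *) s;
+		  uint64_t pat = 0x0101010101010101ULL * (unsigned char) c;
+
+		  for (; ((uintptr_t) p & 7) != 0; p++)
+		    if (*p == (unsigned char) c || *p == '\0')
+		      return (char *) p;
+		  for (;; p += 8)
+		    {
+		      uint64_t w, m;
+		      memcpy (&w, p, 8);
+		      m = byte_match (w, pat) | byte_match (w, 0);
+		      if (m != 0)	// first C or NUL byte, whichever comes first
+			return (char *) p + __builtin_ctzll (m) / 8;
+		    }
+		}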
*/
+	bne	cr7,L(done)
+
+	mtcrf	0x01,r8
+
+	/* Are we now aligned to a quadword boundary?  If so, skip to
+	   the main loop.  Otherwise, go through the alignment code.  */
+
+	bt	28,L(loop)
+
+	/* Handle DWORD2 of pair.  */
+	ldu	r12,8(r8)
+	cmpb	r10,r12,r0
+	cmpb	r9,r12,r4
+	or	r5,r9,r10
+	cmpdi	cr7,r5,0
+	bne	cr7,L(done)
+	b	L(loop)	      /* We branch here (rather than falling through)
+				 to skip the nops due to heavy alignment
+				 of the loop below.  */
+
+	.p2align  5
+L(loop):
+	/* Load two doublewords, compare and merge in a
+	   single register for speed.  This is an attempt
+	   to speed up the null-checking process for bigger strings.  */
+	ld	r12,8(r8)
+	ldu	r11,16(r8)
+	cmpb	r10,r12,r0
+	cmpb	r9,r12,r4
+	cmpb	r6,r11,r0
+	cmpb	r7,r11,r4
+	or	r5,r9,r10
+	or	r10,r6,r7
+	or	r11,r5,r10
+	cmpdi	cr7,r11,0
+	beq	cr7,L(loop)
+
+	/* OK, one (or both) of the doublewords contains a c/null byte.  Check
+	   the first doubleword and decrement the address in case the first
+	   doubleword really contains a c/null byte.  */
+
+	cmpdi	cr6,r5,0
+	addi	r8,r8,-8
+	bne	cr6,L(done)
+
+	/* The c/null byte must be in the second doubleword.  Adjust the
+	   address again and move the result of cmpb to r5 so we can calculate
+	   the pointer.  */
+	mr	r5,r10
+	addi	r8,r8,8
+
+	/* r5 has the output of the cmpb instruction, that is, it contains
+	   0xff in the same position as the c/null byte in the original
+	   doubleword from the string.  Use that to calculate the pointer.  */
+L(done):
+#ifdef __LITTLE_ENDIAN__
+	addi	r0,r5,-1
+	andc	r0,r0,r5
+	popcntd	r0,r0
+#else
+	cntlzd	r0,r5	      /* Count leading zeros before the match.  */
+#endif
+	srdi	r0,r0,3	      /* Convert leading zeros to bytes.  */
+	add	r3,r8,r0      /* Return address of matching c/null byte.  */
+	blr
+END (STRCHRNUL)
+weak_alias (STRCHRNUL, strchrnul)
+libc_hidden_builtin_def (STRCHRNUL)
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strcmp.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strcmp.S
new file mode 100644
index 0000000000..14e14f457e
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strcmp.S
@@ -0,0 +1,168 @@
+/* Optimized strcmp implementation for Power7 using the 'cmpb' instruction.
+   Copyright (C) 2014-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* The optimization is achieved here through the cmpb instruction.
+   8-byte-aligned strings are processed with doubleword comparisons,
+   and unaligned strings are handled with a loop-unrolling technique.  */
+
+#include <sysdep.h>
+
+#ifndef STRCMP
+# define STRCMP strcmp
+#endif
+
+/* int [r3] strcmp (const char *s1 [r3], const char *s2 [r4]) */
+
+	.machine power7
+EALIGN (STRCMP, 4, 0)
+	CALL_MCOUNT 2
+
+	or	r9, r3, r4
+	rldicl.	r10, r9, 0, 61	/* Are s1 and s2 both 8-byte aligned?
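+	   If so, the fast path compares one doubleword at a time, in C
+	   terms (byte_match as in the memrchr sketch; like the assembly,
+	   this may read up to 7 bytes past the terminator, but never past
+	   the aligned doubleword that contains it):
+
+		#include <stdint.h>
+		#include <string.h>
+
+		static int
+		strcmp_aligned_sketch (const char *s1, const char *s2)
+		{
+		  // Assumes both pointers are 8-byte aligned, as tested above.
+		  for (;; s1 += 8, s2 += 8)
+		    {
+		      uint64_t w1, w2;
+		      memcpy (&w1, s1, 8);
+		      memcpy (&w2, s2, 8);
+		      if (w1 != w2 || byte_match (w1, 0) != 0)
+			break;	// a difference or a terminator in this doubleword
+		    }
+		  // Settle the result bytewise; the assembly instead isolates
+		  // the differing/terminating byte with cntlzd and shifts.
+		  while (*s1 != '\0'
+			 && *(const unsigned char *) s1 == *(const unsigned char *) s2)
+		    s1++, s2++;
+		  return *(const unsigned char *) s1 - *(const unsigned char *) s2;
+		}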
*/ + bne cr0, L(process_unaligned_bytes) + li r5, 0 + + .align 4 +/* process input parameters on double word aligned boundary */ +L(unrollDword): + ld r8,0(r3) + ld r10,0(r4) + cmpb r7,r8,r5 + cmpdi cr7,r7,0 + mr r9,r7 + bne cr7,L(null_found) + cmpld cr7,r8,r10 + bne cr7,L(different) + + ld r8,8(r3) + ld r10,8(r4) + cmpb r7,r8,r5 + cmpdi cr7,r7,0 + mr r9,r7 + bne cr7,L(null_found) + cmpld cr7,r8,r10 + bne cr7,L(different) + + ld r8,16(r3) + ld r10,16(r4) + cmpb r7,r8,r5 + cmpdi cr7,r7,0 + mr r9,r7 + bne cr7,L(null_found) + cmpld cr7,r8,r10 + bne cr7,L(different) + + ld r8,24(r3) + ld r10,24(r4) + cmpb r7,r8,r5 + cmpdi cr7,r7,0 + mr r9,r7 + bne cr7,L(null_found) + cmpld cr7,r8,r10 + bne cr7,L(different) + + addi r3, r3, 32 + addi r4, r4, 32 + beq cr7, L(unrollDword) + + .align 4 +L(null_found): +#ifdef __LITTLE_ENDIAN__ + neg r7,r9 + and r9,r9,r7 + li r7,-1 + cntlzd r9,r9 + subfic r9,r9,71 + sld r9,r7,r9 +#else + cntlzd r9,r9 + li r7,-1 + addi r9,r9,8 + srd r9,r7,r9 +#endif + or r8,r8,r9 + or r10,r10,r9 + +L(different): + cmpb r9,r8,r10 +#ifdef __LITTLE_ENDIAN__ + addi r7,r9,1 + andc r9,r7,r9 + cntlzd r9,r9 + subfic r9,r9,63 +#else + not r9,r9 + cntlzd r9,r9 + subfic r9,r9,56 +#endif + srd r3,r8,r9 + srd r10,r10,r9 + rldicl r10,r10,0,56 + rldicl r3,r3,0,56 + subf r3,r10,r3 + blr + + .align 4 +L(process_unaligned_bytes): + lbz r9, 0(r3) /* load byte from s1 */ + lbz r10, 0(r4) /* load byte from s2 */ + cmpdi cr7, r9, 0 /* compare *s1 with NULL */ + beq cr7, L(diffOfNULL) /* if *s1 is NULL , return *s1 - *s2 */ + cmplw cr7, r9, r10 /* compare *s1 and *s2 */ + bne cr7, L(ComputeDiff) /* branch to compute difference and return */ + + lbz r9, 1(r3) /* load next byte from s1 */ + lbz r10, 1(r4) /* load next byte from s2 */ + cmpdi cr7, r9, 0 /* compare *s1 with NULL */ + beq cr7, L(diffOfNULL) /* if *s1 is NULL , return *s1 - *s2 */ + cmplw cr7, r9, r10 /* compare *s1 and *s2 */ + bne cr7, L(ComputeDiff) /* branch to compute difference and return */ + + lbz r9, 2(r3) /* unroll 3rd byte here */ + lbz r10, 2(r4) + cmpdi cr7, r9, 0 + beq cr7, L(diffOfNULL) + cmplw cr7, r9, r10 + bne 7, L(ComputeDiff) + + lbz r9, 3(r3) /* unroll 4th byte now */ + lbz r10, 3(r4) + addi r3, r3, 4 /* increment s1 by unroll factor */ + cmpdi cr7, r9, 0 + cmplw cr6, 9, r10 + beq cr7, L(diffOfNULL) + addi r4, r4, 4 /* increment s2 by unroll factor */ + beq cr6, L(process_unaligned_bytes) /* unroll byte processing */ + + .align 4 +L(ComputeDiff): + extsw r9, r9 + subf r10, r10, r9 /* compute s1 - s2 */ + extsw r3, r10 + blr /* return */ + + .align 4 +L(diffOfNULL): + li r9, 0 + subf r10, r10, r9 /* compute s1 - s2 */ + extsw r3, r10 /* sign extend result */ + blr /* return */ + +END (STRCMP) +libc_hidden_builtin_def (strcmp) diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strlen.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strlen.S new file mode 100644 index 0000000000..63848c460c --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strlen.S @@ -0,0 +1,107 @@ +/* Optimized strlen implementation for PowerPC64/POWER7 using cmpb insn. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Luis Machado <luisgpm@br.ibm.com>. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
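
   (A note on the strcmp.S byte path that ends just above: the
   unrolled L(process_unaligned_bytes) loop is behaviorally the plain
   byte-wise compare below; a sketch, not the glibc code:

     // Returns <0, 0 or >0; bytes compare as unsigned chars.
     static int strcmp_bytewise (const char *s1, const char *s2)
     {
       unsigned char c1, c2;
       do
         {
           c1 = (unsigned char) *s1++;
           c2 = (unsigned char) *s2++;
           if (c1 != c2)
             return (int) c1 - (int) c2;
         }
       while (c1 != '\0');
       return 0;
     }

   The assembly merely unrolls this four bytes per iteration.)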
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* size_t [r3] strlen (char *s [r3])  */
+
+#ifndef STRLEN
+# define STRLEN strlen
+#endif
+	.machine  power7
+ENTRY (STRLEN)
+	CALL_MCOUNT 1
+	dcbt	0,r3
+	clrrdi	r4,r3,3	      /* Align the address to doubleword boundary.  */
+	rlwinm	r6,r3,3,26,28 /* Calculate padding.  */
+	li	r0,0	      /* Doubleword with null chars to use
+				 with cmpb.  */
+	li	r5,-1	      /* MASK = 0xffffffffffffffff.  */
+	ld	r12,0(r4)     /* Load doubleword from memory.  */
+#ifdef __LITTLE_ENDIAN__
+	sld	r5,r5,r6
+#else
+	srd	r5,r5,r6      /* MASK = MASK >> padding.  */
+#endif
+	orc	r9,r12,r5     /* Mask bits that are not part of the string.  */
+	cmpb	r10,r9,r0     /* Check for null bytes in DWORD1.  */
+	cmpdi	cr7,r10,0     /* If r10 == 0, no null's have been found.  */
+	bne	cr7,L(done)
+
+	mtcrf	0x01,r4
+
+	/* Are we now aligned to a quadword boundary?  If so, skip to
+	   the main loop.  Otherwise, go through the alignment code.  */
+
+	bt	28,L(loop)
+
+	/* Handle DWORD2 of pair.  */
+	ldu	r12,8(r4)
+	cmpb	r10,r12,r0
+	cmpdi	cr7,r10,0
+	bne	cr7,L(done)
+
+	/* Main loop to look for the end of the string.  Since it's a
+	   small loop (< 8 instructions), align it to 32-bytes.  */
+	.p2align  5
+L(loop):
+	/* Load two doublewords, compare and merge in a
+	   single register for speed.  This is an attempt
+	   to speed up the null-checking process for bigger strings.  */
+
+	ld	r12, 8(r4)
+	ldu	r11, 16(r4)
+	cmpb	r10,r12,r0
+	cmpb	r9,r11,r0
+	or	r8,r9,r10     /* Merge everything in one doubleword.  */
+	cmpdi	cr7,r8,0
+	beq	cr7,L(loop)
+
+	/* OK, one (or both) of the doublewords contains a null byte.  Check
+	   the first doubleword and decrement the address in case the first
+	   doubleword really contains a null byte.  */
+
+	cmpdi	cr6,r10,0
+	addi	r4,r4,-8
+	bne	cr6,L(done)
+
+	/* The null byte must be in the second doubleword.  Adjust the address
+	   again and move the result of cmpb to r10 so we can calculate the
+	   length.  */
+
+	mr	r10,r9
+	addi	r4,r4,8
+
+	/* r10 has the output of the cmpb instruction, that is, it contains
+	   0xff in the same position as the null byte in the original
+	   doubleword from the string.  Use that to calculate the length.  */
L(done):
+#ifdef __LITTLE_ENDIAN__
+	addi	r9, r10, -1   /* Form a mask from trailing zeros.  */
+	andc	r9, r9, r10
+	popcntd	r0, r9	      /* Count the bits in the mask.  */
+#else
+	cntlzd	r0,r10	      /* Count leading zeros before the match.  */
+#endif
+	subf	r5,r3,r4
+	srdi	r0,r0,3	      /* Convert leading/trailing zeros to bytes.  */
+	add	r3,r5,r0      /* Compute final length.  */
+	blr
+END (STRLEN)
+libc_hidden_builtin_def (strlen)
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strncmp.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strncmp.S
new file mode 100644
index 0000000000..d53b31be8e
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strncmp.S
@@ -0,0 +1,227 @@
+/* Optimized strncmp implementation for POWER7/PowerPC64.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
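
   (A note on strlen.S above: both endian variants reduce the cmpb
   result mask, which holds 0xff in each matching byte, to a byte
   index.  A sketch of the two reductions using GCC-style builtins,
   assuming mask != 0 and hypothetical helper names:

     #include <stdint.h>

     // Little-endian: bits below the lowest set bit, popcounted.
     static inline unsigned first_match_byte_le (uint64_t mask)
     {
       return (unsigned) __builtin_popcountll ((mask - 1) & ~mask) >> 3;
     }

     // Big-endian: leading zero bits, divided by eight.
     static inline unsigned first_match_byte_be (uint64_t mask)
     {
       return (unsigned) __builtin_clzll (mask) >> 3;
     }

   __builtin_clzll is undefined for zero, which is why L(done) is only
   reached with a nonzero mask.)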
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#ifndef STRNCMP +# define STRNCMP strncmp +#endif + +/* See strlen.s for comments on how the end-of-string testing works. */ + +/* int [r3] strncmp (const char *s1 [r3], + const char *s2 [r4], + size_t size [r5]) */ + +EALIGN (STRNCMP,5,0) + CALL_MCOUNT 3 + +#define rTMP2 r0 +#define rRTN r3 +#define rSTR1 r3 /* first string arg */ +#define rSTR2 r4 /* second string arg */ +#define rN r5 /* max string length */ +#define rWORD1 r6 /* current word in s1 */ +#define rWORD2 r7 /* current word in s2 */ +#define rWORD3 r10 +#define rWORD4 r11 +#define rFEFE r8 /* constant 0xfefefefefefefeff (-0x0101010101010101) */ +#define r7F7F r9 /* constant 0x7f7f7f7f7f7f7f7f */ +#define rNEG r10 /* ~(word in s1 | 0x7f7f7f7f7f7f7f7f) */ +#define rBITDIF r11 /* bits that differ in s1 & s2 words */ +#define rTMP r12 + + dcbt 0,rSTR1 + nop + or rTMP,rSTR2,rSTR1 + lis r7F7F,0x7f7f + dcbt 0,rSTR2 + nop + clrldi. rTMP,rTMP,61 + cmpldi cr1,rN,0 + lis rFEFE,-0x101 + bne L(unaligned) +/* We are doubleword aligned so set up for two loops. first a double word + loop, then fall into the byte loop if any residual. */ + srdi. rTMP,rN,3 + clrldi rN,rN,61 + addi rFEFE,rFEFE,-0x101 + addi r7F7F,r7F7F,0x7f7f + cmpldi cr1,rN,0 + beq L(unaligned) + + mtctr rTMP + ld rWORD1,0(rSTR1) + ld rWORD2,0(rSTR2) + sldi rTMP,rFEFE,32 + insrdi r7F7F,r7F7F,32,0 + add rFEFE,rFEFE,rTMP + b L(g1) + +L(g0): + ldu rWORD1,8(rSTR1) + bne cr1,L(different) + ldu rWORD2,8(rSTR2) +L(g1): add rTMP,rFEFE,rWORD1 + nor rNEG,r7F7F,rWORD1 + bdz L(tail) + and. rTMP,rTMP,rNEG + cmpd cr1,rWORD1,rWORD2 + beq L(g0) + +/* OK. We've hit the end of the string. We need to be careful that + we don't compare two strings as different because of gunk beyond + the end of the strings... */ + +#ifdef __LITTLE_ENDIAN__ +L(endstring): + addi rTMP2, rTMP, -1 + beq cr1, L(equal) + andc rTMP2, rTMP2, rTMP + rldimi rTMP2, rTMP2, 1, 0 + and rWORD2, rWORD2, rTMP2 /* Mask off gunk. */ + and rWORD1, rWORD1, rTMP2 + cmpd cr1, rWORD1, rWORD2 + beq cr1, L(equal) + cmpb rBITDIF, rWORD1, rWORD2 /* 0xff on equal bytes. */ + addi rNEG, rBITDIF, 1 + orc rNEG, rNEG, rBITDIF /* 0's below LS differing byte. */ + sldi rNEG, rNEG, 8 /* 1's above LS differing byte. */ + andc rWORD1, rWORD1, rNEG /* mask off MS bytes. */ + andc rWORD2, rWORD2, rNEG + xor. rBITDIF, rWORD1, rWORD2 + sub rRTN, rWORD1, rWORD2 + blt L(highbit) + sradi rRTN, rRTN, 63 /* must return an int. */ + ori rRTN, rRTN, 1 + blr +L(equal): + li rRTN, 0 + blr + +L(different): + ld rWORD1, -8(rSTR1) + cmpb rBITDIF, rWORD1, rWORD2 /* 0xff on equal bytes. */ + addi rNEG, rBITDIF, 1 + orc rNEG, rNEG, rBITDIF /* 0's below LS differing byte. */ + sldi rNEG, rNEG, 8 /* 1's above LS differing byte. */ + andc rWORD1, rWORD1, rNEG /* mask off MS bytes. */ + andc rWORD2, rWORD2, rNEG + xor. 
rBITDIF, rWORD1, rWORD2 + sub rRTN, rWORD1, rWORD2 + blt L(highbit) + sradi rRTN, rRTN, 63 + ori rRTN, rRTN, 1 + blr +L(highbit): + sradi rRTN, rWORD2, 63 + ori rRTN, rRTN, 1 + blr + +#else +L(endstring): + and rTMP,r7F7F,rWORD1 + beq cr1,L(equal) + add rTMP,rTMP,r7F7F + xor. rBITDIF,rWORD1,rWORD2 + andc rNEG,rNEG,rTMP + blt L(highbit) + cntlzd rBITDIF,rBITDIF + cntlzd rNEG,rNEG + addi rNEG,rNEG,7 + cmpd cr1,rNEG,rBITDIF + sub rRTN,rWORD1,rWORD2 + blt cr1,L(equal) + sradi rRTN,rRTN,63 /* must return an int. */ + ori rRTN,rRTN,1 + blr +L(equal): + li rRTN,0 + blr + +L(different): + ld rWORD1,-8(rSTR1) + xor. rBITDIF,rWORD1,rWORD2 + sub rRTN,rWORD1,rWORD2 + blt L(highbit) + sradi rRTN,rRTN,63 + ori rRTN,rRTN,1 + blr +L(highbit): + sradi rRTN,rWORD2,63 + ori rRTN,rRTN,1 + blr +#endif + +/* Oh well. In this case, we just do a byte-by-byte comparison. */ + .align 4 +L(tail): + and. rTMP,rTMP,rNEG + cmpd cr1,rWORD1,rWORD2 + bne L(endstring) + addi rSTR1,rSTR1,8 + bne cr1,L(different) + addi rSTR2,rSTR2,8 + cmpldi cr1,rN,0 +L(unaligned): + mtctr rN + ble cr1,L(ux) +L(uz): + lbz rWORD1,0(rSTR1) + lbz rWORD2,0(rSTR2) + .align 4 +L(u1): + cmpdi cr1,rWORD1,0 + bdz L(u4) + cmpd rWORD1,rWORD2 + beq cr1,L(u4) + bne L(u4) + lbzu rWORD3,1(rSTR1) + lbzu rWORD4,1(rSTR2) + cmpdi cr1,rWORD3,0 + bdz L(u3) + cmpd rWORD3,rWORD4 + beq cr1,L(u3) + bne L(u3) + lbzu rWORD1,1(rSTR1) + lbzu rWORD2,1(rSTR2) + cmpdi cr1,rWORD1,0 + bdz L(u4) + cmpd rWORD1,rWORD2 + beq cr1,L(u4) + bne L(u4) + lbzu rWORD3,1(rSTR1) + lbzu rWORD4,1(rSTR2) + cmpdi cr1,rWORD3,0 + bdz L(u3) + cmpd rWORD3,rWORD4 + beq cr1,L(u3) + bne L(u3) + lbzu rWORD1,1(rSTR1) + lbzu rWORD2,1(rSTR2) + b L(u1) + +L(u3): sub rRTN,rWORD3,rWORD4 + blr +L(u4): sub rRTN,rWORD1,rWORD2 + blr +L(ux): + li rRTN,0 + blr +END (STRNCMP) +libc_hidden_builtin_def (strncmp) diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strncpy.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strncpy.S new file mode 100644 index 0000000000..0224f74898 --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strncpy.S @@ -0,0 +1,722 @@ +/* Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +/* Implements the functions + + char * [r3] strncpy (char *dst [r3], const char *src [r4], size_t n [r5]) + + AND + + char * [r3] stpncpy (char *dst [r3], const char *src [r4], size_t n [r5]) + + The algorithm is as follows: + > if src and dest are 8 byte aligned, perform double word copy + else + > copy byte by byte on unaligned addresses. + + The aligned comparison are made using cmpb instructions. */ + +/* The focus on optimization for performance improvements are as follows: + 1. data alignment [gain from aligned memory access on read/write] + 2. POWER7 gains performance with loop unrolling/unwinding + [gain by reduction of branch penalty]. + 3. 
The final pad with null bytes is done by calling an optimized + memset. */ + +#ifdef USE_AS_STPNCPY +# ifndef STPNCPY +# define FUNC_NAME __stpncpy +# else +# define FUNC_NAME STPNCPY +# endif +#else +# ifndef STRNCPY +# define FUNC_NAME strncpy +# else +# define FUNC_NAME STRNCPY +# endif +#endif /* !USE_AS_STPNCPY */ + +#define FRAMESIZE (FRAME_MIN_SIZE+32) + +#ifndef MEMSET +/* For builds with no IFUNC support, local calls should be made to internal + GLIBC symbol (created by libc_hidden_builtin_def). */ +# ifdef SHARED +# define MEMSET __GI_memset +# else +# define MEMSET memset +# endif +#endif + + .machine power7 +EALIGN(FUNC_NAME, 4, 0) + CALL_MCOUNT 3 + + mflr r0 /* load link register LR to r0 */ + or r10, r3, r4 /* to verify source and destination */ + rldicl. r8, r10, 0, 61 /* is double word aligned .. ? */ + + std r19, -8(r1) /* save callers register , r19 */ + std r18, -16(r1) /* save callers register , r18 */ + std r0, 16(r1) /* store the link register */ + stdu r1, -FRAMESIZE(r1) /* create the stack frame */ + + mr r9, r3 /* save r3 into r9 for use */ + mr r18, r3 /* save r3 for retCode of strncpy */ + bne 0, L(unaligned) + +L(aligned): + srdi r11, r5, 3 /* compute count for CTR ; count = n/8 */ + cmpldi cr7, r11, 3 /* if count > 4 ; perform unrolling 4 times */ + ble 7, L(update1) + + ld r10, 0(r4) /* load doubleWord from src */ + cmpb r8, r10, r8 /* compare src with NULL ,we read just now */ + cmpdi cr7, r8, 0 /* if cmpb returned NULL ; we continue */ + bne cr7, L(update3) + + std r10, 0(r3) /* copy doubleword at offset=0 */ + ld r10, 8(r4) /* load next doubleword from offset=8 */ + cmpb r8, r10, r8 /* compare src with NULL , we read just now */ + cmpdi cr7, r8, 0 /* if cmpb returned NULL ; we continue */ + bne 7,L(HopBy8) + + addi r8, r11, -4 + mr r7, r3 + srdi r8, r8, 2 + mr r6, r4 + addi r8, r8, 1 + li r12, 0 + mtctr r8 + b L(dwordCopy) + + .p2align 4 +L(dWordUnroll): + std r8, 16(r9) + ld r8, 24(r4) /* load dword,perform loop unrolling again */ + cmpb r10, r8, r10 + cmpdi cr7, r10, 0 + bne cr7, L(HopBy24) + + std r8, 24(r7) /* copy dword at offset=24 */ + addi r9, r9, 32 + addi r4, r4, 32 + bdz L(leftDwords) /* continue with loop on counter */ + + ld r3, 32(r6) + cmpb r8, r3, r10 + cmpdi cr7, r8, 0 + bne cr7, L(update2) + + std r3, 32(r7) + ld r10, 40(r6) + cmpb r8, r10, r8 + cmpdi cr7, r8, 0 + bne cr7, L(HopBy40) + + mr r6, r4 /* update values */ + mr r7, r9 + mr r11, r0 + mr r5, r19 + +L(dwordCopy): + std r10, 8(r9) /* copy dword at offset=8 */ + addi r19, r5, -32 + addi r0, r11, -4 + ld r8, 16(r4) + cmpb r10, r8, r12 + cmpdi cr7, r10, 0 + beq cr7, L(dWordUnroll) + + addi r9, r9, 16 /* increment dst by 16 */ + addi r4, r4, 16 /* increment src by 16 */ + addi r5, r5, -16 /* decrement length 'n' by 16 */ + addi r0, r11, -2 /* decrement loop counter */ + +L(dWordUnrollOFF): + ld r10, 0(r4) /* load first dword */ + li r8, 0 /* load mask */ + cmpb r8, r10, r8 + cmpdi cr7, r8, 0 + bne cr7, L(byte_by_byte) + mtctr r0 + li r7, 0 + b L(CopyDword) + + .p2align 4 +L(loadDWordandCompare): + ld r10, 0(r4) + cmpb r8, r10, r7 + cmpdi cr7, r8, 0 + bne cr7, L(byte_by_byte) + +L(CopyDword): + addi r9, r9, 8 + std r10, -8(r9) + addi r4, r4, 8 + addi r5, r5, -8 + bdnz L(loadDWordandCompare) + +L(byte_by_byte): + cmpldi cr7, r5, 3 + ble cr7, L(verifyByte) + srdi r10, r5, 2 + mr r19, r9 + mtctr r10 + b L(firstByteUnroll) + + .p2align 4 +L(bytes_unroll): + lbz r10, 1(r4) /* load byte from src */ + cmpdi cr7, r10, 0 /* compare for NULL */ + stb r10, 1(r19) /* store byte to dst */ + beq cr7, 
L(updtDestComputeN2ndByte) + + addi r4, r4, 4 /* advance src */ + + lbz r10, -2(r4) /* perform loop unrolling for byte r/w */ + cmpdi cr7, r10, 0 + stb r10, 2(r19) + beq cr7, L(updtDestComputeN3rdByte) + + lbz r10, -1(r4) /* perform loop unrolling for byte r/w */ + addi r19, r19, 4 + cmpdi cr7, r10, 0 + stb r10, -1(r19) + beq cr7, L(ComputeNByte) + + bdz L(update0) + +L(firstByteUnroll): + lbz r10, 0(r4) /* perform loop unrolling for byte r/w */ + cmpdi cr7, 10, 0 + stb r10, 0(r19) + bne cr7, L(bytes_unroll) + addi r19, r19, 1 + +L(ComputeNByte): + subf r9, r19, r9 /* compute 'n'n bytes to fill */ + add r8, r9, r5 + +L(zeroFill): + cmpdi cr7, r8, 0 /* compare if length is zero */ + beq cr7, L(update3return) + + mr r3, r19 /* fill buffer with */ + li r4, 0 /* zero fill buffer */ + mr r5, r8 /* how many bytes to fill buffer with */ + bl MEMSET /* call optimized memset */ + nop + +L(update3return): +#ifdef USE_AS_STPNCPY + addi r3, r19, -1 /* update return value */ +#endif + +L(hop2return): +#ifndef USE_AS_STPNCPY + mr r3, r18 /* set return value */ +#endif + addi r1, r1, FRAMESIZE /* restore stack pointer */ + ld r0, 16(r1) /* read the saved link register */ + ld r18, -16(r1) /* restore callers save register, r18 */ + ld r19, -8(r1) /* restore callers save register, r19 */ + mtlr r0 /* branch to link register */ + blr /* return */ + + .p2align 4 +L(update0): + mr r9, r19 + + .p2align 4 +L(verifyByte): + rldicl. r8, r5, 0, 62 +#ifdef USE_AS_STPNCPY + mr r3, r9 +#endif + beq cr0, L(hop2return) + mtctr r8 + addi r4, r4, -1 + mr r19, r9 + b L(oneBYone) + + .p2align 4 +L(proceed): + bdz L(done) + +L(oneBYone): + lbzu r10, 1(r4) /* copy byte */ + addi r19, r19, 1 + addi r8, r8, -1 + cmpdi cr7, r10, 0 + stb r10, -1(r19) + bne cr7, L(proceed) + b L(zeroFill) + + .p2align 4 +L(done): + addi r1, r1, FRAMESIZE /* restore stack pointer */ +#ifdef USE_AS_STPNCPY + mr r3, r19 /* set the return value */ +#else + mr r3, r18 /* set the return value */ +#endif + ld r0, 16(r1) /* read the saved link register */ + ld r18, -16(r1) /* restore callers save register, r18 */ + ld r19, -8(r1) /* restore callers save register, r19 */ + mtlr r0 /* branch to link register */ + blr /* return */ + +L(update1): + mr r0, r11 + mr r19, r5 + + .p2align 4 +L(leftDwords): + cmpdi cr7, r0, 0 + mr r5, r19 + bne cr7, L(dWordUnrollOFF) + b L(byte_by_byte) + + .p2align 4 +L(updtDestComputeN2ndByte): + addi r19, r19, 2 /* update dst by 2 */ + subf r9, r19, r9 /* compute distance covered */ + add r8, r9, r5 + b L(zeroFill) + + .p2align 4 +L(updtDestComputeN3rdByte): + addi r19, r19, 3 /* update dst by 3 */ + subf r9, r19, r9 /* compute distance covered */ + add r8, r9, r5 + b L(zeroFill) + + .p2align 4 +L(HopBy24): + addi r9, r9, 24 /* increment dst by 24 */ + addi r4, r4, 24 /* increment src by 24 */ + addi r5, r5, -24 /* decrement length 'n' by 24 */ + addi r0, r11, -3 /* decrement loop counter */ + b L(dWordUnrollOFF) + + .p2align 4 +L(update2): + mr r5, r19 + b L(dWordUnrollOFF) + + .p2align 4 +L(HopBy40): + addi r9, r7, 40 /* increment dst by 40 */ + addi r4, r6, 40 /* increment src by 40 */ + addi r5, r5, -40 /* decrement length 'n' by 40 */ + addi r0, r11, -5 /* decrement loop counter */ + b L(dWordUnrollOFF) + +L(update3): + mr r0, r11 + b L(dWordUnrollOFF) + +L(HopBy8): + addi r9, r3, 8 /* increment dst by 8 */ + addi r4, r4, 8 /* increment src by 8 */ + addi r5, r5, -8 /* decrement length 'n' by 8 */ + addi r0, r11, -1 /* decrement loop counter */ + b L(dWordUnrollOFF) + +L(unaligned): + cmpdi r5, 16 /* Proceed byte by byte 
for less than 16 */ + ble L(byte_by_byte) + rldicl r7, r3, 0, 61 + rldicl r6, r4, 0, 61 + cmpdi r6, 0 /* Check src alignment */ + beq L(srcaligndstunalign) + /* src is unaligned */ + rlwinm r10, r4, 3,26,28 /* Calculate padding. */ + clrrdi r4, r4, 3 /* Align the addr to dw boundary */ + ld r8, 0(r4) /* Load doubleword from memory. */ + li r0, 0 + /* Discard bits not part of the string */ +#ifdef __LITTLE_ENDIAN__ + srd r7, r8, r10 +#else + sld r7, r8, r10 +#endif + cmpb r0, r7, r0 /* Compare each byte against null */ + /* Discard bits not part of the string */ +#ifdef __LITTLE_ENDIAN__ + sld r0, r0, r10 +#else + srd r0, r0, r10 +#endif + cmpdi r0, 0 + bne L(bytebybyte) /* if it has null, copy byte by byte */ + subfic r6, r6, 8 + rlwinm r12, r3, 3,26,28 /* Calculate padding in bits. */ + rldicl r9, r3, 0, 61 /* Calculate padding in bytes. */ + addi r3, r3, -1 + + cmpdi r12, 0 /* check dest alignment */ + beq L(srcunaligndstalign) + + /* both src and dst unaligned */ +#ifdef __LITTLE_ENDIAN__ + sld r8, r7, r10 + mr r11, r10 + addi r11, r11, -8 /* Adjust byte pointer on loaded dw */ +#else + srd r8, r7, r10 + subfic r11, r10, 64 +#endif + /* dst alignment is greater then src alignment? */ + cmpd cr7, r12, r10 + ble cr7, L(dst_align_small) + /* src alignment is less than dst */ + + /* Calculate the dst alignment difference */ + subfic r7, r9, 8 + mtctr r7 + + /* Write until dst is aligned */ + cmpdi r0, r7, 4 + blt L(storebyte1) /* less than 4, store byte by byte */ + beq L(equal1) /* if its 4, store word */ + addi r0, r7, -4 /* greater than 4, so stb and stw */ + mtctr r0 +L(storebyte1): +#ifdef __LITTLE_ENDIAN__ + addi r11, r11, 8 /* Adjust byte pointer on loaded dw */ +#else + addi r11, r11, -8 +#endif + srd r7, r8, r11 + stbu r7, 1(r3) + addi r5, r5, -1 + bdnz L(storebyte1) + + subfic r7, r9, 8 /* Check the remaining bytes */ + cmpdi r0, r7, 4 + blt L(proceed1) + + .align 4 +L(equal1): +#ifdef __LITTLE_ENDIAN__ + addi r11, r11, 8 /* Adjust byte pointer on loaded dw */ + srd r7, r8, r11 +#else + subfic r11, r11, 64 + sld r7, r8, r11 + srdi r7, r7, 32 +#endif + stw r7, 1(r3) + addi r3, r3, 4 + addi r5, r5, -4 + +L(proceed1): + mr r7, r8 + /* calculate the Left over bytes to be written */ + subfic r11, r10, 64 + subfic r12, r12, 64 + subf r12, r12, r11 /* remaining bytes on second dw */ + subfic r10, r12, 64 /* remaining bytes on first dw */ + subfic r9, r9, 8 + subf r6, r9, r6 /* recalculate padding */ +L(srcunaligndstalign): + addi r3, r3, 1 + subfic r12, r10, 64 /* remaining bytes on second dw */ + addi r4, r4, 8 + li r0,0 + b L(storedouble) + + .align 4 +L(dst_align_small): + mtctr r6 + /* Write until src is aligned */ +L(storebyte2): +#ifdef __LITTLE_ENDIAN__ + addi r11, r11, 8 /* Adjust byte pointer on dw */ +#else + addi r11, r11, -8 +#endif + srd r7, r8, r11 + stbu r7, 1(r3) + addi r5, r5, -1 + bdnz L(storebyte2) + + addi r4, r4, 8 /* Increment src pointer */ + addi r3, r3, 1 /* Increment dst pointer */ + mr r9, r3 + li r8, 0 + cmpd cr7, r12, r10 + beq cr7, L(aligned) + rldicl r6, r3, 0, 61 /* Recalculate padding */ + mr r7, r6 + + /* src is algined */ +L(srcaligndstunalign): + mr r9, r3 + mr r6, r7 + ld r8, 0(r4) + subfic r10, r7, 8 + mr r7, r8 + li r0, 0 /* Check null */ + cmpb r0, r8, r0 + cmpdi r0, 0 + bne L(byte_by_byte) /* Do byte by byte if there is NULL */ + rlwinm r12, r3, 3,26,28 /* Calculate padding */ + addi r3, r3, -1 + /* write byte by byte until aligned */ +#ifdef __LITTLE_ENDIAN__ + li r11, -8 +#else + li r11, 64 +#endif + mtctr r10 + cmpdi r0, r10, 4 + blt 
L(storebyte) + beq L(equal) + addi r0, r10, -4 + mtctr r0 +L(storebyte): +#ifdef __LITTLE_ENDIAN__ + addi r11, r11, 8 /* Adjust byte pointer on dw */ +#else + addi r11, r11, -8 +#endif + srd r7, r8, r11 + stbu r7, 1(r3) + addi r5, r5, -1 + bdnz L(storebyte) + + cmpdi r0, r10, 4 + blt L(align) + + .align 4 +L(equal): +#ifdef __LITTLE_ENDIAN__ + addi r11, r11, 8 + srd r7, r8, r11 +#else + subfic r11, r11, 64 + sld r7, r8, r11 + srdi r7, r7, 32 +#endif + stw r7, 1(r3) + addi r5, r5, -4 + addi r3, r3, 4 +L(align): + addi r3, r3, 1 + addi r4, r4, 8 /* Increment src pointer */ + subfic r10, r12, 64 + li r0, 0 + /* dst addr aligned to 8 */ +L(storedouble): + cmpdi r5, 8 + ble L(null1) + ld r7, 0(r4) /* load next dw */ + cmpb r0, r7, r0 + cmpdi r0, 0 /* check for null on each new dw */ + bne L(null) +#ifdef __LITTLE_ENDIAN__ + srd r9, r8, r10 /* bytes from first dw */ + sld r11, r7, r12 /* bytes from second dw */ +#else + sld r9, r8, r10 + srd r11, r7, r12 +#endif + or r11, r9, r11 /* make as a single dw */ + std r11, 0(r3) /* store as std on aligned addr */ + mr r8, r7 /* still few bytes left to be written */ + addi r3, r3, 8 /* increment dst addr */ + addi r4, r4, 8 /* increment src addr */ + addi r5, r5, -8 + b L(storedouble) /* Loop until NULL */ + + .align 4 + +/* We've hit the end of the string. Do the rest byte-by-byte. */ +L(null): + addi r3, r3, -1 + mr r10, r12 + mtctr r6 +#ifdef __LITTLE_ENDIAN__ + subfic r10, r10, 64 + addi r10, r10, -8 +#endif + cmpdi r0, r5, 4 + blt L(loop) + cmpdi r0, r6, 4 + blt L(loop) + + /* we can still use stw if leftover >= 4 */ +#ifdef __LITTLE_ENDIAN__ + addi r10, r10, 8 + srd r11, r8, r10 +#else + subfic r10, r10, 64 + sld r11, r8, r10 + srdi r11, r11, 32 +#endif + stw r11, 1(r3) + addi r5, r5, -4 + addi r3, r3, 4 + cmpdi r0, r5, 0 + beq L(g1) + cmpdi r0, r6, 4 + beq L(bytebybyte1) + addi r10, r10, 32 +#ifdef __LITTLE_ENDIAN__ + addi r10, r10, -8 +#else + subfic r10, r10, 64 +#endif + addi r0, r6, -4 + mtctr r0 + /* remaining byte by byte part of first dw */ +L(loop): +#ifdef __LITTLE_ENDIAN__ + addi r10, r10, 8 +#else + addi r10, r10, -8 +#endif + srd r0, r8, r10 + stbu r0, 1(r3) + addi r5, r5, -1 + cmpdi r0, r5, 0 + beq L(g1) + bdnz L(loop) +L(bytebybyte1): + addi r3, r3, 1 + /* remaining byte by byte part of second dw */ +L(bytebybyte): + addi r3, r3, -8 + addi r4, r4, -1 + +#ifdef __LITTLE_ENDIAN__ + extrdi. r0, r7, 8, 56 + stbu r7, 8(r3) + addi r5, r5, -1 + beq L(g2) + cmpdi r5, 0 + beq L(g1) + extrdi. r0, r7, 8, 48 + stbu r0, 1(r3) + addi r5, r5, -1 + beq L(g2) + cmpdi r5, 0 + beq L(g1) + extrdi. r0, r7, 8, 40 + stbu r0, 1(r3) + addi r5, r5, -1 + beq L(g2) + cmpdi r5, 0 + beq L(g1) + extrdi. r0, r7, 8, 32 + stbu r0, 1(r3) + addi r5, r5, -1 + beq L(g2) + cmpdi r5, 0 + beq L(g1) + extrdi. r0, r7, 8, 24 + stbu r0, 1(r3) + addi r5, r5, -1 + beq L(g2) + cmpdi r5, 0 + beq L(g1) + extrdi. r0, r7, 8, 16 + stbu r0, 1(r3) + addi r5, r5, -1 + beq L(g2) + cmpdi r5, 0 + beq L(g1) + extrdi. r0, r7, 8, 8 + stbu r0, 1(r3) + addi r5, r5, -1 + beq L(g2) + cmpdi r5, 0 + beq L(g1) + extrdi r0, r7, 8, 0 + stbu r0, 1(r3) + addi r5, r5, -1 + b L(g2) +#else + extrdi. r0, r7, 8, 0 + stbu r0, 8(r3) + addi r5, r5, -1 + beq L(g2) + cmpdi r5, 0 + beq L(g1) + extrdi. r0, r7, 8, 8 + stbu r0, 1(r3) + addi r5, r5, -1 + beq L(g2) + cmpdi r5, 0 + beq L(g1) + extrdi. r0, r7, 8, 16 + stbu r0, 1(r3) + addi r5, r5, -1 + beq L(g2) + cmpdi r5, 0 + beq L(g1) + extrdi. r0, r7, 8, 24 + stbu r0, 1(r3) + addi r5, r5, -1 + beq L(g2) + cmpdi r5, 0 + beq L(g1) + extrdi. 
r0, r7, 8, 32 + stbu r0, 1(r3) + addi r5, r5, -1 + beq L(g2) + cmpdi r5, 0 + beq L(g1) + extrdi. r0, r7, 8, 40 + stbu r0, 1(r3) + addi r5, r5, -1 + beq L(g2) + cmpdi r5, 0 + beq L(g1) + extrdi. r0, r7, 8, 48 + stbu r0, 1(r3) + addi r5, r5, -1 + beq L(g2) + cmpdi r5, 0 + beq L(g1) + stbu r7, 1(r3) + addi r5, r5, -1 + b L(g2) +#endif +L(g1): +#ifdef USE_AS_STPNCPY + addi r3, r3, 1 +#endif +L(g2): + addi r3, r3, 1 + mr r19, r3 + mr r8, r5 + b L(zeroFill) +L(null1): + mr r9, r3 + subf r4, r6, r4 + b L(byte_by_byte) +END(FUNC_NAME) +#ifndef USE_AS_STPNCPY +libc_hidden_builtin_def (strncpy) +#endif diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strnlen.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strnlen.S new file mode 100644 index 0000000000..a970b6ce30 --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strnlen.S @@ -0,0 +1,182 @@ +/* Optimized strnlen implementation for PowerPC64/POWER7 using cmpb insn. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Luis Machado <luisgpm@br.ibm.com>. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#ifndef STRNLEN +# define STRNLEN __strnlen +#endif + +/* int [r3] strnlen (char *s [r3], int size [r4]) */ + .machine power7 +ENTRY (STRNLEN) + CALL_MCOUNT 2 + dcbt 0,r3 + clrrdi r8,r3,3 + add r7,r3,r4 /* Calculate the last acceptable address. */ + cmpldi r4,32 + li r0,0 /* Doubleword with null chars. */ + addi r7,r7,-1 + + /* If we have less than 33 bytes to search, skip to a faster code. */ + ble L(small_range) + + rlwinm r6,r3,3,26,28 /* Calculate padding. */ + ld r12,0(r8) /* Load doubleword from memory. */ + cmpb r10,r12,r0 /* Check for null bytes in DWORD1. */ +#ifdef __LITTLE_ENDIAN__ + srd r10,r10,r6 + sld r10,r10,r6 +#else + sld r10,r10,r6 + srd r10,r10,r6 +#endif + cmpldi cr7,r10,0 /* If r10 == 0, no null's have been found. */ + bne cr7,L(done) + + clrrdi r7,r7,3 /* Address of last doubleword. */ + mtcrf 0x01,r8 + /* Are we now aligned to a quadword boundary? If so, skip to + the main loop. Otherwise, go through the alignment code. */ + + bt 28,L(loop_setup) + + /* Handle DWORD2 of pair. */ + ldu r12,8(r8) + cmpb r10,r12,r0 + cmpldi cr7,r10,0 + bne cr7,L(done) + +L(loop_setup): + /* The last dword we want to read in the loop below is the one + containing the last byte of the string, ie. the dword at + (s + size - 1) & ~7, or r7. The first dword read is at + r8 + 8, we read 2 * cnt dwords, so the last dword read will + be at r8 + 8 + 16 * cnt - 8. Solving for cnt gives + cnt = (r7 - r8) / 16 */ + sub r5,r7,r8 + srdi r6,r5,4 /* Number of loop iterations. */ + mtctr r6 /* Setup the counter. */ + + /* Main loop to look for the null byte in the string. Since + it's a small loop (< 8 instructions), align it to 32-bytes. */ + .p2align 5 +L(loop): + /* Load two doublewords, compare and merge in a + single register for speed. 
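
   In C terms the merged test is roughly the sketch below (scan_pairs
   and haszero are hypothetical names; haszero is the usual SWAR
   zero-byte test, standing in for cmpb):

     #include <stdint.h>
     #include <stddef.h>

     static inline uint64_t haszero (uint64_t v)
     {
       return (v - 0x0101010101010101ULL) & ~v
              & 0x8080808080808080ULL;
     }

     // One branch covers sixteen bytes of input per iteration.
     static size_t scan_pairs (const uint64_t *w, size_t npairs)
     {
       for (size_t i = 0; i < 2 * npairs; i += 2)
         if (haszero (w[i]) | haszero (w[i + 1]))
           return i;         // null byte is in w[i] or w[i + 1]
       return 2 * npairs;
     }
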
This is an attempt + to speed up the null-checking process for bigger strings. */ + + ld r12,8(r8) + ldu r11,16(r8) + cmpb r10,r12,r0 + cmpb r9,r11,r0 + or r5,r9,r10 /* Merge everything in one doubleword. */ + cmpldi cr7,r5,0 + bne cr7,L(found) + bdnz L(loop) + + /* We may have one more dword to read. */ + cmpld cr6,r8,r7 + beq cr6,L(end_max) + + ldu r12,8(r8) + cmpb r10,r12,r0 + cmpldi cr6,r10,0 + bne cr6,L(done) + +L(end_max): + mr r3,r4 + blr + + /* OK, one (or both) of the doublewords contains a null byte. Check + the first doubleword and decrement the address in case the first + doubleword really contains a null byte. */ + .align 4 +L(found): + cmpldi cr6,r10,0 + addi r8,r8,-8 + bne cr6,L(done) + + /* The null byte must be in the second doubleword. Adjust the address + again and move the result of cmpb to r10 so we can calculate the + length. */ + + mr r10,r9 + addi r8,r8,8 + + /* r10 has the output of the cmpb instruction, that is, it contains + 0xff in the same position as the null byte in the original + doubleword from the string. Use that to calculate the length. + We need to make sure the null char is *before* the end of the + range. */ +L(done): +#ifdef __LITTLE_ENDIAN__ + addi r0,r10,-1 + andc r0,r0,r10 + popcntd r0,r0 +#else + cntlzd r0,r10 /* Count leading zeros before the match. */ +#endif + sub r3,r8,r3 + srdi r0,r0,3 /* Convert leading/trailing zeros to bytes. */ + add r3,r3,r0 /* Length until the match. */ + cmpld r3,r4 + blelr + mr r3,r4 + blr + +/* Deals with size <= 32. */ + .align 4 +L(small_range): + cmpldi r4,0 + beq L(end_max) + + clrrdi r7,r7,3 /* Address of last doubleword. */ + + rlwinm r6,r3,3,26,28 /* Calculate padding. */ + ld r12,0(r8) /* Load doubleword from memory. */ + cmpb r10,r12,r0 /* Check for null bytes in DWORD1. */ +#ifdef __LITTLE_ENDIAN__ + srd r10,r10,r6 + sld r10,r10,r6 +#else + sld r10,r10,r6 + srd r10,r10,r6 +#endif + cmpldi cr7,r10,0 + bne cr7,L(done) + + cmpld r8,r7 + beq L(end_max) + + .p2align 5 +L(loop_small): + ldu r12,8(r8) + cmpb r10,r12,r0 + cmpldi cr6,r10,0 + bne cr6,L(done) + cmpld r8,r7 + bne L(loop_small) + mr r3,r4 + blr + +END (STRNLEN) +libc_hidden_def (__strnlen) +weak_alias (__strnlen, strnlen) +libc_hidden_def (strnlen) diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strrchr.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strrchr.S new file mode 100644 index 0000000000..c22393deb5 --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strrchr.S @@ -0,0 +1,260 @@ +/* Optimized strrchr implementation for PowerPC64/POWER7 using cmpb insn. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
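
   (A note on strnlen.S above: whatever the scan finds, the final
   answer is clamped to the caller's limit; the cmpld/blelr pair at
   L(done) amounts to the following, sketched with a hypothetical
   name:

     #include <stddef.h>

     static inline size_t clamp_len (size_t found, size_t maxlen)
     {
       return found <= maxlen ? found : maxlen;
     }

   This is what guarantees that a null byte beyond s + maxlen is
   ignored and maxlen is returned instead.)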
*/ + +#include <sysdep.h> + +/* int [r3] strrchr (char *s [r3], int c [r4]) */ + +#ifndef STRRCHR +# define STRRCHR strrchr +#endif + + .machine power7 +ENTRY (STRRCHR) + CALL_MCOUNT 2 + dcbt 0,r3 + clrrdi r8,r3,3 /* Align the address to doubleword boundary. */ + cmpdi cr7,r4,0 + ld r12,0(r8) /* Load doubleword from memory. */ + li r9,0 /* used to store last occurence */ + li r0,0 /* Doubleword with null chars to use + with cmpb. */ + + rlwinm r6,r3,3,26,28 /* Calculate padding. */ + + beq cr7,L(null_match) + + /* Replicate byte to doubleword. */ + insrdi r4,r4,8,48 + insrdi r4,r4,16,32 + insrdi r4,r4,32,0 + + /* r4 is changed now ,if its passed as more chars + check for null again */ + cmpdi cr7,r4,0 + beq cr7,L(null_match) + /* Now r4 has a doubleword of c bytes and r0 has + a doubleword of null bytes. */ + + cmpb r10,r12,r4 /* Compare each byte against c byte. */ + cmpb r11,r12,r0 /* Compare each byte against null byte. */ + + /* Move the doublewords left and right to discard the bits that are + not part of the string and bring them back as zeros. */ +#ifdef __LITTLE_ENDIAN__ + srd r10,r10,r6 + srd r11,r11,r6 + sld r10,r10,r6 + sld r11,r11,r6 +#else + sld r10,r10,r6 + sld r11,r11,r6 + srd r10,r10,r6 + srd r11,r11,r6 +#endif + or r5,r10,r11 /* OR the results to speed things up. */ + cmpdi cr7,r5,0 /* If r5 == 0, no c or null bytes + have been found. */ + bne cr7,L(done) + +L(align): + mtcrf 0x01,r8 + + /* Are we now aligned to a doubleword boundary? If so, skip to + the main loop. Otherwise, go through the alignment code. */ + + bt 28,L(loop) + + /* Handle WORD2 of pair. */ + ldu r12,8(r8) + cmpb r10,r12,r4 + cmpb r11,r12,r0 + or r5,r10,r11 + cmpdi cr7,r5,0 + bne cr7,L(done) + b L(loop) /* We branch here (rather than falling through) + to skip the nops due to heavy alignment + of the loop below. */ + .p2align 5 +L(loop): + /* Load two doublewords, compare and merge in a + single register for speed. This is an attempt + to speed up the null-checking process for bigger strings. */ + ld r12,8(r8) + ldu r7,16(r8) + cmpb r10,r12,r4 + cmpb r11,r12,r0 + cmpb r6,r7,r4 + cmpb r7,r7,r0 + or r12,r10,r11 + or r5,r6,r7 + or r5,r12,r5 + cmpdi cr7,r5,0 + beq cr7,L(loop) + + /* OK, one (or both) of the doublewords contains a c/null byte. Check + the first doubleword and decrement the address in case the first + doubleword really contains a c/null byte. */ + cmpdi cr6,r12,0 + addi r8,r8,-8 + bne cr6,L(done) + + /* The c/null byte must be in the second doubleword. Adjust the + address again and move the result of cmpb to r10 so we can calculate + the pointer. */ + + mr r10,r6 + mr r11,r7 + addi r8,r8,8 + + /* r10/r11 have the output of the cmpb instructions, that is, + 0xff in the same position as the c/null byte in the original + doubleword from the string. Use that to calculate the pointer. */ + +L(done): + /* if there are more than one 0xff in r11, find the first pos of ff + in r11 and fill r10 with 0 from that position */ + cmpdi cr7,r11,0 + beq cr7,L(no_null) +#ifdef __LITTLE_ENDIAN__ + addi r3,r11,-1 + andc r3,r3,r11 + popcntd r0,r3 +#else + cntlzd r0,r11 +#endif + subfic r0,r0,63 + li r6,-1 +#ifdef __LITTLE_ENDIAN__ + srd r0,r6,r0 +#else + sld r0,r6,r0 +#endif + and r10,r0,r10 +L(no_null): +#ifdef __LITTLE_ENDIAN__ + cntlzd r0,r10 /* Count leading zeros before c matches. */ + addi r3,r10,-1 + andc r3,r3,r10 + addi r10,r11,-1 + andc r10,r10,r11 + cmpld cr7,r3,r10 + bgt cr7,L(no_match) +#else + addi r3,r10,-1 /* Count trailing zeros before c matches. 
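
   For reference, the behavior being implemented is the plain scalar
   loop below; a sketch, not the glibc code:

     char *my_strrchr (const char *s, int c)
     {
       const char *last = 0;          // like r9, the last hit so far
       for (;; s++)
         {
           if (*s == (char) c)
             last = s;                // remember the latest match
           if (*s == '\0')
             return (char *) last;    // for c == 0 this is s itself
         }
     }

   The doubleword code applies the same "last match wins" rule one
   word at a time.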
*/ + andc r3,r3,r10 + popcntd r0,r3 + cmpld cr7,r11,r10 + bgt cr7,L(no_match) +#endif + srdi r0,r0,3 /* Convert trailing zeros to bytes. */ + subfic r0,r0,7 + add r9,r8,r0 /* Return address of the matching c byte + or null in case c was not found. */ + li r0,0 + cmpdi cr7,r11,0 /* If r11 == 0, no null's have been found. */ + beq cr7,L(align) + + .align 4 +L(no_match): + mr r3,r9 + blr + +/* We are here because strrchr was called with a null byte. */ + .align 4 +L(null_match): + /* r0 has a doubleword of null bytes. */ + + cmpb r5,r12,r0 /* Compare each byte against null bytes. */ + + /* Move the doublewords left and right to discard the bits that are + not part of the string and bring them back as zeros. */ +#ifdef __LITTLE_ENDIAN__ + srd r5,r5,r6 + sld r5,r5,r6 +#else + sld r5,r5,r6 + srd r5,r5,r6 +#endif + cmpdi cr7,r5,0 /* If r10 == 0, no c or null bytes + have been found. */ + bne cr7,L(done_null) + + mtcrf 0x01,r8 + + /* Are we now aligned to a quadword boundary? If so, skip to + the main loop. Otherwise, go through the alignment code. */ + + bt 28,L(loop_null) + + /* Handle WORD2 of pair. */ + ldu r12,8(r8) + cmpb r5,r12,r0 + cmpdi cr7,r5,0 + bne cr7,L(done_null) + b L(loop_null) /* We branch here (rather than falling through) + to skip the nops due to heavy alignment + of the loop below. */ + + /* Main loop to look for the end of the string. Since it's a + small loop (< 8 instructions), align it to 32-bytes. */ + .p2align 5 +L(loop_null): + /* Load two doublewords, compare and merge in a + single register for speed. This is an attempt + to speed up the null-checking process for bigger strings. */ + ld r12,8(r8) + ldu r11,16(r8) + cmpb r5,r12,r0 + cmpb r10,r11,r0 + or r6,r5,r10 + cmpdi cr7,r6,0 + beq cr7,L(loop_null) + + /* OK, one (or both) of the doublewords contains a null byte. Check + the first doubleword and decrement the address in case the first + doubleword really contains a null byte. */ + + cmpdi cr6,r5,0 + addi r8,r8,-8 + bne cr6,L(done_null) + + /* The null byte must be in the second doubleword. Adjust the address + again and move the result of cmpb to r10 so we can calculate the + pointer. */ + + mr r5,r10 + addi r8,r8,8 + + /* r5 has the output of the cmpb instruction, that is, it contains + 0xff in the same position as the null byte in the original + doubleword from the string. Use that to calculate the pointer. */ +L(done_null): +#ifdef __LITTLE_ENDIAN__ + addi r0,r5,-1 + andc r0,r0,r5 + popcntd r0,r0 +#else + cntlzd r0,r5 /* Count leading zeros before the match. */ +#endif + srdi r0,r0,3 /* Convert trailing zeros to bytes. */ + add r3,r8,r0 /* Return address of the matching null byte. */ + blr +END (STRRCHR) +weak_alias (strrchr, rindex) +libc_hidden_builtin_def (strrchr) diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strstr-ppc64.c b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strstr-ppc64.c new file mode 100644 index 0000000000..a917b2157e --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strstr-ppc64.c @@ -0,0 +1,27 @@ +/* Optimized strstr implementation for PowerPC64/POWER7. + Copyright (C) 2015-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
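
   (A note on strrchr.S above: at L(done) any c-match that falls after
   the first null byte must be discarded.  On little endian that is a
   mask built from the null mask; a sketch with a hypothetical name,
   assuming c != 0 so a match and the null never share a byte:

     #include <stdint.h>

     static inline uint64_t drop_matches_past_null_le (uint64_t match,
                                                       uint64_t nullm)
     {
       if (nullm == 0)
         return match;             // no terminator in this word
       return match & ((nullm - 1) & ~nullm); // keep bits below it
     }

   The big-endian variant does the same with a left-justified mask.)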
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <string.h> + +#define STRSTR __strstr_ppc +#undef libc_hidden_builtin_def +#define libc_hidden_builtin_def(__name) + +extern __typeof (strstr) __strstr_ppc attribute_hidden; + +#include <string/strstr.c> diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strstr.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strstr.S new file mode 100644 index 0000000000..260db2ed6d --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strstr.S @@ -0,0 +1,521 @@ +/* Optimized strstr implementation for PowerPC64/POWER7. + Copyright (C) 2015-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +/* Char * [r3] strstr (char *s [r3], char * pat[r4]) */ + +/* The performance gain is obtained using aligned memory access, load + * doubleword and usage of cmpb instruction for quicker comparison. */ + +#define ITERATIONS 64 + +#ifndef STRSTR +# define STRSTR strstr +#endif + +#ifndef STRLEN +/* For builds with no IFUNC support, local calls should be made to internal + GLIBC symbol (created by libc_hidden_builtin_def). */ +# ifdef SHARED +# define STRLEN __GI_strlen +# else +# define STRLEN strlen +# endif +#endif + +#ifndef STRNLEN +/* For builds with no IFUNC support, local calls should be made to internal + GLIBC symbol (created by libc_hidden_builtin_def). */ +# ifdef SHARED +# define STRNLEN __GI_strnlen +# else +# define STRNLEN __strnlen +# endif +#endif + +#ifndef STRCHR +# ifdef SHARED +# define STRCHR __GI_strchr +# else +# define STRCHR strchr +# endif +#endif + +#define FRAMESIZE (FRAME_MIN_SIZE+32) + .machine power7 +EALIGN (STRSTR, 4, 0) + CALL_MCOUNT 2 + mflr r0 /* Load link register LR to r0. */ + std r31, -8(r1) /* Save callers register r31. */ + std r30, -16(r1) /* Save callers register r30. */ + std r29, -24(r1) /* Save callers register r29. */ + std r28, -32(r1) /* Save callers register r28. */ + std r0, 16(r1) /* Store the link register. */ + cfi_offset(r31, -8) + cfi_offset(r30, -16) + cfi_offset(r28, -32) + cfi_offset(r29, -24) + cfi_offset(lr, 16) + stdu r1, -FRAMESIZE(r1) /* Create the stack frame. */ + cfi_adjust_cfa_offset(FRAMESIZE) + + dcbt 0, r3 + dcbt 0, r4 + cmpdi cr7, r3, 0 + beq cr7, L(retnull) + cmpdi cr7, r4, 0 + beq cr7, L(retnull) + + mr r29, r3 + mr r30, r4 + mr r3, r4 + bl STRLEN + nop + + cmpdi cr7, r3, 0 /* If search str is null. 
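
   (The prologue above implements the standard guards; as a sketch in
   C, with simple_strstr a hypothetical name and assuming a POSIX
   strnlen:

     #include <string.h>

     char *simple_strstr (const char *s, const char *pat)
     {
       size_t n = strlen (pat);
       if (n == 0)
         return (char *) s;            // empty needle matches at s
       if (strnlen (s, n) < n)
         return (char *) 0;            // haystack shorter than needle
       for (const char *p = strchr (s, pat[0]); p != 0;
            p = strchr (p + 1, pat[0]))
         if (strncmp (p, pat, n) == 0)
           return (char *) p;
       return (char *) 0;
     }

   The assembly uses the same STRLEN/STRNLEN/STRCHR trio before its
   doubleword comparison loops.)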
*/ + beq cr7, L(ret_r3) + + mr r31, r3 + mr r4, r3 + mr r3, r29 + bl STRNLEN + nop + + cmpd cr7, r3, r31 /* If len(r3) < len(r4). */ + blt cr7, L(retnull) + mr r3, r29 + lbz r4, 0(r30) + bl STRCHR + nop + + mr r11, r3 + /* If first char of search str is not present. */ + cmpdi cr7, r3, 0 + ble cr7, L(end) + /* Reg r28 is used to count the number of iterations. */ + li r28, 0 + rldicl r8, r3, 0, 52 /* Page cross check. */ + cmpldi cr7, r8, 4096-16 + bgt cr7, L(bytebybyte) + + rldicl r8, r30, 0, 52 + cmpldi cr7, r8, 4096-16 + bgt cr7, L(bytebybyte) + + /* If len(r4) < 8 handle in a different way. */ + /* Shift position based on null and use cmpb. */ + cmpdi cr7, r31, 8 + blt cr7, L(lessthan8) + + /* Len(r4) >= 8 reaches here. */ + mr r8, r3 /* Save r3 for future use. */ + mr r4, r30 /* Restore r4. */ + li r0, 0 + rlwinm r10, r30, 3, 26, 28 /* Calculate padding in bits. */ + clrrdi r4, r4, 3 /* Make r4 aligned to 8. */ + ld r6, 0(r4) + addi r4, r4, 8 + cmpdi cr7, r10, 0 /* Check if its already aligned? */ + beq cr7, L(begin1) +#ifdef __LITTLE_ENDIAN__ + srd r6, r6, r10 /* Discard unwanted bits. */ +#else + sld r6, r6, r10 +#endif + ld r9, 0(r4) + subfic r10, r10, 64 +#ifdef __LITTLE_ENDIAN__ + sld r9, r9, r10 /* Discard unwanted bits. */ +#else + srd r9, r9, r10 +#endif + or r6, r6, r9 /* Form complete search str. */ +L(begin1): + mr r29, r6 + rlwinm r10, r3, 3, 26, 28 + clrrdi r3, r3, 3 + ld r5, 0(r3) + cmpb r9, r0, r6 /* Check if input has null. */ + cmpdi cr7, r9, 0 + bne cr7, L(return3) + cmpb r9, r0, r5 /* Check if input has null. */ +#ifdef __LITTLE_ENDIAN__ + srd r9, r9, r10 +#else + sld r9, r9, r10 +#endif + cmpdi cr7, r9, 0 + bne cr7, L(retnull) + + li r12, -8 /* Shift values. */ + li r11, 72 /* Shift values. */ + cmpdi cr7, r10, 0 + beq cr7, L(nextbyte1) + mr r12, r10 + addi r12, r12, -8 + subfic r11, r12, 64 + +L(nextbyte1): + ldu r7, 8(r3) /* Load next dw. */ + addi r12, r12, 8 /* Shift one byte and compare. */ + addi r11, r11, -8 +#ifdef __LITTLE_ENDIAN__ + srd r9, r5, r12 /* Rotate based on mask. */ + sld r10, r7, r11 +#else + sld r9, r5, r12 + srd r10, r7, r11 +#endif + /* Form single dw from few bytes on first load and second load. */ + or r10, r9, r10 + /* Check for null in the formed dw. */ + cmpb r9, r0, r10 + cmpdi cr7, r9, 0 + bne cr7, L(retnull) + /* Cmpb search str and input str. */ + cmpb r9, r10, r6 + cmpdi cr7, r9, -1 + beq cr7, L(match) + addi r8, r8, 1 + b L(begin) + + .align 4 +L(match): + /* There is a match of 8 bytes, check next bytes. */ + cmpdi cr7, r31, 8 + beq cr7, L(return) + /* Update next starting point r8. */ + srdi r9, r11, 3 + subf r9, r9, r3 + mr r8, r9 + +L(secondmatch): + mr r5, r7 + rlwinm r10, r30, 3, 26, 28 /* Calculate padding in bits. */ + ld r6, 0(r4) + addi r4, r4, 8 + cmpdi cr7, r10, 0 /* Check if its already aligned? */ + beq cr7, L(proceed3) +#ifdef __LITTLE_ENDIAN__ + srd r6, r6, r10 /* Discard unwanted bits. */ + cmpb r9, r0, r6 + sld r9, r9, r10 +#else + sld r6, r6, r10 + cmpb r9, r0, r6 + srd r9, r9, r10 +#endif + cmpdi cr7, r9, 0 + bne cr7, L(proceed3) + ld r9, 0(r4) + subfic r10, r10, 64 +#ifdef __LITTLE_ENDIAN__ + sld r9, r9, r10 /* Discard unwanted bits. */ +#else + srd r9, r9, r10 +#endif + or r6, r6, r9 /* Form complete search str. 
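
   In C, forming a doubleword at an unaligned address out of two
   aligned loads looks like the sketch below (little-endian flavor,
   load8_unaligned_le a hypothetical name); this is also why the code
   first rules out page-crossing addresses with the 4096-16 checks, so
   the second aligned load cannot fault:

     #include <stdint.h>

     static inline uint64_t load8_unaligned_le (uintptr_t a)
     {
       const uint64_t *w = (const uint64_t *) (a & ~(uintptr_t) 7);
       unsigned pad = 8 * (unsigned) (a & 7);   // bits to discard
       if (pad == 0)
         return w[0];                           // already aligned
       return (w[0] >> pad) | (w[1] << (64 - pad));
     }
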
*/ + +L(proceed3): + li r7, 0 + addi r3, r3, 8 + cmpb r9, r0, r5 + cmpdi cr7, r9, 0 + bne cr7, L(proceed4) + ld r7, 0(r3) +L(proceed4): +#ifdef __LITTLE_ENDIAN__ + srd r9, r5, r12 + sld r10, r7, r11 +#else + sld r9, r5, r12 + srd r10, r7, r11 +#endif + /* Form single dw with few bytes from first and second load. */ + or r10, r9, r10 + cmpb r9, r0, r6 + cmpdi cr7, r9, 0 + bne cr7, L(return4) + /* Check for null in the formed dw. */ + cmpb r9, r0, r10 + cmpdi cr7, r9, 0 + bne cr7, L(retnull) + /* If the next 8 bytes dont match, start search again. */ + cmpb r9, r10, r6 + cmpdi cr7, r9, -1 + bne cr7, L(reset) + /* If the next 8 bytes match, load and compare next 8. */ + b L(secondmatch) + + .align 4 +L(reset): + /* Start the search again. */ + addi r8, r8, 1 + b L(begin) + + .align 4 +L(return3): + /* Count leading zeros and compare partial dw. */ +#ifdef __LITTLE_ENDIAN__ + addi r7, r9, -1 + andc r7, r7, r9 + popcntd r7, r7 + subfic r7, r7, 64 + sld r10, r5, r7 + sld r6, r6, r7 +#else + cntlzd r7, r9 + subfic r7, r7, 64 + srd r10, r5, r7 + srd r6, r6, r7 +#endif + cmpb r9, r10, r6 + cmpdi cr7, r9, -1 + addi r8, r8, 1 + /* Start search again if there is no match. */ + bne cr7, L(begin) + /* If the words match, update return values. */ + subfic r7, r7, 64 + srdi r7, r7, 3 + add r3, r3, r7 + subf r3, r31, r3 + b L(end) + + .align 4 +L(return4): + /* Count leading zeros and compare partial dw. */ +#ifdef __LITTLE_ENDIAN__ + addi r7, r9, -1 + andc r7, r7, r9 + popcntd r7, r7 + subfic r7, r7, 64 + sld r10, r10, r7 + sld r6, r6, r7 +#else + cntlzd r7, r9 + subfic r7, r7, 64 + srd r10, r10, r7 + srd r6, r6, r7 +#endif + cmpb r9, r10, r6 + cmpdi cr7, r9, -1 + addi r8, r8, 1 + bne cr7, L(begin) + subfic r7, r7, 64 + srdi r11, r11, 3 + subf r3, r11, r3 + srdi r7, r7, 3 + add r3, r3, r7 + subf r3, r31, r3 + b L(end) + + .align 4 +L(begin): + mr r3, r8 + /* When our iterations exceed ITERATIONS,fall back to default. */ + addi r28, r28, 1 + cmpdi cr7, r28, ITERATIONS + beq cr7, L(default) + lbz r4, 0(r30) + bl STRCHR + nop + /* If first char of search str is not present. */ + cmpdi cr7, r3, 0 + ble cr7, L(end) + mr r8, r3 + mr r4, r30 /* Restore r4. */ + li r0, 0 + mr r6, r29 + clrrdi r4, r4, 3 + addi r4, r4, 8 + b L(begin1) + + /* Handle less than 8 search string. */ + .align 4 +L(lessthan8): + mr r4, r3 + mr r9, r30 + li r0, 0 + + rlwinm r10, r9, 3, 26, 28 /* Calculate padding in bits. */ + srdi r8, r10, 3 /* Padding in bytes. */ + clrrdi r9, r9, 3 /* Make r4 aligned to 8. */ + ld r6, 0(r9) + cmpdi cr7, r10, 0 /* Check if its already aligned? */ + beq cr7, L(proceed2) +#ifdef __LITTLE_ENDIAN__ + srd r6, r6, r10 /* Discard unwanted bits. */ +#else + sld r6, r6, r10 +#endif + subfic r8, r8, 8 + cmpd cr7, r8, r31 /* Next load needed? */ + bge cr7, L(proceed2) + ld r7, 8(r9) + subfic r10, r10, 64 +#ifdef __LITTLE_ENDIAN__ + sld r7, r7, r10 /* Discard unwanted bits. */ +#else + srd r7, r7, r10 +#endif + or r6, r6, r7 /* Form complete search str. */ +L(proceed2): + mr r29, r6 + rlwinm r10, r3, 3, 26, 28 + clrrdi r7, r3, 3 /* Make r3 aligned. */ + ld r5, 0(r7) + sldi r8, r31, 3 + subfic r8, r8, 64 +#ifdef __LITTLE_ENDIAN__ + sld r6, r6, r8 + cmpb r9, r0, r5 + srd r9, r9, r10 +#else + srd r6, r6, r8 + cmpb r9, r0, r5 + sld r9, r9, r10 +#endif + cmpdi cr7, r9, 0 + bne cr7, L(noload) + cmpdi cr7, r10, 0 + beq cr7, L(continue) + ld r7, 8(r7) +L(continue1): + mr r12, r10 + addi r12, r12, -8 + subfic r11, r12, 64 + b L(nextbyte) + + .align 4 +L(continue): + ld r7, 8(r7) + li r12, -8 /* Shift values. 
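
   (r12/r11 drive a one-byte-at-a-time window shift: the needle stays
   in a register while an 8-byte window of the haystack is rebuilt and
   compared after each shift.  Behaviorally this is the sketch below,
   with find_short a hypothetical name; the real code additionally
   gives up after ITERATIONS candidates and calls __strstr_ppc:

     #include <string.h>

     // Needle length n < 8; returns the first match offset or -1.
     static long find_short (const char *hay, size_t hay_len,
                             const char *pat, size_t n)
     {
       for (size_t i = 0; i + n <= hay_len; i++)
         if (memcmp (hay + i, pat, n) == 0)
           return (long) i;
       return -1;
     }
   )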
*/ + li r11, 72 /* Shift values. */ +L(nextbyte): + addi r12, r12, 8 /* Mask for rotation. */ + addi r11, r11, -8 +#ifdef __LITTLE_ENDIAN__ + srd r9, r5, r12 + sld r10, r7, r11 + or r10, r9, r10 + sld r10, r10, r8 + cmpb r9, r0, r10 + srd r9, r9, r8 +#else + sld r9, r5, r12 + srd r10, r7, r11 + or r10, r9, r10 + srd r10, r10, r8 + cmpb r9, r0, r10 + sld r9, r9, r8 +#endif + cmpdi cr7, r9, 0 + bne cr7, L(retnull) + cmpb r9, r10, r6 + cmpdi cr7, r9, -1 + beq cr7, L(end) + addi r3, r4, 1 + /* When our iterations exceed ITERATIONS,fall back to default. */ + addi r28, r28, 1 + cmpdi cr7, r28, ITERATIONS + beq cr7, L(default) + lbz r4, 0(r30) + bl STRCHR + nop + /* If first char of search str is not present. */ + cmpdi cr7, r3, 0 + ble cr7, L(end) + mr r4, r3 + mr r6, r29 + li r0, 0 + b L(proceed2) + + .align 4 +L(noload): + /* Reached null in r3, so skip next load. */ + li r7, 0 + b L(continue1) + + .align 4 +L(return): + /* Update return values. */ + srdi r9, r11, 3 + subf r3, r9, r3 + b L(end) + + /* Handling byte by byte. */ + .align 4 +L(bytebybyte): + mr r8, r3 + addi r8, r8, -1 +L(loop1): + addi r8, r8, 1 + mr r3, r8 + mr r4, r30 + lbz r6, 0(r4) + cmpdi cr7, r6, 0 + beq cr7, L(updater3) +L(loop): + lbz r5, 0(r3) + cmpdi cr7, r5, 0 + beq cr7, L(retnull) + cmpld cr7, r6, r5 + bne cr7, L(loop1) + addi r3, r3, 1 + addi r4, r4, 1 + lbz r6, 0(r4) + cmpdi cr7, r6, 0 + beq cr7, L(updater3) + b L(loop) + + /* Handling return values. */ + .align 4 +L(updater3): + subf r3, r31, r3 /* Reduce len of r4 from r3. */ + b L(end) + + .align 4 +L(ret_r3): + mr r3, r29 /* Return r3. */ + b L(end) + + .align 4 +L(retnull): + li r3, 0 /* Return NULL. */ + b L(end) + + .align 4 +L(default): + mr r4, r30 + bl __strstr_ppc + nop + + .align 4 +L(end): + addi r1, r1, FRAMESIZE /* Restore stack pointer. */ + cfi_adjust_cfa_offset(-FRAMESIZE) + ld r0, 16(r1) /* Restore the saved link register. */ + ld r28, -32(r1) /* Restore callers save register r28. */ + ld r29, -24(r1) /* Restore callers save register r29. */ + ld r30, -16(r1) /* Restore callers save register r30. */ + ld r31, -8(r1) /* Restore callers save register r31. */ + mtlr r0 /* Branch to link register. */ + blr +END (STRSTR) +libc_hidden_builtin_def (strstr) diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/sub_n.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/sub_n.S new file mode 100644 index 0000000000..848dad5718 --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/sub_n.S @@ -0,0 +1,23 @@ +/* PowerPC64 mpn_lshift -- mpn_add_n/mpn_sub_n -- mpn addition and + subtraction. + Copyright (C) 2013-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#define USE_AS_SUB +#include "add_n.S" |
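
A closing note: sub_n.S above reuses add_n.S by defining USE_AS_SUB
before including it, so a single source provides both __mpn_add_n and
__mpn_sub_n.  Semantically the pair is the usual limb-wise add or
subtract with the carry (borrow) propagated across the whole array and
returned to the caller; a reference sketch in C (mpn_add_n_ref is a
hypothetical name, not the real mpn interface):

  #include <stdint.h>
  #include <stddef.h>

  /* Returns the carry out of the most significant limb (0 or 1).  */
  static uint64_t mpn_add_n_ref (uint64_t *rp, const uint64_t *up,
                                 const uint64_t *vp, size_t n)
  {
    uint64_t carry = 0;
    for (size_t i = 0; i < n; i++)
      {
        uint64_t u = up[i];
        uint64_t s = u + vp[i];          /* may wrap */
        uint64_t out = s + carry;        /* carry is 0 or 1 */
        carry = (uint64_t) ((s < u) | (out < s));
        rp[i] = out;
      }
    return carry;
  }

The subtraction variant returns the borrow instead, matching the
carry fixup selected by USE_AS_SUB.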