author     Zack Weinberg <zackw@panix.com>    2017-06-08 15:39:03 -0400
committer  Zack Weinberg <zackw@panix.com>    2017-06-08 15:39:03 -0400
commit     5046dbb4a7eba5eccfd258f92f4735c9ffc8d069 (patch)
tree       4470480d904b65cf14ca524f96f79eca818c3eaf /REORG.TODO/sysdeps/powerpc/powerpc32/power7
parent     199fc19d3aaaf57944ef036e15904febe877fc93 (diff)
Prepare for radical source tree reorganization. (branch: zack/build-layout-experiment)
All top-level files and directories are moved into a temporary storage
directory, REORG.TODO, except for files that will certainly still
exist in their current form at top level when we're done (COPYING,
COPYING.LIB, LICENSES, NEWS, README), all old ChangeLog files (which
are moved to the new directory OldChangeLogs, instead), and the
generated file INSTALL (which is just deleted; in the new order, there
will be no generated files checked into version control).
Diffstat (limited to 'REORG.TODO/sysdeps/powerpc/powerpc32/power7')
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc32/power7/Implies                |    2
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc32/power7/Makefile               |    4
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc32/power7/fpu/multiarch/Implies  |    1
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc32/power7/fpu/s_finite.S         |   93
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc32/power7/fpu/s_finitef.S        |    1
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc32/power7/fpu/s_isinf.S          |   85
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc32/power7/fpu/s_isinff.S         |    1
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc32/power7/fpu/s_isnan.S          |   90
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc32/power7/fpu/s_isnanf.S         |    1
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc32/power7/memchr.S               |  193
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc32/power7/memcmp.S               | 1375
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc32/power7/memcpy.S               |  538
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc32/power7/mempcpy.S              |  482
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc32/power7/memrchr.S              |  196
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc32/power7/memset.S               |  431
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc32/power7/multiarch/Implies      |    1
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc32/power7/rawmemchr.S            |  110
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc32/power7/strcasecmp.S           |  129
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc32/power7/strcasecmp_l.S         |    5
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc32/power7/strchr.S               |  225
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc32/power7/strchrnul.S            |  127
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc32/power7/strlen.S               |  102
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc32/power7/strncmp.S              |  199
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc32/power7/strnlen.S              |  176
24 files changed, 4567 insertions(+), 0 deletions(-)
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc32/power7/Implies b/REORG.TODO/sysdeps/powerpc/powerpc32/power7/Implies
new file mode 100644
index 0000000000..c0e1bea435
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc32/power7/Implies
@@ -0,0 +1,2 @@
+powerpc/powerpc32/power6/fpu
+powerpc/powerpc32/power6
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc32/power7/Makefile b/REORG.TODO/sysdeps/powerpc/powerpc32/power7/Makefile
new file mode 100644
index 0000000000..5e8f4a28ba
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc32/power7/Makefile
@@ -0,0 +1,4 @@
+ifeq ($(subdir),string)
+CFLAGS-strncase.c += -funroll-loops
+CFLAGS-strncase_l.c += -funroll-loops
+endif
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc32/power7/fpu/multiarch/Implies b/REORG.TODO/sysdeps/powerpc/powerpc32/power7/fpu/multiarch/Implies
new file mode 100644
index 0000000000..45cbaede9f
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc32/power7/fpu/multiarch/Implies
@@ -0,0 +1 @@
+powerpc/powerpc32/power6/fpu/multiarch
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc32/power7/fpu/s_finite.S b/REORG.TODO/sysdeps/powerpc/powerpc32/power7/fpu/s_finite.S
new file mode 100644
index 0000000000..da4efa0fb9
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc32/power7/fpu/s_finite.S
@@ -0,0 +1,93 @@
+/* finite().  PowerPC32/POWER7 version.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Luis Machado <luisgpm@br.ibm.com>.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <math_ldbl_opt.h>
+
+/* int __finite(x)  */
+	.section    .rodata.cst8,"aM",@progbits,8
+	.align 3
+.LC0:   /* 1.0 */
+	.quad	    0x3ff0000000000000
+
+	.section    ".text"
+	.type	    __finite, @function
+	.machine    power7
+ENTRY (__finite)
+#ifdef SHARED
+	mflr	r11
+	cfi_register(lr,r11)
+
+	SETUP_GOT_ACCESS(r9,got_label)
+	addis	r9,r9,.LC0-got_label@ha
+	lfd	fp0,.LC0-got_label@l(r9)
+
+	mtlr	r11
+	cfi_same_value (lr)
+#else
+	lis	r9,.LC0@ha
+	lfd	fp0,.LC0@l(r9)
+#endif
+	ftdiv	cr7,fp1,fp0
+	li	r3,1
+	bflr	30
+
+	/* We have -INF/+INF/NaN or a denormal.  */
+
+	stwu	r1,-16(r1)    /* Allocate stack space.  */
+	stfd    fp1,8(r1)     /* Transfer FP to GPRs.  */
+
+	ori	2,2,0	      /* Force a new dispatch group.  */
+	lhz	r0,8+HISHORT(r1) /* Fetch the upper 16 bits of the FP value
+				    (biased exponent and sign bit).  */
+	clrlwi	r0,r0,17      /* r0 = abs(r0).  */
+	addi	r1,r1,16      /* Reset the stack pointer.  */
+	cmpwi	cr7,r0,0x7ff0 /* r0 == 0x7ff0?  */
+	bltlr	cr7	      /* LT means we have a denormal.  */
+	li	r3,0
+	blr
+	END (__finite)
+
+hidden_def (__finite)
+weak_alias (__finite, finite)
+
+/* It turns out that the 'double' version will also always work for
+   single-precision.  */
+strong_alias (__finite, __finitef)
+hidden_def (__finitef)
+weak_alias (__finitef, finitef)
+
+#ifdef NO_LONG_DOUBLE
+strong_alias (__finite, __finitel)
+weak_alias (__finite, finitel)
+#endif
+
+#if IS_IN (libm)
+# if LONG_DOUBLE_COMPAT (libm, GLIBC_2_0)
+compat_symbol (libm, finite, finitel, GLIBC_2_0)
+# endif
+# if LONG_DOUBLE_COMPAT (libm, GLIBC_2_1)
+compat_symbol (libm, __finite, __finitel, GLIBC_2_1)
+# endif
+#else
+# if LONG_DOUBLE_COMPAT (libc, GLIBC_2_0)
+compat_symbol (libc, __finite, __finitel, GLIBC_2_0);
+compat_symbol (libc, finite, finitel, GLIBC_2_0);
+# endif
+#endif
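
The fast path in s_finite.S leans on the POWER7 ftdiv facility: testing fp1 against the constant 1.0 lets the bflr 30 return 1 immediately for ordinary numbers, and only the -INF/+INF/NaN/denormal cases fall through to the integer-side exponent test. A minimal C sketch of that slow-path test, assuming IEEE-754 binary64 layout (finite_sketch is an illustrative name, not a glibc function):

    #include <stdint.h>
    #include <string.h>

    /* Sketch of the slow path: take the upper 16 bits of the double
       (sign bit, 11-bit biased exponent, 4 mantissa bits), clear the
       sign as the clrlwi does, and compare against 0x7ff0.  Anything
       below 0x7ff0 is finite -- including denormals, which is why the
       assembly can return early on "less than".  */
    static int finite_sketch (double x)
    {
      uint64_t bits;
      memcpy (&bits, &x, sizeof bits);        /* like stfd + lhz via the stack */
      unsigned hi = (unsigned) (bits >> 48) & 0x7fff;
      return hi < 0x7ff0;                     /* like cmpwi cr7,r0,0x7ff0 */
    }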
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc32/power7/fpu/s_finitef.S b/REORG.TODO/sysdeps/powerpc/powerpc32/power7/fpu/s_finitef.S
new file mode 100644
index 0000000000..54bd94176d
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc32/power7/fpu/s_finitef.S
@@ -0,0 +1 @@
+/* This function uses the same code as s_finite.S.  */
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc32/power7/fpu/s_isinf.S b/REORG.TODO/sysdeps/powerpc/powerpc32/power7/fpu/s_isinf.S
new file mode 100644
index 0000000000..668815761a
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc32/power7/fpu/s_isinf.S
@@ -0,0 +1,85 @@
+/* isinf().  PowerPC32/POWER7 version.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Luis Machado <luisgpm@br.ibm.com>.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <math_ldbl_opt.h>
+
+/* int __isinf(x)  */
+	.section    .rodata.cst8,"aM",@progbits,8
+	.align 3
+.LC0:   /* 1.0 */
+	.quad	    0x3ff0000000000000
+
+	.section    ".text"
+	.type	    __isinf, @function
+	.machine    power7
+ENTRY (__isinf)
+#ifdef SHARED
+	mflr	r11
+	cfi_register(lr,r11)
+
+	SETUP_GOT_ACCESS(r9,got_label)
+	addis	r9,r9,.LC0-got_label@ha
+	lfd	fp0,.LC0-got_label@l(r9)
+
+	mtlr	r11
+	cfi_same_value (lr)
+#else
+	lis	r9,.LC0@ha
+	lfd	fp0,.LC0@l(r9)
+#endif
+	ftdiv	cr7,fp1,fp0
+	li	r3,0
+	bflr    29	      /* If not INF, return.  */
+
+	/* Either we have +INF or -INF.  */
+
+	stwu    r1,-16(r1)    /* Allocate stack space.  */
+	stfd    fp1,8(r1)     /* Transfer FP to GPRs.  */
+	ori	2,2,0	      /* Force a new dispatch group.  */
+	lhz	r4,8+HISHORT(r1) /* Fetch the upper 16 bits of the FP value
+				    (biased exponent and sign bit).  */
+	addi	r1,r1,16      /* Reset the stack pointer.  */
+	cmpwi	cr7,r4,0x7ff0 /* r4 == 0x7ff0?  */
+	li	r3,1
+	beqlr   cr7	      /* EQ means INF, otherwise -INF.  */
+	li      r3,-1
+	blr
+	END (__isinf)
+
+hidden_def (__isinf)
+weak_alias (__isinf, isinf)
+
+/* It turns out that the 'double' version will also always work for
+   single-precision.  */
+strong_alias (__isinf, __isinff)
+hidden_def (__isinff)
+weak_alias (__isinff, isinff)
+
+#ifdef NO_LONG_DOUBLE
+strong_alias (__isinf, __isinfl)
+weak_alias (__isinf, isinfl)
+#endif
+
+#if !IS_IN (libm)
+# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0)
+compat_symbol (libc, __isinf, __isinfl, GLIBC_2_0);
+compat_symbol (libc, isinf, isinfl, GLIBC_2_0);
+# endif
+#endif
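
s_isinf.S follows the same ftdiv pattern, with bit 29 selecting the infinite case; once an infinity is established, the upper halfword alone picks the return value (1 for +INF, -1 for -INF). A self-contained C sketch of the same result convention, assuming binary64 layout (isinf_sketch is illustrative; standing alone it must also verify the mantissa, since it cannot rely on ftdiv having excluded NaNs and finite values):

    #include <stdint.h>
    #include <string.h>

    /* +Inf -> 1, -Inf -> -1, everything else -> 0.  Infinity is the
       unique pattern with an all-ones exponent and a zero mantissa.  */
    static int isinf_sketch (double x)
    {
      uint64_t bits;
      memcpy (&bits, &x, sizeof bits);
      if ((bits & UINT64_C (0x7fffffffffffffff))
          != UINT64_C (0x7ff0000000000000))
        return 0;                      /* finite, denormal, or NaN */
      return (bits >> 63) ? -1 : 1;    /* sign bit picks -INF vs +INF */
    }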
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc32/power7/fpu/s_isinff.S b/REORG.TODO/sysdeps/powerpc/powerpc32/power7/fpu/s_isinff.S
new file mode 100644
index 0000000000..be759e091e
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc32/power7/fpu/s_isinff.S
@@ -0,0 +1 @@
+/* This function uses the same code as s_isinf.S.  */
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc32/power7/fpu/s_isnan.S b/REORG.TODO/sysdeps/powerpc/powerpc32/power7/fpu/s_isnan.S
new file mode 100644
index 0000000000..433137f1c4
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc32/power7/fpu/s_isnan.S
@@ -0,0 +1,90 @@
+/* isnan().  PowerPC32/POWER7 version.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Luis Machado <luisgpm@br.ibm.com>.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <math_ldbl_opt.h>
+
+/* int __isnan(x)  */
+	.section    .rodata.cst8,"aM",@progbits,8
+	.align 3
+.LC0:   /* 1.0 */
+	.quad	    0x3ff0000000000000
+
+	.section    ".text"
+	.type	    __isnan, @function
+	.machine    power7
+ENTRY (__isnan)
+#ifdef SHARED
+	mflr	r11
+	cfi_register(lr,r11)
+
+	SETUP_GOT_ACCESS(r9,got_label)
+	addis	r9,r9,.LC0-got_label@ha
+	lfd	fp0,.LC0-got_label@l(r9)
+
+	mtlr	r11
+	cfi_same_value (lr)
+#else
+	lis	r9,.LC0@ha
+	lfd	fp0,.LC0@l(r9)
+#endif
+	ftdiv	cr7,fp1,fp0
+	li	r3,0
+	bflr	30	      /* If not NaN or Inf, finish. */
+
+	/* We have -INF/+INF/NaN or a denormal.  */
+
+	stwu	r1,-16(r1)    /* Allocate stack space.  */
+	stfd	fp1,8(r1)     /* Transfer FP to GPRs.  */
+	ori	2,2,0	      /* Force a new dispatch group.  */
+	lwz     r4,8+HIWORD(r1) /* Load the upper half of the FP value.  */
+	lwz     r5,8+LOWORD(r1) /* Load the lower half of the FP value.  */
+	addi	r1,r1,16      /* Reset the stack pointer.  */
+	lis     r0,0x7ff0     /* Load the upper portion for an INF/NaN.  */
+	clrlwi  r4,r4,1	      /* r4 = abs(r4).  */
+	cmpw    cr7,r4,r0     /* if (abs(r4) <= inf).  */
+	cmpwi   cr6,r5,0      /* r5 == 0x00000000?  */
+	bltlr	cr7	      /* LT means we have a denormal.  */
+	bgt	cr7,L(NaN)    /* GT means we have a NaN.  */
+	beqlr	cr6	      /* EQ means we have +/-INF.  */
+L(NaN):
+	li      r3,1	      /* x == NaN?  */
+	blr
+	END (__isnan)
+
+hidden_def (__isnan)
+weak_alias (__isnan, isnan)
+
+/* It turns out that the 'double' version will also always work for
+   single-precision.  */
+strong_alias (__isnan, __isnanf)
+hidden_def (__isnanf)
+weak_alias (__isnanf, isnanf)
+
+#ifdef NO_LONG_DOUBLE
+strong_alias (__isnan, __isnanl)
+weak_alias (__isnan, isnanl)
+#endif
+
+#if !IS_IN (libm)
+# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0)
+compat_symbol (libc, __isnan, __isnanl, GLIBC_2_0);
+compat_symbol (libc, isnan, isnanl, GLIBC_2_0);
+# endif
+#endif
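
The slow path of s_isnan.S has to separate NaN from infinity, so it examines both 32-bit halves of the value: a sign-cleared upper word strictly above 0x7ff00000 is always NaN, strictly below is finite or denormal, and exactly equal defers to the lower word. A hedged C rendering of that decision tree (isnan_sketch is an illustrative name):

    #include <stdint.h>
    #include <string.h>

    /* NaN iff the exponent field is all ones and the mantissa is
       nonzero; the lower word matters only in the boundary case.  */
    static int isnan_sketch (double x)
    {
      uint64_t bits;
      memcpy (&bits, &x, sizeof bits);
      uint32_t hi = (uint32_t) (bits >> 32) & 0x7fffffff; /* clrlwi: abs */
      uint32_t lo = (uint32_t) bits;
      if (hi < 0x7ff00000) return 0;   /* finite or denormal */
      if (hi > 0x7ff00000) return 1;   /* NaN flagged in the upper word */
      return lo != 0;                  /* 0x7ff00000: Inf unless lo != 0 */
    }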
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc32/power7/fpu/s_isnanf.S b/REORG.TODO/sysdeps/powerpc/powerpc32/power7/fpu/s_isnanf.S
new file mode 100644
index 0000000000..b48c85e0d3
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc32/power7/fpu/s_isnanf.S
@@ -0,0 +1 @@
+/* This function uses the same code as s_isnan.S.  */
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc32/power7/memchr.S b/REORG.TODO/sysdeps/powerpc/powerpc32/power7/memchr.S
new file mode 100644
index 0000000000..9ce8507a82
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc32/power7/memchr.S
@@ -0,0 +1,193 @@
+/* Optimized memchr implementation for PowerPC32/POWER7 using cmpb insn.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Luis Machado <luisgpm@br.ibm.com>.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* void * [r3] memchr (const void *s [r3], int byte [r4], size_t size [r5])  */
+	.machine  power7
+ENTRY (__memchr)
+	CALL_MCOUNT
+	dcbt	0,r3
+	clrrwi  r8,r3,2
+	insrwi	r4,r4,8,16    /* Replicate byte to word.  */
+
+	/* Calculate the last acceptable address and check for possible
+	   addition overflow by using saturated math:
+	   r7 = r3 + r5
+	   r7 |= -(r7 < r3)  */
+	add     r7,r3,r5
+	subfc   r6,r3,r7
+	subfe   r9,r9,r9
+	or      r7,r7,r9
+
+	insrwi	r4,r4,16,0
+	cmplwi	r5,16
+	li	r9, -1
+	rlwinm	r6,r3,3,27,28 /* Calculate padding.  */
+	addi	r7,r7,-1
+#ifdef __LITTLE_ENDIAN__
+	slw	r9,r9,r6
+#else
+	srw	r9,r9,r6
+#endif
+	ble	L(small_range)
+
+	lwz	r12,0(r8)     /* Load word from memory.  */
+	cmpb	r3,r12,r4     /* Check for BYTEs in WORD1.  */
+	and	r3,r3,r9
+	clrlwi	r5,r7,30      /* Byte count - 1 in last word.  */
+	clrrwi	r7,r7,2       /* Address of last word.  */
+	cmplwi	cr7,r3,0      /* If r3 == 0, no BYTEs have been found.  */
+	bne	cr7,L(done)
+
+	mtcrf   0x01,r8
+	/* Are we now aligned to a doubleword boundary?  If so, skip to
+	   the main loop.  Otherwise, go through the alignment code.  */
+	bt	29,L(loop_setup)
+
+	/* Handle WORD2 of pair.  */
+	lwzu	r12,4(r8)
+	cmpb	r3,r12,r4
+	cmplwi	cr7,r3,0
+	bne	cr7,L(done)
+
+L(loop_setup):
+	/* The last word we want to read in the loop below is the one
+	   containing the last byte of the string, i.e. the word at
+	   (s + size - 1) & ~3, or r7.  The first word read is at
+	   r8 + 4, we read 2 * cnt words, so the last word read will
+	   be at r8 + 4 + 8 * cnt - 4.  Solving for cnt gives
+	   cnt = (r7 - r8) / 8  */
+	sub	r6,r7,r8
+	srwi	r6,r6,3	      /* Number of loop iterations.  */
+	mtctr	r6            /* Setup the counter.  */
+
+	/* Main loop to look for BYTE in the string.  Since
+	   it's a small loop (8 instructions), align it to 32-bytes.  */
+	.align	5
+L(loop):
+	/* Load two words, compare and merge in a
+	   single register for speed.  This is an attempt
+	   to speed up the byte-checking process for bigger strings.  */
+	lwz	r12,4(r8)
+	lwzu	r11,8(r8)
+	cmpb	r3,r12,r4
+	cmpb	r9,r11,r4
+	or	r6,r9,r3      /* Merge everything in one word.  */
+	cmplwi	cr7,r6,0
+	bne	cr7,L(found)
+	bdnz	L(loop)
+
+	/* We may have one more word to read.  */
+	cmplw	r8,r7
+	beqlr
+
+	lwzu	r12,4(r8)
+	cmpb	r3,r12,r4
+	cmplwi	cr6,r3,0
+	bne	cr6,L(done)
+	blr
+
+	.align	4
+L(found):
+	/* OK, one (or both) of the words contains BYTE.  Check
+	   the first word and decrement the address in case the first
+	   word really contains BYTE.  */
+	cmplwi	cr6,r3,0
+	addi	r8,r8,-4
+	bne	cr6,L(done)
+
+	/* BYTE must be in the second word.  Adjust the address
+	   again and move the result of cmpb to r3 so we can calculate the
+	   pointer.  */
+
+	mr	r3,r9
+	addi	r8,r8,4
+
+	/* r3 has the output of the cmpb instruction, that is, it contains
+	   0xff in the same position as BYTE in the original
+	   word from the string.  Use that to calculate the pointer.
+	   We need to make sure BYTE is *before* the end of the range.  */
+L(done):
+#ifdef __LITTLE_ENDIAN__
+	addi    r0,r3,-1
+	andc    r0,r0,r3
+	popcntw	r0,r0	      /* Count trailing zeros.  */
+#else
+	cntlzw	r0,r3	      /* Count leading zeros before the match.  */
+#endif
+	cmplw	r8,r7         /* Are we on the last word?  */
+	srwi	r0,r0,3	      /* Convert leading/trailing zeros to bytes.  */
+	add	r3,r8,r0
+	cmplw	cr7,r0,r5     /* If on the last word, check byte offset.  */
+	bnelr
+	blelr	cr7
+	li	r3,0
+	blr
+
+	.align	4
+L(null):
+	li	r3,0
+	blr
+
+/* Deals with size <= 16.  */
+	.align	4
+L(small_range):
+	cmplwi	r5,0
+	beq	L(null)
+	lwz	r12,0(r8)     /* Load word from memory.  */
+	cmpb	r3,r12,r4     /* Check for BYTE in WORD1.  */
+	and	r3,r3,r9
+	cmplwi	cr7,r3,0
+	clrlwi	r5,r7,30      /* Byte count - 1 in last word.  */
+	clrrwi	r7,r7,2       /* Address of last word.  */
+	cmplw	r8,r7         /* Are we done already?  */
+	bne	cr7,L(done)
+	beqlr
+
+	lwzu	r12,4(r8)
+	cmpb	r3,r12,r4
+	cmplwi	cr6,r3,0
+	cmplw	r8,r7
+	bne	cr6,L(done)
+	beqlr
+
+	lwzu	r12,4(r8)
+	cmpb	r3,r12,r4
+	cmplwi	cr6,r3,0
+	cmplw	r8,r7
+	bne	cr6,L(done)
+	beqlr
+
+	lwzu	r12,4(r8)
+	cmpb	r3,r12,r4
+	cmplwi	cr6,r3,0
+	cmplw	r8,r7
+	bne	cr6,L(done)
+	beqlr
+
+	lwzu	r12,4(r8)
+	cmpb	r3,r12,r4
+	cmplwi	cr6,r3,0
+	bne	cr6,L(done)
+	blr
+
+END (__memchr)
+weak_alias (__memchr, memchr)
+libc_hidden_builtin_def (memchr)
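
Two idioms carry this memchr: the search byte is replicated into all four byte lanes of a word (the insrwi pair), every loaded word is matched against it with cmpb, and the last acceptable address is computed with a saturating add so that s + size cannot wrap around the address space. C sketches of the cmpb semantics and the saturation trick, with the caveat that these helpers are illustrative, not glibc code:

    #include <stdint.h>

    /* Emulates POWER7 cmpb: each byte lane of the result is 0xff where
       the corresponding bytes of a and b match, 0x00 otherwise.  */
    static uint32_t cmpb_sketch (uint32_t a, uint32_t b)
    {
      uint32_t r = 0;
      for (int i = 0; i < 4; i++)
        if (((a >> (8 * i)) & 0xff) == ((b >> (8 * i)) & 0xff))
          r |= (uint32_t) 0xff << (8 * i);
      return r;
    }

    /* Mirrors the subfc/subfe/or sequence: on wrap-around the end
       pointer saturates to ~0 instead of landing below s.  */
    static uintptr_t sat_end (uintptr_t s, uintptr_t n)
    {
      uintptr_t end = s + n;
      return end | -(uintptr_t) (end < s);   /* r7 |= -(r7 < r3) */
    }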
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc32/power7/memcmp.S b/REORG.TODO/sysdeps/powerpc/powerpc32/power7/memcmp.S
new file mode 100644
index 0000000000..09c9b9bf4d
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc32/power7/memcmp.S
@@ -0,0 +1,1375 @@
+/* Optimized memcmp implementation for POWER7/PowerPC32.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* int [r3] memcmp (const char *s1 [r3],
+		    const char *s2 [r4],
+		    size_t size [r5])  */
+
+	.machine power7
+EALIGN (memcmp, 4, 0)
+	CALL_MCOUNT
+
+#define rRTN	r3
+#define rSTR1	r3	/* first string arg */
+#define rSTR2	r4	/* second string arg */
+#define rN	r5	/* max string length */
+#define rWORD1	r6	/* current word in s1 */
+#define rWORD2	r7	/* current word in s2 */
+#define rWORD3	r8	/* next word in s1 */
+#define rWORD4	r9	/* next word in s2 */
+#define rWORD5	r10	/* next word in s1 */
+#define rWORD6	r11	/* next word in s2 */
+#define rWORD7	r30	/* next word in s1 */
+#define rWORD8	r31	/* next word in s2 */
+
+	xor	r0, rSTR2, rSTR1
+	cmplwi	cr6, rN, 0
+	cmplwi	cr1, rN, 12
+	clrlwi.	r0, r0, 30
+	clrlwi	r12, rSTR1, 30
+	cmplwi	cr5, r12, 0
+	beq-	cr6, L(zeroLength)
+	dcbt	0, rSTR1
+	dcbt	0, rSTR2
+/* If less than 8 bytes or not aligned, use the unaligned
+   byte loop.  */
+	blt	cr1, L(bytealigned)
+	stwu	1, -64(r1)
+	cfi_adjust_cfa_offset(64)
+	stw	rWORD8, 48(r1)
+	stw	rWORD7, 44(r1)
+	cfi_offset(rWORD8, (48-64))
+	cfi_offset(rWORD7, (44-64))
+	bne	L(unaligned)
+/* At this point we know both strings have the same alignment and the
+   compare length is at least 8 bytes.  r12 contains the low order
+   2 bits of rSTR1 and cr5 contains the result of the logical compare
+   of r12 to 0.  If r12 == 0 then we are already word
+   aligned and can perform the word aligned loop.
+
+   Otherwise we know the two strings have the same alignment (but not
+   yet word aligned).  So we force the string addresses to the next lower
+   word boundary and special case this first word using shift left to
+   eliminate bits preceding the first byte.  Since we want to join the
+   normal (word aligned) compare loop, starting at the second word,
+   we need to adjust the length (rN) and special case the loop
+   versioning for the first word. This ensures that the loop count is
+   correct and the first word (shifted) is in the expected register pair. */
+	.align	4
+L(samealignment):
+	clrrwi	rSTR1, rSTR1, 2
+	clrrwi	rSTR2, rSTR2, 2
+	beq	cr5, L(Waligned)
+	add	rN, rN, r12
+	slwi	rWORD6, r12, 3
+	srwi	r0, rN, 4	/* Divide by 16 */
+	andi.	r12, rN, 12	/* Get the word remainder */
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD1, 0(rSTR1)
+	lwz	rWORD2, 0(rSTR2)
+#endif
+	cmplwi	cr1, r12, 8
+	cmplwi	cr7, rN, 16
+	clrlwi	rN, rN, 30
+	beq	L(dPs4)
+	mtctr	r0
+	bgt	cr1, L(dPs3)
+	beq	cr1, L(dPs2)
+
+/* Remainder is 4 */
+	.align	3
+L(dsP1):
+	slw	rWORD5, rWORD1, rWORD6
+	slw	rWORD6, rWORD2, rWORD6
+	cmplw	cr5, rWORD5, rWORD6
+	blt	cr7, L(dP1x)
+/* Do something useful in this cycle since we have to branch anyway.  */
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD1, 4(rSTR1)
+	lwz	rWORD2, 4(rSTR2)
+#endif
+	cmplw	cr7, rWORD1, rWORD2
+	b	L(dP1e)
+/* Remainder is 8 */
+	.align	4
+L(dPs2):
+	slw	rWORD5, rWORD1, rWORD6
+	slw	rWORD6, rWORD2, rWORD6
+	cmplw	cr6, rWORD5, rWORD6
+	blt	cr7, L(dP2x)
+/* Do something useful in this cycle since we have to branch anyway.  */
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD7, 0, rSTR1
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD7, 4(rSTR1)
+	lwz	rWORD8, 4(rSTR2)
+#endif
+	cmplw	cr5, rWORD7, rWORD8
+	b	L(dP2e)
+/* Remainder is 12 */
+	.align	4
+L(dPs3):
+	slw	rWORD3, rWORD1, rWORD6
+	slw	rWORD4, rWORD2, rWORD6
+	cmplw	cr1, rWORD3, rWORD4
+	b	L(dP3e)
+/* Count is a multiple of 16, remainder is 0 */
+	.align	4
+L(dPs4):
+	mtctr	r0
+	slw	rWORD1, rWORD1, rWORD6
+	slw	rWORD2, rWORD2, rWORD6
+	cmplw	cr7, rWORD1, rWORD2
+	b	L(dP4e)
+
+/* At this point we know both strings are word aligned and the
+   compare length is at least 8 bytes.  */
+	.align	4
+L(Waligned):
+	andi.	r12, rN, 12	/* Get the word remainder */
+	srwi	r0, rN, 4	/* Divide by 16 */
+	cmplwi	cr1, r12, 8
+	cmplwi	cr7, rN, 16
+	clrlwi	rN, rN, 30
+	beq	L(dP4)
+	bgt	cr1, L(dP3)
+	beq	cr1, L(dP2)
+
+/* Remainder is 4 */
+	.align	4
+L(dP1):
+	mtctr	r0
+/* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
+   (8-15 byte compare), we want to use only volatile registers.  This
+   means we can avoid restoring non-volatile registers since we did not
+   change any on the early exit path.  The key here is the non-early
+   exit path only cares about the condition code (cr5), not about which
+   register pair was used.  */
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD5, 0, rSTR1
+	lwbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD5, 0(rSTR1)
+	lwz	rWORD6, 0(rSTR2)
+#endif
+	cmplw	cr5, rWORD5, rWORD6
+	blt	cr7, L(dP1x)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD1, 4(rSTR1)
+	lwz	rWORD2, 4(rSTR2)
+#endif
+	cmplw	cr7, rWORD1, rWORD2
+L(dP1e):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD3, 0, rSTR1
+	lwbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD3, 8(rSTR1)
+	lwz	rWORD4, 8(rSTR2)
+#endif
+	cmplw	cr1, rWORD3, rWORD4
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD5, 0, rSTR1
+	lwbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD5, 12(rSTR1)
+	lwz	rWORD6, 12(rSTR2)
+#endif
+	cmplw	cr6, rWORD5, rWORD6
+	bne	cr5, L(dLcr5x)
+	bne	cr7, L(dLcr7x)
+
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD7, 0, rSTR1
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwzu	rWORD7, 16(rSTR1)
+	lwzu	rWORD8, 16(rSTR2)
+#endif
+	bne	cr1, L(dLcr1)
+	cmplw	cr5, rWORD7, rWORD8
+	bdnz	L(dLoop)
+	bne	cr6, L(dLcr6)
+	lwz	rWORD7, 44(r1)
+	lwz	rWORD8, 48(r1)
+	.align	3
+L(dP1x):
+	slwi.	r12, rN, 3
+	bne	cr5, L(dLcr5x)
+	subfic	rN, r12, 32	/* Shift count is 32 - (rN * 8).  */
+	addi	r1, r1, 64
+	cfi_adjust_cfa_offset(-64)
+	bne	L(d00)
+	li	rRTN, 0
+	blr
+
+/* Remainder is 8 */
+	.align	4
+	cfi_adjust_cfa_offset(64)
+L(dP2):
+	mtctr	r0
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD5, 0, rSTR1
+	lwbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD5, 0(rSTR1)
+	lwz	rWORD6, 0(rSTR2)
+#endif
+	cmplw	cr6, rWORD5, rWORD6
+	blt	cr7, L(dP2x)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD7, 0, rSTR1
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD7, 4(rSTR1)
+	lwz	rWORD8, 4(rSTR2)
+#endif
+	cmplw	cr5, rWORD7, rWORD8
+L(dP2e):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD1, 8(rSTR1)
+	lwz	rWORD2, 8(rSTR2)
+#endif
+	cmplw	cr7, rWORD1, rWORD2
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD3, 0, rSTR1
+	lwbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD3, 12(rSTR1)
+	lwz	rWORD4, 12(rSTR2)
+#endif
+	cmplw	cr1, rWORD3, rWORD4
+#ifndef __LITTLE_ENDIAN__
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#endif
+	bne	cr6, L(dLcr6)
+	bne	cr5, L(dLcr5)
+	b	L(dLoop2)
+/* Again we are on an early exit path (16-23 byte compare); we want to
+   use only volatile registers and avoid restoring non-volatile
+   registers.  */
+	.align	4
+L(dP2x):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD3, 0, rSTR1
+	lwbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD3, 4(rSTR1)
+	lwz	rWORD4, 4(rSTR2)
+#endif
+	cmplw	cr1, rWORD3, rWORD4
+	slwi.	r12, rN, 3
+	bne	cr6, L(dLcr6x)
+#ifndef __LITTLE_ENDIAN__
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#endif
+	bne	cr1, L(dLcr1x)
+	subfic	rN, r12, 32	/* Shift count is 32 - (rN * 8).  */
+	addi	r1, r1, 64
+	cfi_adjust_cfa_offset(-64)
+	bne	L(d00)
+	li	rRTN, 0
+	blr
+
+/* Remainder is 12 */
+	.align	4
+	cfi_adjust_cfa_offset(64)
+L(dP3):
+	mtctr	r0
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD3, 0, rSTR1
+	lwbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD3, 0(rSTR1)
+	lwz	rWORD4, 0(rSTR2)
+#endif
+	cmplw	cr1, rWORD3, rWORD4
+L(dP3e):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD5, 0, rSTR1
+	lwbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD5, 4(rSTR1)
+	lwz	rWORD6, 4(rSTR2)
+#endif
+	cmplw	cr6, rWORD5, rWORD6
+	blt	cr7, L(dP3x)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD7, 0, rSTR1
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD7, 8(rSTR1)
+	lwz	rWORD8, 8(rSTR2)
+#endif
+	cmplw	cr5, rWORD7, rWORD8
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD1, 12(rSTR1)
+	lwz	rWORD2, 12(rSTR2)
+#endif
+	cmplw	cr7, rWORD1, rWORD2
+#ifndef __LITTLE_ENDIAN__
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#endif
+	bne	cr1, L(dLcr1)
+	bne	cr6, L(dLcr6)
+	b	L(dLoop1)
+/* Again we are on an early exit path (24-31 byte compare); we want to
+   use only volatile registers and avoid restoring non-volatile
+   registers.  */
+	.align	4
+L(dP3x):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD1, 8(rSTR1)
+	lwz	rWORD2, 8(rSTR2)
+#endif
+	cmplw	cr7, rWORD1, rWORD2
+	slwi.	r12, rN, 3
+	bne	cr1, L(dLcr1x)
+#ifndef __LITTLE_ENDIAN__
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#endif
+	bne	cr6, L(dLcr6x)
+	subfic	rN, r12, 32	/* Shift count is 32 - (rN * 8).  */
+	bne	cr7, L(dLcr7x)
+	addi	r1, r1, 64
+	cfi_adjust_cfa_offset(-64)
+	bne	L(d00)
+	li	rRTN, 0
+	blr
+
+/* Count is a multiple of 16, remainder is 0 */
+	.align	4
+	cfi_adjust_cfa_offset(64)
+L(dP4):
+	mtctr	r0
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD1, 0(rSTR1)
+	lwz	rWORD2, 0(rSTR2)
+#endif
+	cmplw	cr7, rWORD1, rWORD2
+L(dP4e):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD3, 0, rSTR1
+	lwbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD3, 4(rSTR1)
+	lwz	rWORD4, 4(rSTR2)
+#endif
+	cmplw	cr1, rWORD3, rWORD4
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD5, 0, rSTR1
+	lwbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD5, 8(rSTR1)
+	lwz	rWORD6, 8(rSTR2)
+#endif
+	cmplw	cr6, rWORD5, rWORD6
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD7, 0, rSTR1
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwzu	rWORD7, 12(rSTR1)
+	lwzu	rWORD8, 12(rSTR2)
+#endif
+	cmplw	cr5, rWORD7, rWORD8
+	bne	cr7, L(dLcr7)
+	bne	cr1, L(dLcr1)
+	bdz-	L(d24)		/* Adjust CTR as we start with +4 */
+/* This is the primary loop */
+	.align	4
+L(dLoop):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD1, 4(rSTR1)
+	lwz	rWORD2, 4(rSTR2)
+#endif
+	cmplw	cr1, rWORD3, rWORD4
+	bne	cr6, L(dLcr6)
+L(dLoop1):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD3, 0, rSTR1
+	lwbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD3, 8(rSTR1)
+	lwz	rWORD4, 8(rSTR2)
+#endif
+	cmplw	cr6, rWORD5, rWORD6
+	bne	cr5, L(dLcr5)
+L(dLoop2):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD5, 0, rSTR1
+	lwbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD5, 12(rSTR1)
+	lwz	rWORD6, 12(rSTR2)
+#endif
+	cmplw	cr5, rWORD7, rWORD8
+	bne	cr7, L(dLcr7)
+L(dLoop3):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD7, 0, rSTR1
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwzu	rWORD7, 16(rSTR1)
+	lwzu	rWORD8, 16(rSTR2)
+#endif
+	bne	cr1, L(dLcr1)
+	cmplw	cr7, rWORD1, rWORD2
+	bdnz	L(dLoop)
+
+L(dL4):
+	cmplw	cr1, rWORD3, rWORD4
+	bne	cr6, L(dLcr6)
+	cmplw	cr6, rWORD5, rWORD6
+	bne	cr5, L(dLcr5)
+	cmplw	cr5, rWORD7, rWORD8
+L(d44):
+	bne	cr7, L(dLcr7)
+L(d34):
+	bne	cr1, L(dLcr1)
+L(d24):
+	bne	cr6, L(dLcr6)
+L(d14):
+	slwi.	r12, rN, 3
+	bne	cr5, L(dLcr5)
+L(d04):
+	lwz	rWORD7, 44(r1)
+	lwz	rWORD8, 48(r1)
+	addi	r1, r1, 64
+	cfi_adjust_cfa_offset(-64)
+	subfic	rN, r12, 32	/* Shift count is 32 - (rN * 8).  */
+	beq	L(zeroLength)
+/* At this point we have a remainder of 1 to 3 bytes to compare.  Since
+   we are aligned it is safe to load the whole word, and use
+   shift right to eliminate bits beyond the compare length.  */
+L(d00):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD1, 4(rSTR1)
+	lwz	rWORD2, 4(rSTR2)
+#endif
+	srw	rWORD1, rWORD1, rN
+	srw	rWORD2, rWORD2, rN
+	sub	rRTN, rWORD1, rWORD2
+	blr
+
+	.align	4
+	cfi_adjust_cfa_offset(64)
+L(dLcr7):
+	lwz	rWORD7, 44(r1)
+	lwz	rWORD8, 48(r1)
+L(dLcr7x):
+	li	rRTN, 1
+	addi	r1, r1, 64
+	cfi_adjust_cfa_offset(-64)
+	bgtlr	cr7
+	li	rRTN, -1
+	blr
+	.align	4
+	cfi_adjust_cfa_offset(64)
+L(dLcr1):
+	lwz	rWORD7, 44(r1)
+	lwz	rWORD8, 48(r1)
+L(dLcr1x):
+	li	rRTN, 1
+	addi	r1, r1, 64
+	cfi_adjust_cfa_offset(-64)
+	bgtlr	cr1
+	li	rRTN, -1
+	blr
+	.align	4
+	cfi_adjust_cfa_offset(64)
+L(dLcr6):
+	lwz	rWORD7, 44(r1)
+	lwz	rWORD8, 48(r1)
+L(dLcr6x):
+	li	rRTN, 1
+	addi	r1, r1, 64
+	cfi_adjust_cfa_offset(-64)
+	bgtlr	cr6
+	li	rRTN, -1
+	blr
+	.align	4
+	cfi_adjust_cfa_offset(64)
+L(dLcr5):
+	lwz	rWORD7, 44(r1)
+	lwz	rWORD8, 48(r1)
+L(dLcr5x):
+	li	rRTN, 1
+	addi	r1, r1, 64
+	cfi_adjust_cfa_offset(-64)
+	bgtlr	cr5
+	li	rRTN, -1
+	blr
+
+	.align	4
+L(bytealigned):
+	mtctr	rN
+
+/* We need to prime this loop.  This loop is swing modulo scheduled
+   to avoid pipe delays.  The dependent instruction latencies (load to
+   compare to conditional branch) are 2 to 3 cycles.  In this loop each
+   dispatch group ends in a branch and takes 1 cycle.  Effectively
+   the first iteration of the loop only serves to load operands, and
+   branches based on compares are delayed until the next iteration.
+
+   So we must precondition some registers and condition codes so that
+   we don't exit the loop early on the first iteration.  */
+
+	lbz	rWORD1, 0(rSTR1)
+	lbz	rWORD2, 0(rSTR2)
+	bdz	L(b11)
+	cmplw	cr7, rWORD1, rWORD2
+	lbz	rWORD3, 1(rSTR1)
+	lbz	rWORD4, 1(rSTR2)
+	bdz	L(b12)
+	cmplw	cr1, rWORD3, rWORD4
+	lbzu	rWORD5, 2(rSTR1)
+	lbzu	rWORD6, 2(rSTR2)
+	bdz	L(b13)
+	.align	4
+L(bLoop):
+	lbzu	rWORD1, 1(rSTR1)
+	lbzu	rWORD2, 1(rSTR2)
+	bne	cr7, L(bLcr7)
+
+	cmplw	cr6, rWORD5, rWORD6
+	bdz	L(b3i)
+
+	lbzu	rWORD3, 1(rSTR1)
+	lbzu	rWORD4, 1(rSTR2)
+	bne	cr1, L(bLcr1)
+
+	cmplw	cr7, rWORD1, rWORD2
+	bdz	L(b2i)
+
+	lbzu	rWORD5, 1(rSTR1)
+	lbzu	rWORD6, 1(rSTR2)
+	bne	cr6, L(bLcr6)
+
+	cmplw	cr1, rWORD3, rWORD4
+	bdnz	L(bLoop)
+
+/* We are speculatively loading bytes before we have tested the previous
+   bytes.  But we must avoid overrunning the length (in the ctr) to
+   prevent these speculative loads from causing a segfault.  In that
+   case the loop will exit early (before all pending bytes have been
+   tested), and we must complete the pending operations before
+   returning.  */
+L(b1i):
+	bne	cr7, L(bLcr7)
+	bne	cr1, L(bLcr1)
+	b	L(bx56)
+	.align	4
+L(b2i):
+	bne	cr6, L(bLcr6)
+	bne	cr7, L(bLcr7)
+	b	L(bx34)
+	.align	4
+L(b3i):
+	bne	cr1, L(bLcr1)
+	bne	cr6, L(bLcr6)
+	b	L(bx12)
+	.align	4
+L(bLcr7):
+	li	rRTN, 1
+	bgtlr	cr7
+	li	rRTN, -1
+	blr
+L(bLcr1):
+	li	rRTN, 1
+	bgtlr	cr1
+	li	rRTN, -1
+	blr
+L(bLcr6):
+	li	rRTN, 1
+	bgtlr	cr6
+	li	rRTN, -1
+	blr
+
+L(b13):
+	bne	cr7, L(bx12)
+	bne	cr1, L(bx34)
+L(bx56):
+	sub	rRTN, rWORD5, rWORD6
+	blr
+	nop
+L(b12):
+	bne	cr7, L(bx12)
+L(bx34):
+	sub	rRTN, rWORD3, rWORD4
+	blr
+L(b11):
+L(bx12):
+	sub	rRTN, rWORD1, rWORD2
+	blr
+	.align	4
+L(zeroLength):
+	li	rRTN, 0
+	blr
+
+	.align	4
+/* At this point we know the strings have different alignment and the
+   compare length is at least 8 bytes.  r12 contains the low order
+   2 bits of rSTR1 and cr5 contains the result of the logical compare
+   of r12 to 0.  If r12 == 0 then rSTR1 is word aligned and we can
+   perform the Wunaligned loop.
+
+   Otherwise we know that rSTR1 is not word aligned yet.
+   So we can force the string addresses to the next lower word
+   boundary and special case this first word using shift left to
+   eliminate bits preceding the first byte.  Since we want to join the
+   normal (W unaligned) compare loop, starting at the second word,
+   we need to adjust the length (rN) and special case the loop
+   versioning for the first W.  This ensures that the loop count is
+   correct and the first W (shifted) is in the expected register pair.  */
+#define rSHL		r29	/* Unaligned shift left count.  */
+#define rSHR		r28	/* Unaligned shift right count.  */
+#define rWORD8_SHIFT	r27	/* Left rotation temp for rWORD2.  */
+#define rWORD2_SHIFT	r26	/* Left rotation temp for rWORD4.  */
+#define rWORD4_SHIFT	r25	/* Left rotation temp for rWORD6.  */
+#define rWORD6_SHIFT	r24	/* Left rotation temp for rWORD8.  */
+	cfi_adjust_cfa_offset(64)
+L(unaligned):
+	stw	rSHL, 40(r1)
+	cfi_offset(rSHL, (40-64))
+	clrlwi	rSHL, rSTR2, 30
+	stw	rSHR, 36(r1)
+	cfi_offset(rSHR, (36-64))
+	beq	cr5, L(Wunaligned)
+	stw	rWORD8_SHIFT, 32(r1)
+	cfi_offset(rWORD8_SHIFT, (32-64))
+/* Adjust the logical start of rSTR2 to compensate for the extra bits
+   in the 1st rSTR1 W.  */
+	sub	rWORD8_SHIFT, rSTR2, r12
+/* But do not attempt to address the W before that W that contains
+   the actual start of rSTR2.  */
+	clrrwi	rSTR2, rSTR2, 2
+	stw	rWORD2_SHIFT, 28(r1)
+/* Compute the left/right shift counts for the unaligned rSTR2,
+   compensating for the logical (W aligned) start of rSTR1.  */
+	clrlwi	rSHL, rWORD8_SHIFT, 30
+	clrrwi	rSTR1, rSTR1, 2
+	stw	rWORD4_SHIFT, 24(r1)
+	slwi	rSHL, rSHL, 3
+	cmplw	cr5, rWORD8_SHIFT, rSTR2
+	add	rN, rN, r12
+	slwi	rWORD6, r12, 3
+	stw	rWORD6_SHIFT, 20(r1)
+	cfi_offset(rWORD2_SHIFT, (28-64))
+	cfi_offset(rWORD4_SHIFT, (24-64))
+	cfi_offset(rWORD6_SHIFT, (20-64))
+	subfic	rSHR, rSHL, 32
+	srwi	r0, rN, 4	/* Divide by 16 */
+	andi.	r12, rN, 12	/* Get the W remainder */
+/* We normally need to load 2 Ws to start the unaligned rSTR2, but in
+   this special case those bits may be discarded anyway.  Also we
+   must avoid loading a W where none of the bits are part of rSTR2 as
+   this may cross a page boundary and cause a page fault.  */
+	li	rWORD8, 0
+	blt	cr5, L(dus0)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD8, 0(rSTR2)
+	addi	rSTR2, rSTR2, 4
+#endif
+	slw	rWORD8, rWORD8, rSHL
+
+L(dus0):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD1, 0(rSTR1)
+	lwz	rWORD2, 0(rSTR2)
+#endif
+	cmplwi	cr1, r12, 8
+	cmplwi	cr7, rN, 16
+	srw	r12, rWORD2, rSHR
+	clrlwi	rN, rN, 30
+	beq	L(duPs4)
+	mtctr	r0
+	or	rWORD8, r12, rWORD8
+	bgt	cr1, L(duPs3)
+	beq	cr1, L(duPs2)
+
+/* Remainder is 4 */
+	.align	4
+L(dusP1):
+	slw	rWORD8_SHIFT, rWORD2, rSHL
+	slw	rWORD7, rWORD1, rWORD6
+	slw	rWORD8, rWORD8, rWORD6
+	bge	cr7, L(duP1e)
+/* At this point we exit early with the first word compare
+   complete and remainder of 0 to 3 bytes.  See L(du14) for details on
+   how we handle the remaining bytes.  */
+	cmplw	cr5, rWORD7, rWORD8
+	slwi.	rN, rN, 3
+	bne	cr5, L(duLcr5)
+	cmplw	cr7, rN, rSHR
+	beq	L(duZeroReturn)
+	li	r0, 0
+	ble	cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD2, 4(rSTR2)
+#endif
+	srw	r0, rWORD2, rSHR
+	b	L(dutrim)
+/* Remainder is 8 */
+	.align	4
+L(duPs2):
+	slw	rWORD6_SHIFT, rWORD2, rSHL
+	slw	rWORD5, rWORD1, rWORD6
+	slw	rWORD6, rWORD8, rWORD6
+	b	L(duP2e)
+/* Remainder is 12 */
+	.align	4
+L(duPs3):
+	slw	rWORD4_SHIFT, rWORD2, rSHL
+	slw	rWORD3, rWORD1, rWORD6
+	slw	rWORD4, rWORD8, rWORD6
+	b	L(duP3e)
+/* Count is a multiple of 16, remainder is 0 */
+	.align	4
+L(duPs4):
+	mtctr	r0
+	or	rWORD8, r12, rWORD8
+	slw	rWORD2_SHIFT, rWORD2, rSHL
+	slw	rWORD1, rWORD1, rWORD6
+	slw	rWORD2, rWORD8, rWORD6
+	b	L(duP4e)
+
+/* At this point we know rSTR1 is word aligned and the
+   compare length is at least 8 bytes.  */
+	.align	4
+L(Wunaligned):
+	stw	rWORD8_SHIFT, 32(r1)
+	clrrwi	rSTR2, rSTR2, 2
+	stw	rWORD2_SHIFT, 28(r1)
+	srwi	r0, rN, 4	/* Divide by 16 */
+	stw	rWORD4_SHIFT, 24(r1)
+	andi.	r12, rN, 12	/* Get the W remainder */
+	stw	rWORD6_SHIFT, 20(r1)
+	cfi_offset(rWORD8_SHIFT, (32-64))
+	cfi_offset(rWORD2_SHIFT, (28-64))
+	cfi_offset(rWORD4_SHIFT, (24-64))
+	cfi_offset(rWORD6_SHIFT, (20-64))
+	slwi	rSHL, rSHL, 3
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD6, 0, rSTR2
+	addi	rSTR2, rSTR2, 4
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD6, 0(rSTR2)
+	lwzu	rWORD8, 4(rSTR2)
+#endif
+	cmplwi	cr1, r12, 8
+	cmplwi	cr7, rN, 16
+	clrlwi	rN, rN, 30
+	subfic	rSHR, rSHL, 32
+	slw	rWORD6_SHIFT, rWORD6, rSHL
+	beq	L(duP4)
+	mtctr	r0
+	bgt	cr1, L(duP3)
+	beq	cr1, L(duP2)
+
+/* Remainder is 4 */
+	.align	4
+L(duP1):
+	srw	r12, rWORD8, rSHR
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD7, 0, rSTR1
+	addi	rSTR1, rSTR1, 4
+#else
+	lwz	rWORD7, 0(rSTR1)
+#endif
+	slw	rWORD8_SHIFT, rWORD8, rSHL
+	or	rWORD8, r12, rWORD6_SHIFT
+	blt	cr7, L(duP1x)
+L(duP1e):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD1, 4(rSTR1)
+	lwz	rWORD2, 4(rSTR2)
+#endif
+	cmplw	cr5, rWORD7, rWORD8
+	srw	r0, rWORD2, rSHR
+	slw	rWORD2_SHIFT, rWORD2, rSHL
+	or	rWORD2, r0, rWORD8_SHIFT
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD3, 0, rSTR1
+	lwbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD3, 8(rSTR1)
+	lwz	rWORD4, 8(rSTR2)
+#endif
+	cmplw	cr7, rWORD1, rWORD2
+	srw	r12, rWORD4, rSHR
+	slw	rWORD4_SHIFT, rWORD4, rSHL
+	bne	cr5, L(duLcr5)
+	or	rWORD4, r12, rWORD2_SHIFT
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD5, 0, rSTR1
+	lwbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD5, 12(rSTR1)
+	lwz	rWORD6, 12(rSTR2)
+#endif
+	cmplw	cr1, rWORD3, rWORD4
+	srw	r0, rWORD6, rSHR
+	slw	rWORD6_SHIFT, rWORD6, rSHL
+	bne	cr7, L(duLcr7)
+	or	rWORD6, r0, rWORD4_SHIFT
+	cmplw	cr6, rWORD5, rWORD6
+	b	L(duLoop3)
+	.align	4
+/* At this point we exit early with the first word compare
+   complete and remainder of 0 to 3 bytes.  See L(du14) for details on
+   how we handle the remaining bytes.  */
+L(duP1x):
+	cmplw	cr5, rWORD7, rWORD8
+	slwi.	rN, rN, 3
+	bne	cr5, L(duLcr5)
+	cmplw	cr7, rN, rSHR
+	beq	L(duZeroReturn)
+	li	r0, 0
+	ble	cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD2, 8(rSTR2)
+#endif
+	srw	r0, rWORD2, rSHR
+	b	L(dutrim)
+/* Remainder is 8 */
+	.align	4
+L(duP2):
+	srw	r0, rWORD8, rSHR
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD5, 0, rSTR1
+	addi	rSTR1, rSTR1, 4
+#else
+	lwz	rWORD5, 0(rSTR1)
+#endif
+	or	rWORD6, r0, rWORD6_SHIFT
+	slw	rWORD6_SHIFT, rWORD8, rSHL
+L(duP2e):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD7, 0, rSTR1
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD7, 4(rSTR1)
+	lwz	rWORD8, 4(rSTR2)
+#endif
+	cmplw	cr6, rWORD5, rWORD6
+	srw	r12, rWORD8, rSHR
+	slw	rWORD8_SHIFT, rWORD8, rSHL
+	or	rWORD8, r12, rWORD6_SHIFT
+	blt	cr7, L(duP2x)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD1, 8(rSTR1)
+	lwz	rWORD2, 8(rSTR2)
+#endif
+	cmplw	cr5, rWORD7, rWORD8
+	bne	cr6, L(duLcr6)
+	srw	r0, rWORD2, rSHR
+	slw	rWORD2_SHIFT, rWORD2, rSHL
+	or	rWORD2, r0, rWORD8_SHIFT
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD3, 0, rSTR1
+	lwbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD3, 12(rSTR1)
+	lwz	rWORD4, 12(rSTR2)
+#endif
+	cmplw	cr7, rWORD1, rWORD2
+	bne	cr5, L(duLcr5)
+	srw	r12, rWORD4, rSHR
+	slw	rWORD4_SHIFT, rWORD4, rSHL
+	or	rWORD4, r12, rWORD2_SHIFT
+#ifndef __LITTLE_ENDIAN__
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#endif
+	cmplw	cr1, rWORD3, rWORD4
+	b	L(duLoop2)
+	.align	4
+L(duP2x):
+	cmplw	cr5, rWORD7, rWORD8
+#ifndef __LITTLE_ENDIAN__
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#endif
+	bne	cr6, L(duLcr6)
+	slwi.	rN, rN, 3
+	bne	cr5, L(duLcr5)
+	cmplw	cr7, rN, rSHR
+	beq	L(duZeroReturn)
+	li	r0, 0
+	ble	cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD2, 4(rSTR2)
+#endif
+	srw	r0, rWORD2, rSHR
+	b	L(dutrim)
+
+/* Remainder is 12 */
+	.align	4
+L(duP3):
+	srw	r12, rWORD8, rSHR
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD3, 0, rSTR1
+	addi	rSTR1, rSTR1, 4
+#else
+	lwz	rWORD3, 0(rSTR1)
+#endif
+	slw	rWORD4_SHIFT, rWORD8, rSHL
+	or	rWORD4, r12, rWORD6_SHIFT
+L(duP3e):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD5, 0, rSTR1
+	lwbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD5, 4(rSTR1)
+	lwz	rWORD6, 4(rSTR2)
+#endif
+	cmplw	cr1, rWORD3, rWORD4
+	srw	r0, rWORD6, rSHR
+	slw	rWORD6_SHIFT, rWORD6, rSHL
+	or	rWORD6, r0, rWORD4_SHIFT
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD7, 0, rSTR1
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD7, 8(rSTR1)
+	lwz	rWORD8, 8(rSTR2)
+#endif
+	cmplw	cr6, rWORD5, rWORD6
+	bne	cr1, L(duLcr1)
+	srw	r12, rWORD8, rSHR
+	slw	rWORD8_SHIFT, rWORD8, rSHL
+	or	rWORD8, r12, rWORD6_SHIFT
+	blt	cr7, L(duP3x)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD1, 12(rSTR1)
+	lwz	rWORD2, 12(rSTR2)
+#endif
+	cmplw	cr5, rWORD7, rWORD8
+	bne	cr6, L(duLcr6)
+	srw	r0, rWORD2, rSHR
+	slw	rWORD2_SHIFT, rWORD2, rSHL
+	or	rWORD2, r0, rWORD8_SHIFT
+#ifndef __LITTLE_ENDIAN__
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#endif
+	cmplw	cr7, rWORD1, rWORD2
+	b	L(duLoop1)
+	.align	4
+L(duP3x):
+#ifndef __LITTLE_ENDIAN__
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#endif
+#if 0
+/* Huh?  We've already branched on cr1!  */
+	bne	cr1, L(duLcr1)
+#endif
+	cmplw	cr5, rWORD7, rWORD8
+	bne	cr6, L(duLcr6)
+	slwi.	rN, rN, 3
+	bne	cr5, L(duLcr5)
+	cmplw	cr7, rN, rSHR
+	beq	L(duZeroReturn)
+	li	r0, 0
+	ble	cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD2, 4(rSTR2)
+#endif
+	srw	r0, rWORD2, rSHR
+	b	L(dutrim)
+
+/* Count is a multiple of 16, remainder is 0 */
+	.align	4
+L(duP4):
+	mtctr	r0
+	srw	r0, rWORD8, rSHR
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	addi	rSTR1, rSTR1, 4
+#else
+	lwz	rWORD1, 0(rSTR1)
+#endif
+	slw	rWORD2_SHIFT, rWORD8, rSHL
+	or	rWORD2, r0, rWORD6_SHIFT
+L(duP4e):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD3, 0, rSTR1
+	lwbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD3, 4(rSTR1)
+	lwz	rWORD4, 4(rSTR2)
+#endif
+	cmplw	cr7, rWORD1, rWORD2
+	srw	r12, rWORD4, rSHR
+	slw	rWORD4_SHIFT, rWORD4, rSHL
+	or	rWORD4, r12, rWORD2_SHIFT
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD5, 0, rSTR1
+	lwbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD5, 8(rSTR1)
+	lwz	rWORD6, 8(rSTR2)
+#endif
+	cmplw	cr1, rWORD3, rWORD4
+	bne	cr7, L(duLcr7)
+	srw	r0, rWORD6, rSHR
+	slw	rWORD6_SHIFT, rWORD6, rSHL
+	or	rWORD6, r0, rWORD4_SHIFT
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD7, 0, rSTR1
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwzu	rWORD7, 12(rSTR1)
+	lwzu	rWORD8, 12(rSTR2)
+#endif
+	cmplw	cr6, rWORD5, rWORD6
+	bne	cr1, L(duLcr1)
+	srw	r12, rWORD8, rSHR
+	slw	rWORD8_SHIFT, rWORD8, rSHL
+	or	rWORD8, r12, rWORD6_SHIFT
+	cmplw	cr5, rWORD7, rWORD8
+	bdz	L(du24)		/* Adjust CTR as we start with +4 */
+/* This is the primary loop */
+	.align	4
+L(duLoop):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD1, 4(rSTR1)
+	lwz	rWORD2, 4(rSTR2)
+#endif
+	cmplw	cr1, rWORD3, rWORD4
+	bne	cr6, L(duLcr6)
+	srw	r0, rWORD2, rSHR
+	slw	rWORD2_SHIFT, rWORD2, rSHL
+	or	rWORD2, r0, rWORD8_SHIFT
+L(duLoop1):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD3, 0, rSTR1
+	lwbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD3, 8(rSTR1)
+	lwz	rWORD4, 8(rSTR2)
+#endif
+	cmplw	cr6, rWORD5, rWORD6
+	bne	cr5, L(duLcr5)
+	srw	r12, rWORD4, rSHR
+	slw	rWORD4_SHIFT, rWORD4, rSHL
+	or	rWORD4, r12, rWORD2_SHIFT
+L(duLoop2):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD5, 0, rSTR1
+	lwbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD5, 12(rSTR1)
+	lwz	rWORD6, 12(rSTR2)
+#endif
+	cmplw	cr5, rWORD7, rWORD8
+	bne	cr7, L(duLcr7)
+	srw	r0, rWORD6, rSHR
+	slw	rWORD6_SHIFT, rWORD6, rSHL
+	or	rWORD6, r0, rWORD4_SHIFT
+L(duLoop3):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD7, 0, rSTR1
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwzu	rWORD7, 16(rSTR1)
+	lwzu	rWORD8, 16(rSTR2)
+#endif
+	cmplw	cr7, rWORD1, rWORD2
+	bne	cr1, L(duLcr1)
+	srw	r12, rWORD8, rSHR
+	slw	rWORD8_SHIFT, rWORD8, rSHL
+	or	rWORD8, r12, rWORD6_SHIFT
+	bdnz	L(duLoop)
+
+L(duL4):
+#if 0
+/* Huh?  We've already branched on cr1!  */
+	bne	cr1, L(duLcr1)
+#endif
+	cmplw	cr1, rWORD3, rWORD4
+	bne	cr6, L(duLcr6)
+	cmplw	cr6, rWORD5, rWORD6
+	bne	cr5, L(duLcr5)
+	cmplw	cr5, rWORD7, rWORD8
+L(du44):
+	bne	cr7, L(duLcr7)
+L(du34):
+	bne	cr1, L(duLcr1)
+L(du24):
+	bne	cr6, L(duLcr6)
+L(du14):
+	slwi.	rN, rN, 3
+	bne	cr5, L(duLcr5)
+/* At this point we have a remainder of 1 to 3 bytes to compare.  We use
+   shift right to eliminate bits beyond the compare length.
+   This allows the use of word subtract to compute the final result.
+
+   However it may not be safe to load rWORD2 which may be beyond the
+   string length. So we compare the bit length of the remainder to
+   the right shift count (rSHR). If the bit count is less than or equal
+   we do not need to load rWORD2 (all significant bits are already in
+   rWORD8_SHIFT).  */
+	cmplw	cr7, rN, rSHR
+	beq	L(duZeroReturn)
+	li	r0, 0
+	ble	cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD2, 4(rSTR2)
+#endif
+	srw	r0, rWORD2, rSHR
+	.align	4
+L(dutrim):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+#else
+	lwz	rWORD1, 4(rSTR1)
+#endif
+	lwz	rWORD8, 48(r1)
+	subfic	rN, rN, 32	/* Shift count is 32 - (rN * 8).  */
+	or	rWORD2, r0, rWORD8_SHIFT
+	lwz	rWORD7, 44(r1)
+	lwz	rSHL, 40(r1)
+	srw	rWORD1, rWORD1, rN
+	srw	rWORD2, rWORD2, rN
+	lwz	rSHR, 36(r1)
+	lwz	rWORD8_SHIFT, 32(r1)
+	sub	rRTN, rWORD1, rWORD2
+	b	L(dureturn26)
+	.align	4
+L(duLcr7):
+	lwz	rWORD8, 48(r1)
+	lwz	rWORD7, 44(r1)
+	li	rRTN, 1
+	bgt	cr7, L(dureturn29)
+	lwz	rSHL, 40(r1)
+	lwz	rSHR, 36(r1)
+	li	rRTN, -1
+	b	L(dureturn27)
+	.align	4
+L(duLcr1):
+	lwz	rWORD8, 48(r1)
+	lwz	rWORD7, 44(r1)
+	li	rRTN, 1
+	bgt	cr1, L(dureturn29)
+	lwz	rSHL, 40(r1)
+	lwz	rSHR, 36(r1)
+	li	rRTN, -1
+	b	L(dureturn27)
+	.align	4
+L(duLcr6):
+	lwz	rWORD8, 48(r1)
+	lwz	rWORD7, 44(r1)
+	li	rRTN, 1
+	bgt	cr6, L(dureturn29)
+	lwz	rSHL, 40(r1)
+	lwz	rSHR, 36(r1)
+	li	rRTN, -1
+	b	L(dureturn27)
+	.align	4
+L(duLcr5):
+	lwz	rWORD8, 48(r1)
+	lwz	rWORD7, 44(r1)
+	li	rRTN, 1
+	bgt	cr5, L(dureturn29)
+	lwz	rSHL, 40(r1)
+	lwz	rSHR, 36(r1)
+	li	rRTN, -1
+	b	L(dureturn27)
+	.align	3
+L(duZeroReturn):
+	li	rRTN, 0
+	.align	4
+L(dureturn):
+	lwz	rWORD8, 48(r1)
+	lwz	rWORD7, 44(r1)
+L(dureturn29):
+	lwz	rSHL, 40(r1)
+	lwz	rSHR, 36(r1)
+L(dureturn27):
+	lwz	rWORD8_SHIFT, 32(r1)
+L(dureturn26):
+	lwz	rWORD2_SHIFT, 28(r1)
+L(dureturn25):
+	lwz	rWORD4_SHIFT, 24(r1)
+	lwz	rWORD6_SHIFT, 20(r1)
+	addi	r1, r1, 64
+	cfi_adjust_cfa_offset(-64)
+	blr
+END (memcmp)
+
+libc_hidden_builtin_def (memcmp)
+weak_alias (memcmp, bcmp)
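
The tail handling at L(d00)/L(dutrim) deserves a note: with 1 to 3 bytes left and word-aligned operands, the code loads a full word from each string in big-endian byte order (the little-endian path uses lwbrx to get the same layout) and shifts both right by 32 - 8*remainder, so bytes past the compare length drop out before the final subtraction. A C sketch with an illustrative name; after the shift both values fit in 24 bits, so the subtraction cannot overflow and its sign is the answer:

    #include <stdint.h>

    /* w1/w2 hold the big-endian-ordered final words; rem is 1..3.  */
    static int tail_cmp_sketch (uint32_t w1, uint32_t w2, unsigned rem)
    {
      unsigned sh = 32 - 8 * rem;       /* subfic rN, r12, 32 */
      return (int) ((w1 >> sh) - (w2 >> sh));
    }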
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc32/power7/memcpy.S b/REORG.TODO/sysdeps/powerpc/powerpc32/power7/memcpy.S
new file mode 100644
index 0000000000..8e33c1d733
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc32/power7/memcpy.S
@@ -0,0 +1,538 @@
+/* Optimized memcpy implementation for PowerPC32/POWER7.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Luis Machado <luisgpm@br.ibm.com>.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
+   Returns 'dst'.  */
+
+	.machine  power7
+EALIGN (memcpy, 5, 0)
+	CALL_MCOUNT
+
+	stwu    1,-32(1)
+	cfi_adjust_cfa_offset(32)
+	stw	30,20(1)
+	cfi_offset(30,(20-32))
+	stw	31,24(1)
+	mr      30,3
+	cmplwi  cr1,5,31
+	neg	0,3
+	cfi_offset(31,-8)
+	ble	cr1, L(copy_LT_32)  /* If move < 32 bytes use short move
+				    code.  */
+
+	andi.   11,3,15	      /* Check alignment of DST.  */
+	clrlwi  10,4,28	      /* Check alignment of SRC.  */
+	cmplw   cr6,10,11     /* SRC and DST alignments match?  */
+	mr	12,4
+	mr	31,5
+	bne	cr6,L(copy_GE_32_unaligned)
+
+	srwi    9,5,3	      /* Number of full quadwords remaining.  */
+
+	beq	L(copy_GE_32_aligned_cont)
+
+	clrlwi  0,0,29
+	mtcrf   0x01,0
+	subf    31,0,5
+
+	/* Get the SRC aligned to 8 bytes.  */
+
+1:	bf	31,2f
+	lbz	6,0(12)
+	addi    12,12,1
+	stb	6,0(3)
+	addi    3,3,1
+2:	bf      30,4f
+	lhz     6,0(12)
+	addi    12,12,2
+	sth     6,0(3)
+	addi    3,3,2
+4:	bf      29,0f
+	lwz     6,0(12)
+	addi    12,12,4
+	stw     6,0(3)
+	addi    3,3,4
+0:
+	clrlwi  10,12,29      /* Check alignment of SRC again.  */
+	srwi    9,31,3	      /* Number of full doublewords remaining.  */
+
+L(copy_GE_32_aligned_cont):
+
+	clrlwi  11,31,29
+	mtcrf   0x01,9
+
+	srwi    8,31,5
+	cmplwi  cr1,9,4
+	cmplwi  cr6,11,0
+	mr	11,12
+
+	/* Copy 1~3 doublewords so the main loop starts
+	at a multiple of 32 bytes.  */
+
+	bf	30,1f
+	lfd     6,0(12)
+	lfd     7,8(12)
+	addi    11,12,16
+	mtctr   8
+	stfd    6,0(3)
+	stfd    7,8(3)
+	addi    10,3,16
+	bf      31,4f
+	lfd     0,16(12)
+	stfd    0,16(3)
+	blt     cr1,3f
+	addi    11,12,24
+	addi    10,3,24
+	b       4f
+
+	.align  4
+1:	/* Copy 1 doubleword and set the counter.  */
+	mr	10,3
+	mtctr   8
+	bf      31,4f
+	lfd     6,0(12)
+	addi    11,12,8
+	stfd    6,0(3)
+	addi    10,3,8
+
+L(aligned_copy):
+	/* Main aligned copy loop. Copies up to 128-bytes at a time. */
+	.align  4
+4:
+	/* Check for any 32-byte or 64-byte lumps that are outside of a
+	   nice 128-byte range.  R8 contains the number of 32-byte
+	   lumps, so drop this into the CR, and use the SO/EQ bits to help
+	   handle the 32- or 64-byte lumps.  Then handle the rest with an
+	   unrolled 128-bytes-at-a-time copy loop.  */
+	mtocrf	1,8
+	li	6,16	# 16() index
+	li	7,32	# 32() index
+	li	8,48	# 48() index
+
+L(aligned_32byte):
+	/* if the SO bit (indicating a 32-byte lump) is not set, move along. */
+	bns	cr7,L(aligned_64byte)
+	lxvd2x	6,0,11
+	lxvd2x	7,11,6
+	addi	11,11,32
+	stxvd2x	6,0,10
+	stxvd2x	7,10,6
+	addi	10,10,32
+
+L(aligned_64byte):
+	/* if the EQ bit (indicating a 64-byte lump) is not set, move along. */
+	bne	cr7,L(aligned_128setup)
+	lxvd2x	6,0,11
+	lxvd2x	7,11,6
+	lxvd2x	8,11,7
+	lxvd2x	9,11,8
+	addi	11,11,64
+	stxvd2x	6,0,10
+	stxvd2x	7,10,6
+	stxvd2x	8,10,7
+	stxvd2x	9,10,8
+	addi	10,10,64
+
+L(aligned_128setup):
+	/* Set up for the 128-byte at a time copy loop.  */
+	srwi	8,31,7
+	cmpwi	8,0	# Any 4x lumps left?
+	beq	3f	# if not, move along.
+	lxvd2x	6,0,11
+	lxvd2x	7,11,6
+	mtctr	8	# otherwise, load the ctr and begin.
+	li	8,48	# 48() index
+	b	L(aligned_128loop)
+
+L(aligned_128head):
+	/* for the 2nd + iteration of this loop. */
+	lxvd2x	6,0,11
+	lxvd2x	7,11,6
+L(aligned_128loop):
+	lxvd2x	8,11,7
+	lxvd2x	9,11,8
+	stxvd2x	6,0,10
+	addi	11,11,64
+	stxvd2x	7,10,6
+	stxvd2x	8,10,7
+	stxvd2x	9,10,8
+	lxvd2x	6,0,11
+	lxvd2x	7,11,6
+	addi	10,10,64
+	lxvd2x	8,11,7
+	lxvd2x	9,11,8
+	addi	11,11,64
+	stxvd2x	6,0,10
+	stxvd2x	7,10,6
+	stxvd2x	8,10,7
+	stxvd2x	9,10,8
+	addi	10,10,64
+	bdnz	L(aligned_128head)
+
+3:
+	/* Check for tail bytes.  */
+	clrrwi  0,31,3
+	mtcrf   0x01,31
+	beq	cr6,0f
+
+.L9:
+	add	3,3,0
+	add	12,12,0
+
+	/*  At this point we have a tail of 0-7 bytes and we know that the
+	destination is doubleword-aligned.  */
+4:	/* Copy 4 bytes.  */
+	bf	29,2f
+
+	lwz     6,0(12)
+	addi    12,12,4
+	stw     6,0(3)
+	addi    3,3,4
+2:	/* Copy 2 bytes.  */
+	bf	30,1f
+
+	lhz     6,0(12)
+	addi    12,12,2
+	sth     6,0(3)
+	addi    3,3,2
+1:	/* Copy 1 byte.  */
+	bf	31,0f
+
+	lbz	6,0(12)
+	stb	6,0(3)
+0:	/* Return original DST pointer.  */
+	mr	3,30
+	lwz	30,20(1)
+	lwz     31,24(1)
+	addi    1,1,32
+	blr
+
+	/* Handle copies of 0~31 bytes.  */
+	.align  4
+L(copy_LT_32):
+	cmplwi  cr6,5,8
+	mr	12,4
+	mtcrf   0x01,5
+	ble	cr6,L(copy_LE_8)
+
+	/* At least 9 bytes to go.  */
+	neg	8,4
+	clrrwi  11,4,2
+	andi.   0,8,3
+	cmplwi  cr1,5,16
+	mr	10,5
+	beq	L(copy_LT_32_aligned)
+
+	/* Force 4-byte alignment for SRC.  */
+	mtocrf  0x01,0
+	subf    10,0,5
+2:	bf	30,1f
+
+	lhz	6,0(12)
+	addi    12,12,2
+	sth	6,0(3)
+	addi    3,3,2
+1:	bf	31,L(end_4bytes_alignment)
+
+	lbz	6,0(12)
+	addi    12,12,1
+	stb	6,0(3)
+	addi    3,3,1
+
+	.align  4
+L(end_4bytes_alignment):
+	cmplwi  cr1,10,16
+	mtcrf   0x01,10
+
+L(copy_LT_32_aligned):
+	/* At least 6 bytes to go, and SRC is word-aligned.  */
+	blt	cr1,8f
+
+	/* Copy 16 bytes.  */
+	lwz	6,0(12)
+	lwz     7,4(12)
+	stw     6,0(3)
+	lwz     8,8(12)
+	stw     7,4(3)
+	lwz     6,12(12)
+	addi    12,12,16
+	stw     8,8(3)
+	stw     6,12(3)
+	addi    3,3,16
+8:	/* Copy 8 bytes.  */
+	bf	28,4f
+
+	lwz     6,0(12)
+	lwz     7,4(12)
+	addi    12,12,8
+	stw     6,0(3)
+	stw     7,4(3)
+	addi    3,3,8
+4:	/* Copy 4 bytes.  */
+	bf	29,2f
+
+	lwz     6,0(12)
+	addi    12,12,4
+	stw     6,0(3)
+	addi    3,3,4
+2:	/* Copy 2-3 bytes.  */
+	bf	30,1f
+
+	lhz     6,0(12)
+	sth     6,0(3)
+	bf      31,0f
+	lbz     7,2(12)
+	stb     7,2(3)
+
+	/* Return original DST pointer.  */
+	mr      3,30
+	lwz     30,20(1)
+	addi    1,1,32
+	blr
+
+	.align  4
+1:	/* Copy 1 byte.  */
+	bf	31,0f
+
+	lbz	6,0(12)
+	stb	6,0(3)
+0:	/* Return original DST pointer.  */
+	mr	3,30
+	lwz	30,20(1)
+	addi    1,1,32
+	blr
+
+	/* Handles copies of 0~8 bytes.  */
+	.align  4
+L(copy_LE_8):
+	bne	cr6,4f
+
+	/* Though we could've used lfd/stfd here, they are still
+	slow for unaligned cases.  */
+
+	lwz	6,0(4)
+	lwz     7,4(4)
+	stw     6,0(3)
+	stw     7,4(3)
+
+	/* Return original DST pointer.  */
+	mr      3,30
+	lwz     30,20(1)
+	addi    1,1,32
+	blr
+
+	.align  4
+4:	/* Copies 4~7 bytes.  */
+	bf	29,2b
+
+	lwz	6,0(4)
+	stw     6,0(3)
+	bf      30,5f
+	lhz     7,4(4)
+	sth     7,4(3)
+	bf      31,0f
+	lbz     8,6(4)
+	stb     8,6(3)
+
+	/* Return original DST pointer.  */
+	mr      3,30
+	lwz     30,20(1)
+	addi    1,1,32
+	blr
+
+	.align  4
+5:	/* Copy 1 byte.  */
+	bf	31,0f
+
+	lbz	6,4(4)
+	stb	6,4(3)
+
+0:	/* Return original DST pointer.  */
+	mr	3,30
+	lwz     30,20(1)
+	addi    1,1,32
+	blr
+
+	/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
+	SRC is not. Use aligned quadword loads from SRC, shifted to realign
+	the data, allowing for aligned DST stores.  */
+	.align  4
+L(copy_GE_32_unaligned):
+	andi.   11,3,15	      /* Check alignment of DST.  */
+	clrlwi  0,0,28	      /* Number of bytes until the 1st
+			      quadword of DST.  */
+	srwi    9,5,4	      /* Number of full quadwords remaining.  */
+
+	beq    L(copy_GE_32_unaligned_cont)
+
+	/* DST is not quadword aligned, get it aligned.  */
+
+	mtcrf   0x01,0
+	subf    31,0,5
+
+	/* Vector instructions work best when proper alignment (16 bytes)
+	is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
+1:	/* Copy 1 byte.  */
+	bf	31,2f
+
+	lbz	6,0(12)
+	addi    12,12,1
+	stb	6,0(3)
+	addi    3,3,1
+2:	/* Copy 2 bytes.  */
+	bf	30,4f
+
+	lhz     6,0(12)
+	addi    12,12,2
+	sth     6,0(3)
+	addi    3,3,2
+4:	/* Copy 4 bytes.  */
+	bf	29,8f
+
+	lwz     6,0(12)
+	addi    12,12,4
+	stw     6,0(3)
+	addi    3,3,4
+8:	/* Copy 8 bytes.  */
+	bf	28,0f
+
+	lfd	6,0(12)
+	addi    12,12,8
+	stfd    6,0(3)
+	addi    3,3,8
+0:
+	clrlwi  10,12,28      /* Check alignment of SRC.  */
+	srwi    9,31,4	      /* Number of full quadwords remaining.  */
+
+	/* The proper alignment is present; it is OK to copy the bytes now.  */
+L(copy_GE_32_unaligned_cont):
+
+	/* Set up two indices to speed up the indexed vector operations.  */
+	clrlwi  11,31,28
+	li      6,16	      /* Index for 16-bytes offsets.  */
+	li	7,32	      /* Index for 32-bytes offsets.  */
+	cmplwi  cr1,11,0
+	srwi    8,31,5	      /* Set up the loop counter.  */
+	mr      10,3
+	mr      11,12
+	mtcrf   0x01,9
+	cmplwi  cr6,9,1
+#ifdef __LITTLE_ENDIAN__
+	lvsr    5,0,12
+#else
+	lvsl    5,0,12
+#endif
+	lvx     3,0,12
+	bf      31,L(setup_unaligned_loop)
+
+	/* Copy another 16 bytes to align to 32 bytes for the loop.  */
+	lvx     4,12,6
+#ifdef __LITTLE_ENDIAN__
+	vperm   6,4,3,5
+#else
+	vperm   6,3,4,5
+#endif
+	addi    11,12,16
+	addi    10,3,16
+	stvx    6,0,3
+	vor	3,4,4
+
+L(setup_unaligned_loop):
+	mtctr   8
+	ble     cr6,L(end_unaligned_loop)
+
+	/* Copy 32 bytes at a time using vector instructions.  */
+	.align  4
+L(unaligned_loop):
+
+	/* Note: vr6/vr10 may contain data that was already copied,
+	but in order to get proper alignment, we may have to copy
+	some portions again.  This is still faster than using
+	unaligned vector instructions, though.  */
+
+	lvx	4,11,6	      /* vr4 = r11+16.  */
+#ifdef __LITTLE_ENDIAN__
+	vperm   6,4,3,5
+#else
+	vperm   6,3,4,5
+#endif
+	lvx	3,11,7	      /* vr3 = r11+32.  */
+#ifdef __LITTLE_ENDIAN__
+	vperm   10,3,4,5
+#else
+	vperm   10,4,3,5
+#endif
+	addi    11,11,32
+	stvx    6,0,10
+	stvx    10,10,6
+	addi    10,10,32
+
+	bdnz    L(unaligned_loop)
+
+	.align  4
+L(end_unaligned_loop):
+
+	/* Check for tail bytes.  */
+	clrrwi  0,31,4
+	mtcrf   0x01,31
+	beq	cr1,0f
+
+	add	3,3,0
+	add	12,12,0
+
+	/*  We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
+8:	/* Copy 8 bytes.  */
+	bf	28,4f
+
+	lwz	6,0(12)
+	lwz	7,4(12)
+	addi    12,12,8
+	stw	6,0(3)
+	stw	7,4(3)
+	addi    3,3,8
+4:	/* Copy 4 bytes.  */
+	bf	29,2f
+
+	lwz	6,0(12)
+	addi    12,12,4
+	stw	6,0(3)
+	addi    3,3,4
+2:	/* Copy 2~3 bytes.  */
+	bf	30,1f
+
+	lhz	6,0(12)
+	addi    12,12,2
+	sth	6,0(3)
+	addi    3,3,2
+1:	/* Copy 1 byte.  */
+	bf	31,0f
+
+	lbz	6,0(12)
+	stb	6,0(3)
+0:	/* Return original DST pointer.  */
+	mr	3,30
+	lwz     30,20(1)
+	lwz	31,24(1)
+	addi    1,1,32
+	blr
+
+END (memcpy)
+libc_hidden_builtin_def (memcpy)
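
The aligned fast path above is driven entirely by the bits of r8 = len >> 5 that mtocrf drops into CR7: the SO bit selects one stray 32-byte lump, the EQ bit one stray 64-byte lump, and len >> 7 counts the unrolled 128-byte iterations. A minimal C sketch of that decomposition (hypothetical helper, not the glibc code; memcpy calls stand in for the lxvd2x/stxvd2x pairs):

    #include <stddef.h>
    #include <string.h>

    /* Carve an aligned copy into one optional 32-byte lump, one optional
       64-byte lump, then 128-byte blocks, mirroring L(aligned_32byte),
       L(aligned_64byte) and L(aligned_128loop).  */
    static void copy_in_lumps (char *dst, const char *src, size_t len)
    {
      size_t lumps32 = len >> 5;            /* r8: count of 32-byte lumps.  */

      if (lumps32 & 1)                      /* SO bit: stray 32-byte lump.  */
        { memcpy (dst, src, 32); dst += 32; src += 32; }
      if (lumps32 & 2)                      /* EQ bit: stray 64-byte lump.  */
        { memcpy (dst, src, 64); dst += 64; src += 64; }
      for (size_t i = len >> 7; i > 0; i--) /* 128 bytes per iteration.  */
        { memcpy (dst, src, 128); dst += 128; src += 128; }
      memcpy (dst, src, len & 31);          /* 0-31 tail bytes.  */
    }
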
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc32/power7/mempcpy.S b/REORG.TODO/sysdeps/powerpc/powerpc32/power7/mempcpy.S
new file mode 100644
index 0000000000..1682fbcd2a
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc32/power7/mempcpy.S
@@ -0,0 +1,482 @@
+/* Optimized mempcpy implementation for POWER7.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Luis Machado <luisgpm@br.ibm.com>.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* __ptr_t [r3] __mempcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
+	Returns 'dst' + 'len'.  */
+
+	.machine  power7
+EALIGN (__mempcpy, 5, 0)
+	CALL_MCOUNT
+
+	stwu	1,-32(1)
+	cfi_adjust_cfa_offset(32)
+	stw	30,20(1)
+	cfi_offset(30,(20-32))
+	stw	31,24(1)
+	mr	30,3
+	cmplwi	cr1,5,31
+	neg	0,3
+	cfi_offset(31,-8)
+	ble	cr1,L(copy_LT_32)  /* If move < 32 bytes use short move
+					code.  */
+
+	andi.	11,3,7	      /* Check alignment of DST.  */
+	clrlwi	10,4,29	      /* Check alignment of SRC.  */
+	cmplw	cr6,10,11     /* SRC and DST alignments match?  */
+	mr	12,4
+	mr	31,5
+	bne	cr6,L(copy_GE_32_unaligned)
+
+	srwi	9,5,3	      /* Number of full doublewords remaining.  */
+
+	beq	L(copy_GE_32_aligned_cont)
+
+	clrlwi	0,0,29
+	mtcrf	0x01,0
+	subf	31,0,5
+
+	/* Get the SRC aligned to 8 bytes.  */
+
+1:	bf	31,2f
+	lbz	6,0(12)
+	addi	12,12,1
+	stb	6,0(3)
+	addi	3,3,1
+2:	bf	30,4f
+	lhz	6,0(12)
+	addi	12,12,2
+	sth	6,0(3)
+	addi	3,3,2
+4:	bf	29,0f
+	lwz	6,0(12)
+	addi	12,12,4
+	stw	6,0(3)
+	addi	3,3,4
+0:
+	clrlwi	10,12,29      /* Check alignment of SRC again.  */
+	srwi	9,31,3	      /* Number of full doublewords remaining.  */
+
+L(copy_GE_32_aligned_cont):
+
+	clrlwi	11,31,29
+	mtcrf	0x01,9
+
+	srwi	8,31,5
+	cmplwi	cr1,9,4
+	cmplwi	cr6,11,0
+	mr	11,12
+
+	/* Copy 1~3 doublewords so the main loop starts
+	at a multiple of 32 bytes.  */
+
+	bf	30,1f
+	lfd	6,0(12)
+	lfd	7,8(12)
+	addi	11,12,16
+	mtctr	8
+	stfd	6,0(3)
+	stfd	7,8(3)
+	addi	10,3,16
+	bf	31,4f
+	lfd	0,16(12)
+	stfd	0,16(3)
+	blt	cr1,3f
+	addi	11,12,24
+	addi	10,3,24
+	b	4f
+
+	.align	4
+1:	/* Copy 1 doubleword and set the counter.  */
+	mr	10,3
+	mtctr	8
+	bf	31,4f
+	lfd	6,0(12)
+	addi	11,12,8
+	stfd	6,0(3)
+	addi	10,3,8
+
+	.align	4
+4:	/* Main aligned copy loop.  Copies 32 bytes at a time.  */
+	lfd	6,0(11)
+	lfd	7,8(11)
+	lfd	8,16(11)
+	lfd	0,24(11)
+	addi	11,11,32
+
+	stfd	6,0(10)
+	stfd	7,8(10)
+	stfd	8,16(10)
+	stfd	0,24(10)
+	addi	10,10,32
+	bdnz	4b
+3:
+
+	/* Check for tail bytes.  */
+
+	clrrwi	0,31,3
+	mtcrf	0x01,31
+	beq	cr6,0f
+
+.L9:
+	add	3,3,0
+	add	12,12,0
+
+	/*  At this point we have a tail of 0-7 bytes and we know that the
+	destination is doubleword-aligned.  */
+4:	/* Copy 4 bytes.  */
+	bf	29,2f
+
+	lwz	6,0(12)
+	addi	12,12,4
+	stw	6,0(3)
+	addi	3,3,4
+2:	/* Copy 2 bytes.  */
+	bf	30,1f
+
+	lhz	6,0(12)
+	addi	12,12,2
+	sth	6,0(3)
+	addi	3,3,2
+1:	/* Copy 1 byte.  */
+	bf	31,0f
+
+	lbz	6,0(12)
+	stb	6,0(3)
+0:	/* Return DST + LEN pointer.  */
+	add	3,30,5
+	lwz	30,20(1)
+	lwz	31,24(1)
+	addi	1,1,32
+	blr
+
+	/* Handle copies of 0~31 bytes.  */
+	.align	4
+L(copy_LT_32):
+	cmplwi	cr6,5,8
+	mr	12,4
+	mtcrf	0x01,5
+	ble	cr6,L(copy_LE_8)
+
+	/* At least 9 bytes to go.  */
+	neg	8,4
+	clrrwi	11,4,2
+	andi.	0,8,3
+	cmplwi	cr1,5,16
+	mr	10,5
+	beq	L(copy_LT_32_aligned)
+
+	/* Force 4-byte alignment for SRC.  */
+	mtocrf  0x01,0
+	subf	10,0,5
+2:	bf	30,1f
+
+	lhz	6,0(12)
+	addi	12,12,2
+	sth	6,0(3)
+	addi	3,3,2
+1:	bf	31,L(end_4bytes_alignment)
+
+	lbz	6,0(12)
+	addi	12,12,1
+	stb	6,0(3)
+	addi	3,3,1
+
+	.align	4
+L(end_4bytes_alignment):
+	cmplwi	cr1,10,16
+	mtcrf	0x01,10
+
+L(copy_LT_32_aligned):
+	/* At least 6 bytes to go, and SRC is word-aligned.  */
+	blt	cr1,8f
+
+	/* Copy 16 bytes.  */
+	lwz	6,0(12)
+	lwz	7,4(12)
+	stw	6,0(3)
+	lwz	8,8(12)
+	stw	7,4(3)
+	lwz	6,12(12)
+	addi	12,12,16
+	stw	8,8(3)
+	stw	6,12(3)
+	addi	3,3,16
+8:	/* Copy 8 bytes.  */
+	bf	28,4f
+
+	lwz	6,0(12)
+	lwz	7,4(12)
+	addi	12,12,8
+	stw	6,0(3)
+	stw	7,4(3)
+	addi	3,3,8
+4:	/* Copy 4 bytes.  */
+	bf	29,2f
+
+	lwz	6,0(12)
+	addi	12,12,4
+	stw	6,0(3)
+	addi	3,3,4
+2:	/* Copy 2-3 bytes.  */
+	bf	30,1f
+
+	lhz	6,0(12)
+	sth	6,0(3)
+	bf	31,0f
+	lbz	7,2(12)
+	stb	7,2(3)
+
+	/* Return DST + LEN pointer.  */
+	add	3,30,5
+	lwz	30,20(1)
+	addi	1,1,32
+	blr
+
+	.align	4
+1:	/* Copy 1 byte.  */
+	bf	31,0f
+
+	lbz	6,0(12)
+	stb	6,0(3)
+0:	/* Return DST + LEN pointer.  */
+	add	3,30,5
+	lwz	30,20(1)
+	addi	1,1,32
+	blr
+
+	/* Handles copies of 0~8 bytes.  */
+	.align	4
+L(copy_LE_8):
+	bne	cr6,4f
+
+	/* Though we could've used lfd/stfd here, they are still
+	slow for unaligned cases.  */
+
+	lwz	6,0(4)
+	lwz	7,4(4)
+	stw	6,0(3)
+	stw	7,4(3)
+
+	/* Return DST + LEN pointer.  */
+	add	3,30,5
+	lwz	30,20(1)
+	addi	1,1,32
+	blr
+
+	.align	4
+4:	/* Copies 4~7 bytes.  */
+	bf	29,2b
+
+	lwz	6,0(4)
+	stw	6,0(3)
+	bf	30,5f
+	lhz	7,4(4)
+	sth	7,4(3)
+	bf	31,0f
+	lbz	8,6(4)
+	stb	8,6(3)
+
+	/* Return DST + LEN pointer.  */
+	add	3,30,5
+	lwz	30,20(1)
+	addi	1,1,32
+	blr
+
+	.align	4
+5:	/* Copy 1 byte.  */
+	bf	31,0f
+
+	lbz	6,4(4)
+	stb	6,4(3)
+
+0:	/* Return DST + LEN pointer.  */
+	add	3,30,5
+	lwz	30,20(1)
+	addi	1,1,32
+	blr
+
+	/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
+	SRC is not. Use aligned quadword loads from SRC, shifted to realign
+	the data, allowing for aligned DST stores.  */
+	.align	4
+L(copy_GE_32_unaligned):
+	andi.	11,3,15	      /* Check alignment of DST.  */
+	clrlwi	0,0,28	      /* Number of bytes until the 1st
+				 quadword of DST.  */
+	srwi	9,5,4	      /* Number of full quadwords remaining.  */
+
+	beq	L(copy_GE_32_unaligned_cont)
+
+	/* DST is not quadword aligned, get it aligned.  */
+
+	mtcrf	0x01,0
+	subf	31,0,5
+
+	/* Vector instructions work best when proper alignment (16 bytes)
+	is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
+1:	/* Copy 1 byte.  */
+	bf	31,2f
+
+	lbz	6,0(12)
+	addi	12,12,1
+	stb	6,0(3)
+	addi	3,3,1
+2:	/* Copy 2 bytes.  */
+	bf	30,4f
+
+	lhz	6,0(12)
+	addi	12,12,2
+	sth	6,0(3)
+	addi	3,3,2
+4:	/* Copy 4 bytes.  */
+	bf	29,8f
+
+	lwz	6,0(12)
+	addi	12,12,4
+	stw	6,0(3)
+	addi	3,3,4
+8:	/* Copy 8 bytes.  */
+	bf	28,0f
+
+	lfd	6,0(12)
+	addi	12,12,8
+	stfd	6,0(3)
+	addi	3,3,8
+0:
+	clrlwi	10,12,28      /* Check alignment of SRC.  */
+	srwi	9,31,4	      /* Number of full quadwords remaining.  */
+
+	/* The proper alignment is present; it is OK to copy the bytes now.  */
+L(copy_GE_32_unaligned_cont):
+
+	/* Set up two indices to speed up the indexed vector operations.  */
+	clrlwi	11,31,28
+	li	6,16	      /* Index for 16-bytes offsets.  */
+	li	7,32	      /* Index for 32-bytes offsets.  */
+	cmplwi	cr1,11,0
+	srwi	8,31,5	      /* Set up the loop counter.  */
+	mr	10,3
+	mr	11,12
+	mtcrf	0x01,9
+	cmplwi	cr6,9,1
+#ifdef __LITTLE_ENDIAN__
+	lvsr    5,0,12
+#else
+	lvsl    5,0,12
+#endif
+	lvx	3,0,12
+	bf	31,L(setup_unaligned_loop)
+
+	/* Copy another 16 bytes to align to 32 bytes for the loop.  */
+	lvx	4,12,6
+#ifdef __LITTLE_ENDIAN__
+	vperm   6,4,3,5
+#else
+	vperm   6,3,4,5
+#endif
+	addi	11,12,16
+	addi	10,3,16
+	stvx	6,0,3
+	vor	3,4,4
+
+L(setup_unaligned_loop):
+	mtctr	8
+	ble	cr6,L(end_unaligned_loop)
+
+	/* Copy 32 bytes at a time using vector instructions.  */
+	.align	4
+L(unaligned_loop):
+
+	/* Note: vr6/vr10 may contain data that was already copied,
+	but in order to get proper alignment, we may have to copy
+	some portions again.  This is still faster than using
+	unaligned vector instructions, though.  */
+
+	lvx	4,11,6	      /* vr4 = r11+16.  */
+#ifdef __LITTLE_ENDIAN__
+	vperm   6,4,3,5
+#else
+	vperm   6,3,4,5
+#endif
+	lvx	3,11,7	      /* vr3 = r11+32.  */
+#ifdef __LITTLE_ENDIAN__
+	vperm   10,3,4,5
+#else
+	vperm   10,4,3,5
+#endif
+	addi	11,11,32
+	stvx	6,0,10
+	stvx	10,10,6
+	addi	10,10,32
+
+	bdnz	L(unaligned_loop)
+
+	.align	4
+L(end_unaligned_loop):
+
+	/* Check for tail bytes.  */
+	clrrwi	0,31,4
+	mtcrf	0x01,31
+	beq	cr1,0f
+
+	add	3,3,0
+	add	12,12,0
+
+	/*  We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
+8:	/* Copy 8 bytes.  */
+	bf	28,4f
+
+	lwz	6,0(12)
+	lwz	7,4(12)
+	addi	12,12,8
+	stw	6,0(3)
+	stw	7,4(3)
+	addi	3,3,8
+4:	/* Copy 4 bytes.  */
+	bf	29,2f
+
+	lwz	6,0(12)
+	addi	12,12,4
+	stw	6,0(3)
+	addi	3,3,4
+2:	/* Copy 2~3 bytes.  */
+	bf	30,1f
+
+	lhz	6,0(12)
+	addi	12,12,2
+	sth	6,0(3)
+	addi	3,3,2
+1:	/* Copy 1 byte.  */
+	bf	31,0f
+
+	lbz	6,0(12)
+	stb	6,0(3)
+0:	/* Return DST + LEN pointer.  */
+	add	3,30,5
+	lwz	30,20(1)
+	lwz	31,24(1)
+	addi	1,1,32
+	blr
+
+END (__mempcpy)
+libc_hidden_def (__mempcpy)
+weak_alias (__mempcpy, mempcpy)
+libc_hidden_builtin_def (mempcpy)
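
mempcpy.S is the same algorithm as memcpy.S above; the only behavioral difference is in the epilogues, which compute DST + LEN (add 3,30,5) instead of restoring DST (mr 3,30). A sketch of the contract, as a hypothetical wrapper:

    #include <string.h>

    /* What every "Return DST + LEN pointer" epilogue above implements.  */
    void *my_mempcpy (void *dst, const void *src, size_t len)
    {
      return (char *) memcpy (dst, src, len) + len;
    }
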
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc32/power7/memrchr.S b/REORG.TODO/sysdeps/powerpc/powerpc32/power7/memrchr.S
new file mode 100644
index 0000000000..eb0c1bb8eb
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc32/power7/memrchr.S
@@ -0,0 +1,196 @@
+/* Optimized memrchr implementation for PowerPC32/POWER7 using cmpb insn.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Luis Machado <luisgpm@br.ibm.com>.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* void * [r3] memrchr (const void *s [r3], int byte [r4], size_t size [r5])  */
+	.machine  power7
+ENTRY (__memrchr)
+	CALL_MCOUNT
+	add	r7,r3,r5      /* Calculate the last acceptable address.  */
+	neg	r0,r7
+	addi	r7,r7,-1
+	mr	r10,r3
+	clrrwi	r6,r7,7
+	li	r9,3<<5
+	dcbt	r9,r6,16      /* Stream hint, decreasing addresses.  */
+
+	/* Replicate BYTE to word.  */
+	insrwi	r4,r4,8,16
+	insrwi	r4,r4,16,0
+	li	r6,-4
+	li	r9,-1
+	rlwinm	r0,r0,3,27,28 /* Calculate padding.  */
+	clrrwi	r8,r7,2
+	srw	r9,r9,r0
+	cmplwi	r5,16
+	clrrwi	r0,r10,2
+	ble	L(small_range)
+
+#ifdef __LITTLE_ENDIAN__
+	lwzx	r12,0,r8
+#else
+	lwbrx	r12,0,r8      /* Load reversed word from memory.  */
+#endif
+	cmpb	r3,r12,r4     /* Check for BYTE in WORD1.  */
+	and	r3,r3,r9
+	cmplwi	cr7,r3,0      /* If r3 == 0, no BYTEs have been found.  */
+	bne	cr7,L(done)
+
+	mtcrf   0x01,r8
+	/* Are we now aligned to a doubleword boundary?  If so, skip to
+	   the main loop.  Otherwise, go through the alignment code.  */
+	bf	29,L(loop_setup)
+
+	/* Handle WORD2 of pair.  */
+#ifdef __LITTLE_ENDIAN__
+	lwzx	r12,r8,r6
+#else
+	lwbrx	r12,r8,r6
+#endif
+	addi	r8,r8,-4
+	cmpb	r3,r12,r4
+	cmplwi	cr7,r3,0
+	bne	cr7,L(done)
+
+L(loop_setup):
+	/* The last word we want to read in the loop below is the one
+	   containing the first byte of the string, ie. the word at
+	   s & ~3, or r0.  The first word read is at r8 - 4, we
+	   read 2 * cnt words, so the last word read will be at
+	   r8 - 4 - 8 * cnt + 4.  Solving for cnt gives
+	   cnt = (r8 - r0) / 8  */
+	sub	r5,r8,r0
+	addi	r8,r8,-4
+	srwi	r9,r5,3       /* Number of loop iterations.  */
+	mtctr	r9	      /* Set up the counter.  */
+
+	/* Main loop to look for BYTE backwards in the string.
+	   FIXME: Investigate whether 32-byte alignment helps with this
+	   9-instruction loop.  */
+	.align	5
+L(loop):
+	/* Load two words, compare and merge in a
+	   single register for speed.  This is an attempt
+	   to speed up the byte-checking process for bigger strings.  */
+
+#ifdef __LITTLE_ENDIAN__
+	lwzx	r12,0,r8
+	lwzx	r11,r8,r6
+#else
+	lwbrx	r12,0,r8
+	lwbrx	r11,r8,r6
+#endif
+	cmpb	r3,r12,r4
+	cmpb	r9,r11,r4
+	or	r5,r9,r3      /* Merge everything in one word.  */
+	cmplwi	cr7,r5,0
+	bne	cr7,L(found)
+	addi	r8,r8,-8
+	bdnz	L(loop)
+
+	/* We may have one more word to read.  */
+	cmplw	r8,r0
+	bnelr
+
+#ifdef __LITTLE_ENDIAN__
+	lwzx	r12,0,r8
+#else
+	lwbrx	r12,0,r8
+#endif
+	cmpb	r3,r12,r4
+	cmplwi	cr7,r3,0
+	bne	cr7,L(done)
+	blr
+
+	.align	4
+L(found):
+	/* OK, one (or both) of the words contains BYTE.  Check
+	   the first word.  */
+	cmplwi	cr6,r3,0
+	bne	cr6,L(done)
+
+	/* BYTE must be in the second word.  Adjust the address
+	   again and move the result of cmpb to r3 so we can calculate the
+	   pointer.  */
+
+	mr	r3,r9
+	addi	r8,r8,-4
+
+	/* r3 has the output of the cmpb instruction, that is, it contains
+	   0xff in the same position as BYTE in the original
+	   word from the string.  Use that to calculate the pointer.
+	   We need to make sure BYTE is *before* the end of the
+	   range.  */
+L(done):
+	cntlzw	r9,r3	      /* Count leading zeros before the match.  */
+	cmplw	r8,r0         /* Are we on the last word?  */
+	srwi	r6,r9,3	      /* Convert leading zeros to bytes.  */
+	addi	r0,r6,-3
+	sub	r3,r8,r0
+	cmplw	cr7,r3,r10
+	bnelr
+	bgelr	cr7
+	li	r3,0
+	blr
+
+	.align	4
+L(null):
+	li	r3,0
+	blr
+
+/* Deals with size <= 16.  */
+	.align	4
+L(small_range):
+	cmplwi	r5,0
+	beq	L(null)
+
+#ifdef __LITTLE_ENDIAN__
+	lwzx	r12,0,r8
+#else
+	lwbrx	r12,0,r8      /* Load reversed word from memory.  */
+#endif
+	cmpb	r3,r12,r4     /* Check for BYTE in WORD1.  */
+	and	r3,r3,r9
+	cmplwi	cr7,r3,0
+	bne	cr7,L(done)
+
+	/* Are we done already?  */
+	cmplw	r8,r0
+	addi	r8,r8,-4
+	beqlr
+
+	.align	5
+L(loop_small):
+#ifdef __LITTLE_ENDIAN__
+	lwzx	r12,0,r8
+#else
+	lwbrx	r12,0,r8
+#endif
+	cmpb	r3,r12,r4
+	cmplw	r8,r0
+	cmplwi	cr7,r3,0
+	bne	cr7,L(done)
+	addi	r8,r8,-4
+	bne	L(loop_small)
+	blr
+
+END (__memrchr)
+weak_alias (__memrchr, memrchr)
+libc_hidden_builtin_def (memrchr)
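
The loops above lean on cmpb, which writes 0xff into every byte position where its two operands hold equal bytes. A portable sketch of that primitive, under the assumption that marking matches with 0x80 (instead of 0xff) is enough for the later position arithmetic:

    #include <stdint.h>

    /* Exact byte-equality mask: 0x80 in each byte of W that equals the
       corresponding byte of PAT, 0x00 elsewhere.  No cross-byte carries:
       (b & 0x7f) + 0x7f never reaches 0x100.  */
    static uint32_t byte_eq_mask (uint32_t w, uint32_t pat)
    {
      uint32_t x = w ^ pat;  /* A byte of X is zero iff the bytes matched.  */
      return ~(((x & 0x7f7f7f7fu) + 0x7f7f7f7fu) | x | 0x7f7f7f7fu);
    }
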
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc32/power7/memset.S b/REORG.TODO/sysdeps/powerpc/powerpc32/power7/memset.S
new file mode 100644
index 0000000000..b431f5086d
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc32/power7/memset.S
@@ -0,0 +1,431 @@
+/* Optimized memset implementation for PowerPC32/POWER7.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Luis Machado <luisgpm@br.ibm.com>.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]);
+   Returns 's'.  */
+
+	.machine  power7
+EALIGN (memset, 5, 0)
+	CALL_MCOUNT
+
+	.align	4
+L(_memset):
+	cmplwi	cr7,5,31
+	cmplwi	cr6,5,8
+	mr	10,3		/* Save original argument for later.  */
+	mr	7,1		/* Save original r1 for later.  */
+	cfi_offset(31,-8)
+
+	/* Replicate byte to word.  */
+	insrwi	4,4,8,16
+	insrwi	4,4,16,0
+
+	ble	cr6,L(small)	/* If length <= 8, use short copy code.  */
+
+	neg	0,3
+	ble	cr7,L(medium)	/* If length < 32, use medium copy code.  */
+
+	/* Save our word twice to create a doubleword that we will later
+	   copy to a FPR.  */
+	stwu	1,-32(1)
+	andi.	11,10,7		/* Check alignment of DST.  */
+	mr	12,5
+	stw	4,24(1)
+	stw	4,28(1)
+	beq	L(big_aligned)
+
+	clrlwi	0,0,29
+	mtocrf	0x01,0
+	subf	5,0,5
+
+	/* Get DST aligned to 8 bytes.  */
+1:	bf	31,2f
+
+	stb	4,0(10)
+	addi	10,10,1
+2:	bf	30,4f
+
+	sth	4,0(10)
+	addi	10,10,2
+4:	bf	29,L(big_aligned)
+
+	stw	4,0(10)
+	addi	10,10,4
+
+	.align	4
+L(big_aligned):
+	cmplwi	cr5,5,255
+	li	0,32
+	cmplwi	cr1,5,160
+	dcbtst	0,10
+	cmplwi	cr6,4,0
+	srwi	9,5,3		/* Number of full doublewords remaining.  */
+	crand	27,26,21
+	mtocrf	0x01,9
+	bt	27,L(huge)
+
+	/* From this point on, we'll copy 32+ bytes and the value
+	   isn't 0 (so we can't use dcbz).  */
+
+	srwi	8,5,5
+	clrlwi	11,5,29
+	cmplwi	cr6,11,0
+	cmplwi	cr1,9,4
+	mtctr	8
+
+	/* Copy 1~3 doublewords so the main loop starts
+	at a multiple of 32 bytes.  */
+
+	bf	30,1f
+
+	stw	4,0(10)
+	stw	4,4(10)
+	stw	4,8(10)
+	stw	4,12(10)
+	addi	10,10,16
+	bf	31,L(big_loop)
+
+	stw	4,0(10)
+	stw	4,4(10)
+	addi	10,10,8
+	mr	12,10
+	blt	cr1,L(tail_bytes)
+
+	b	L(big_loop)
+
+	.align	4
+1:	/* Copy 1 doubleword.  */
+	bf	31,L(big_loop)
+
+	stw	4,0(10)
+	stw	4,4(10)
+	addi	10,10,8
+
+	/* First use a 32-byte loop with stw's to try to avoid the LHS
+	   (load-hit-store) hazard from the lfd we will do next.  Also,
+	   ping-pong between r10 and r12 to avoid AGEN delays.  */
+	.align	4
+L(big_loop):
+	addi	12,10,32
+	stw	4,0(10)
+	stw	4,4(10)
+	stw	4,8(10)
+	stw	4,12(10)
+	stw	4,16(10)
+	stw	4,20(10)
+	stw	4,24(10)
+	stw	4,28(10)
+	bdz	L(tail_bytes)
+
+	addi	10,10,64
+	stw	4,0(12)
+	stw	4,4(12)
+	stw	4,8(12)
+	stw	4,12(12)
+	stw	4,16(12)
+	stw	4,20(12)
+	stw	4,24(12)
+	stw	4,28(12)
+	bdnz	L(big_loop_fast_setup)
+
+	mr	12,10
+	b	L(tail_bytes)
+
+	/* Now that we're probably past the LHS window, use VSX stores to
+	   speed up the loop.  */
+L(big_loop_fast_setup):
+	li	11,24
+	li	6,16
+	lxvdsx	4,1,11
+
+	.align	4
+L(big_loop_fast):
+	addi	12,10,32
+	stxvd2x	4,0,10
+	stxvd2x	4,10,6
+	bdz	L(tail_bytes)
+
+	addi	10,10,64
+	stxvd2x	4,0,12
+	stxvd2x	4,12,6
+	bdnz	L(big_loop_fast)
+
+	mr	12,10
+
+	.align	4
+L(tail_bytes):
+
+	/* Check for tail bytes.  */
+	mr	1,7		/* Restore r1.  */
+	beqlr	cr6
+
+	clrlwi	0,5,29
+	mtocrf	0x01,0
+
+	/*  At this point we have a tail of 0-7 bytes and we know that the
+	destination is doubleword-aligned.  */
+4:	/* Copy 4 bytes.  */
+	bf	29,2f
+
+	stw	4,0(12)
+	addi	12,12,4
+2:	/* Copy 2 bytes.  */
+	bf	30,1f
+
+	sth	4,0(12)
+	addi	12,12,2
+1:	/* Copy 1 byte.  */
+	bflr	31
+
+	stb	4,0(12)
+	blr
+
+
+	/* Special case when value is 0 and we have a long length to deal
+	   with.  Use dcbz to zero out 128 bytes at a time.  Before using
+	   dcbz though, we need to get the destination 128-byte aligned.  */
+	.align	4
+L(huge):
+	lfd	4,24(1)
+	andi.	11,10,127
+	neg	0,10
+	beq	L(huge_aligned)
+
+	clrlwi	0,0,25
+	subf	5,0,5
+	srwi	0,0,3
+	mtocrf  0x01,0
+
+	/* Get DST aligned to 128 bytes.  */
+8:	bf	28,4f
+
+	stfd	4,0(10)
+	stfd	4,8(10)
+	stfd	4,16(10)
+	stfd	4,24(10)
+	stfd	4,32(10)
+	stfd	4,40(10)
+	stfd	4,48(10)
+	stfd	4,56(10)
+	addi	10,10,64
+	.align	4
+4:	bf	29,2f
+
+	stfd	4,0(10)
+	stfd	4,8(10)
+	stfd	4,16(10)
+	stfd	4,24(10)
+	addi	10,10,32
+	.align	4
+2:	bf	30,1f
+
+	stfd	4,0(10)
+	stfd	4,8(10)
+	addi	10,10,16
+	.align	4
+1:	bf	31,L(huge_aligned)
+
+	stfd	4,0(10)
+	addi	10,10,8
+
+L(huge_aligned):
+	srwi	8,5,7
+	clrlwi	11,5,25
+	cmplwi	cr6,11,0
+	mtctr	8
+
+	/* Copies 128-bytes at a time.  */
+	.align	4
+L(huge_loop):
+	dcbz	0,10
+	addi	10,10,128
+	bdnz	L(huge_loop)
+
+	/* We have a tail of 0~127 bytes to handle.  */
+	mr	1,7		/* Restore r1.  */
+	beqlr	cr6
+
+	subf	9,3,10
+	subf	5,9,12
+	srwi	8,5,3
+	cmplwi	cr6,8,0
+	mtocrf	0x01,8
+
+	/* We have a tail of 1~127 bytes.  Copy up to 15 doublewords for
+	speed.  We'll handle the resulting tail bytes later.  */
+	beq	cr6,L(tail)
+
+8:	bf	28,4f
+
+	stfd	4,0(10)
+	stfd	4,8(10)
+	stfd	4,16(10)
+	stfd	4,24(10)
+	stfd	4,32(10)
+	stfd	4,40(10)
+	stfd	4,48(10)
+	stfd	4,56(10)
+	addi	10,10,64
+	.align	4
+4:	bf	29,2f
+
+	stfd	4,0(10)
+	stfd	4,8(10)
+	stfd	4,16(10)
+	stfd	4,24(10)
+	addi	10,10,32
+	.align	4
+2:	bf	30,1f
+
+	stfd	4,0(10)
+	stfd	4,8(10)
+	addi	10,10,16
+	.align	4
+1:	bf	31,L(tail)
+
+	stfd	4,0(10)
+	addi	10,10,8
+
+	/* Handle the rest of the tail bytes here.  */
+L(tail):
+	mtocrf	0x01,5
+
+	.align	4
+4:	bf	29,2f
+
+	stw	4,0(10)
+	addi	10,10,4
+	.align	4
+2:	bf	30,1f
+
+	sth	4,0(10)
+	addi	10,10,2
+	.align	4
+1:	bflr	31
+
+	stb	4,0(10)
+	blr
+
+
+	/* Expanded tree to copy tail bytes without increments.  */
+	.align	4
+L(copy_tail):
+	bf	29,L(FXX)
+
+	stw	4,0(10)
+	bf	30,L(TFX)
+
+	sth	4,4(10)
+	bflr	31
+
+	stb	4,6(10)
+	blr
+
+	.align	4
+L(FXX):	bf	30,L(FFX)
+
+	sth	4,0(10)
+	bflr	31
+
+	stb	4,2(10)
+	blr
+
+	.align	4
+L(TFX):	bflr	31
+
+	stb	4,4(10)
+	blr
+
+	.align	4
+L(FFX):	bflr	31
+
+	stb	4,0(10)
+	blr
+
+	/* Handle copies of 9~31 bytes.  */
+	.align	4
+L(medium):
+	/* At least 9 bytes to go.  */
+	andi.	11,10,3
+	clrlwi	0,0,30
+	beq	L(medium_aligned)
+
+	/* Force 4-byte alignment for DST.  */
+	mtocrf	0x01,0
+	subf	5,0,5
+1:	/* Copy 1 byte.  */
+	bf	31,2f
+
+	stb	4,0(10)
+	addi	10,10,1
+2:	/* Copy 2 bytes.  */
+	bf	30,L(medium_aligned)
+
+	sth	4,0(10)
+	addi	10,10,2
+
+	.align	4
+L(medium_aligned):
+	/* At least 6 bytes to go, and DST is word-aligned.  */
+	cmplwi	cr1,5,16
+	mtocrf	0x01,5
+	blt	cr1,8f
+
+	/* Copy 16 bytes.  */
+	stw	4,0(10)
+	stw	4,4(10)
+	stw	4,8(10)
+	stw	4,12(10)
+	addi	10,10,16
+8:	/* Copy 8 bytes.  */
+	bf	28,4f
+
+	stw	4,0(10)
+	stw	4,4(10)
+	addi	10,10,8
+4:	/* Copy 4 bytes.  */
+	bf	29,2f
+
+	stw	4,0(10)
+	addi	10,10,4
+2:	/* Copy 2-3 bytes.  */
+	bf	30,1f
+
+	sth	4,0(10)
+	addi	10,10,2
+1:	/* Copy 1 byte.  */
+	bflr	31
+
+	stb	4,0(10)
+	blr
+
+	/* Handles copies of 0~8 bytes.  */
+	.align	4
+L(small):
+	mtocrf	0x01,5
+	bne	cr6,L(copy_tail)
+
+	stw	4,0(10)
+	stw	4,4(10)
+	blr
+
+END (memset)
+libc_hidden_builtin_def (memset)
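
The two insrwi instructions at the top splat the fill byte across a 32-bit word before any store path runs; the same value is later stored twice on the stack so lfd/lxvdsx can splat it wider. The word-level step, as a shift/or sketch:

    #include <stdint.h>

    /* What insrwi 4,4,8,16 / insrwi 4,4,16,0 compute: 0xNN -> 0xNNNNNNNN.  */
    static uint32_t replicate_byte (int c)
    {
      uint32_t w = (uint32_t) c & 0xff;
      w |= w << 8;    /* insrwi 4,4,8,16  */
      w |= w << 16;   /* insrwi 4,4,16,0  */
      return w;
    }
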
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc32/power7/multiarch/Implies b/REORG.TODO/sysdeps/powerpc/powerpc32/power7/multiarch/Implies
new file mode 100644
index 0000000000..22c12fd393
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc32/power7/multiarch/Implies
@@ -0,0 +1 @@
+powerpc/powerpc32/power6/multiarch
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc32/power7/rawmemchr.S b/REORG.TODO/sysdeps/powerpc/powerpc32/power7/rawmemchr.S
new file mode 100644
index 0000000000..22edcfb209
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc32/power7/rawmemchr.S
@@ -0,0 +1,110 @@
+/* Optimized rawmemchr implementation for PowerPC32/POWER7 using cmpb insn.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Luis Machado <luisgpm@br.ibm.com>.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* void * [r3] rawmemchr (const void *s [r3], int c [r4])  */
+	.machine  power7
+ENTRY (__rawmemchr)
+	CALL_MCOUNT
+	dcbt	0,r3
+	clrrwi	r8,r3,2	      /* Align the address to word boundary.  */
+
+	/* Replicate byte to word.  */
+	insrwi	r4,r4,8,16
+	insrwi	r4,r4,16,0
+
+	/* Now r4 has a word of c bytes.  */
+
+	rlwinm	r6,r3,3,27,28 /* Calculate padding.  */
+	lwz	r12,0(r8)     /* Load word from memory.  */
+	cmpb	r5,r12,r4     /* Compare each byte against c byte.  */
+#ifdef __LITTLE_ENDIAN__
+	srw	r5,r5,r6
+	slw	r5,r5,r6
+#else
+	slw	r5,r5,r6      /* Move left to discard ignored bits.  */
+	srw	r5,r5,r6      /* Bring the bits back as zeros.  */
+#endif
+	cmpwi	cr7,r5,0      /* If r5 == 0, no c bytes have been found.  */
+	bne	cr7,L(done)
+
+	mtcrf   0x01,r8
+
+	/* Are we now aligned to a doubleword boundary?  If so, skip to
+	   the main loop.  Otherwise, go through the alignment code.  */
+
+	bt	29,L(loop)
+
+	/* Handle WORD2 of pair.  */
+	lwzu	r12,4(r8)
+	cmpb	r5,r12,r4
+	cmpwi	cr7,r5,0
+	bne	cr7,L(done)
+	b	L(loop)	      /* We branch here (rather than falling through)
+				 to skip the nops due to heavy alignment
+				 of the loop below.  */
+
+	/* Main loop to look for the 'c' byte in the string.  Since it's a
+	   small loop (< 8 instructions), align it to 32 bytes.  */
+	.p2align  5
+L(loop):
+	/* Load two words, compare and merge in a
+	   single register for speed.  This is an attempt
+	   to speed up the byte-checking process for bigger strings.  */
+	lwz	r12,4(r8)
+	lwzu	r11,8(r8)
+	cmpb	r5,r12,r4
+	cmpb	r6,r11,r4
+	or	r7,r5,r6
+	cmpwi	cr7,r7,0
+	beq	cr7,L(loop)
+
+	/* OK, one (or both) of the words contains a 'c' byte.  Check
+	   the first word and decrement the address in case the first
+	   word really contains a c byte.  */
+
+	cmpwi	cr6,r5,0
+	addi	r8,r8,-4
+	bne	cr6,L(done)
+
+	/* The 'c' byte must be in the second word.  Adjust the address
+	   again and move the result of cmpb to r5 so we can calculate the
+	   pointer.  */
+	mr	r5,r6
+	addi	r8,r8,4
+
+	/* r5 has the output of the cmpb instruction, that is, it contains
+	   0xff in the same position as the 'c' byte in the original
+	   word from the string.  Use that fact to find out what is
+	   the position of the byte inside the string.  */
+L(done):
+#ifdef __LITTLE_ENDIAN__
+	addi    r0,r5,-1
+	andc    r0,r0,r5
+	popcntw	r0,r0
+#else
+	cntlzw	r0,r5	      /* Count leading zeros before the match.  */
+#endif
+	srwi	r0,r0,3	      /* Convert leading zeros to bytes.  */
+	add	r3,r8,r0      /* Return address of the matching char.  */
+	blr
+END (__rawmemchr)
+weak_alias (__rawmemchr,rawmemchr)
+libc_hidden_builtin_def (__rawmemchr)
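
L(done) turns the cmpb mask into a byte offset in an endian-specific way: big endian counts leading zeros, little endian isolates the lowest set bit with (m - 1) & ~m and popcounts it, which is exactly the addi/andc/popcntw sequence. A sketch with GCC builtins (the mask must be non-zero, as it is on this path):

    #include <stdint.h>

    /* Offset within the word of the first matching byte, given a non-zero
       cmpb-style mask.  Mirrors the #ifdef __LITTLE_ENDIAN__ split above.  */
    static unsigned first_match_offset (uint32_t mask, int little_endian)
    {
      if (little_endian)
        return (unsigned) __builtin_popcount ((mask - 1) & ~mask) >> 3;
      return (unsigned) __builtin_clz (mask) >> 3;
    }
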
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc32/power7/strcasecmp.S b/REORG.TODO/sysdeps/powerpc/powerpc32/power7/strcasecmp.S
new file mode 100644
index 0000000000..964875a13b
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc32/power7/strcasecmp.S
@@ -0,0 +1,129 @@
+/* Optimized strcasecmp implementation for PowerPC32.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <locale-defines.h>
+
+/* int [r3] strcasecmp (const char *s1 [r3], const char *s2 [r4])
+
+   or if defined USE_IN_EXTENDED_LOCALE_MODEL:
+
+   int [r3] strcasecmp_l (const char *s1 [r3], const char *s2 [r4],
+                          __locale_t loc [r5]) */
+
+#ifndef STRCMP
+# define __STRCMP __strcasecmp
+# define STRCMP   strcasecmp
+#endif
+
+ENTRY (__STRCMP)
+
+#define rRTN	r3	/* Return value */
+#define rSTR1	r5	/* 1st string */
+#define rSTR2	r4	/* 2nd string */
+#define rLOCARG	r5	/* 3rd argument: locale_t */
+#define rCHAR1	r6	/* Byte read from 1st string */
+#define rCHAR2	r7	/* Byte read from 2nd string */
+#define rADDR1	r8	/* Address of tolower(rCHAR1) */
+#define rADDR2	r12	/* Address of tolower(rCHAR2) */
+#define rLWR1	r8	/* Byte tolower(rCHAR1) */
+#define rLWR2	r12	/* Byte tolower(rCHAR2) */
+#define rTMP	r0
+#define rGOT	r9	/* Address of the Global Offset Table */
+#define rLOC	r11	/* Default locale address */
+
+	cmpw    cr7, r3, r4
+#ifndef USE_IN_EXTENDED_LOCALE_MODEL
+# ifdef SHARED
+	mflr	rTMP
+	bcl	20,31,.L1
+.L1:	mflr	rGOT
+	addis	rGOT, rGOT, _GLOBAL_OFFSET_TABLE_-.L1@ha
+	addi 	rGOT, rGOT, _GLOBAL_OFFSET_TABLE_-.L1@l
+	lwz	rLOC, __libc_tsd_LOCALE@got@tprel(rGOT)
+	add 	rLOC, rLOC, __libc_tsd_LOCALE@tls
+	lwz	rLOC, 0(rLOC)
+	mtlr	rTMP
+# else
+	lis	rTMP,_GLOBAL_OFFSET_TABLE_@ha
+	la	rLOC,_GLOBAL_OFFSET_TABLE_@l(rTMP)
+	lwz	rLOC, __libc_tsd_LOCALE@got@tprel(rGOT)
+	add	rLOC, rLOC, __libc_tsd_LOCALE@tls
+	lwz	rLOC, 0(rLOC)
+# endif /* SHARED */
+#else
+	mr	rLOC, rLOCARG
+#endif
+	mr	rSTR1, rRTN
+	lwz	rLOC, LOCALE_CTYPE_TOLOWER(rLOC)
+	li	rRTN, 0
+	beqlr	cr7
+
+	/* Unrolled loop for POWER: loads are done with 'lbz' plus
+	offset, and the string pointers are only updated at the end
+	of the unrolled body.  */
+
+L(loop):
+	lbz	rCHAR1, 0(rSTR1)	/* Load char from s1 */
+	lbz	rCHAR2, 0(rSTR2)	/* Load char from s2 */
+	sldi	rADDR1, rCHAR1, 2	/* Calculate address for tolower(*s1) */
+	sldi	rADDR2, rCHAR2, 2	/* Calculate address for tolower(*s2) */
+	lwzx	rLWR1, rLOC, rADDR1	/* Load tolower(*s1) */
+	lwzx	rLWR2, rLOC, rADDR2	/* Load tolower(*s2) */
+	cmpwi	cr7, rCHAR1, 0		/* *s1 == '\0' ? */
+	subf.	r3, rLWR2, rLWR1
+	bnelr
+	beqlr	cr7
+	lbz	rCHAR1, 1(rSTR1)
+	lbz	rCHAR2, 1(rSTR2)
+	sldi	rADDR1, rCHAR1, 2
+	sldi	rADDR2, rCHAR2, 2
+	lwzx	rLWR1, rLOC, rADDR1
+	lwzx	rLWR2, rLOC, rADDR2
+	cmpwi	cr7, rCHAR1, 0
+	subf.	r3, rLWR2, rLWR1
+	bnelr
+	beqlr	cr7
+	lbz	rCHAR1, 2(rSTR1)
+	lbz	rCHAR2, 2(rSTR2)
+	sldi	rADDR1, rCHAR1, 2
+	sldi	rADDR2, rCHAR2, 2
+	lwzx	rLWR1, rLOC, rADDR1
+	lwzx	rLWR2, rLOC, rADDR2
+	cmpwi	cr7, rCHAR1, 0
+	subf.	r3, rLWR2, rLWR1
+	bnelr
+	beqlr	cr7
+	lbz	rCHAR1, 3(rSTR1)
+	lbz	rCHAR2, 3(rSTR2)
+	/* Increment both string descriptors */
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+	sldi	rADDR1, rCHAR1, 2
+	sldi	rADDR2, rCHAR2, 2
+	lwzx	rLWR1, rLOC, rADDR1
+	lwzx	rLWR2, rLOC, rADDR2
+	cmpwi	cr7, rCHAR1, 0
+	subf.	r3, rLWR2, rLWR1
+	bnelr
+	bne	cr7,L(loop)
+	blr
+END (__STRCMP)
+
+weak_alias (__STRCMP, STRCMP)
+libc_hidden_builtin_def (__STRCMP)
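
Each unrolled step above is a locale tolower-table lookup on one byte from each string (the sldi by 2 scales the byte to a 4-byte table entry) followed by subtract-and-test. Reference behavior as a sketch, with tolower() standing in for the LOCALE_CTYPE_TOLOWER table:

    #include <ctype.h>

    /* Per-byte logic of the unrolled loop, without the unrolling.  */
    int my_strcasecmp (const char *s1, const char *s2)
    {
      const unsigned char *p1 = (const unsigned char *) s1;
      const unsigned char *p2 = (const unsigned char *) s2;
      int d;

      while ((d = tolower (*p1) - tolower (*p2)) == 0 && *p1 != '\0')
        p1++, p2++;
      return d;
    }
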
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc32/power7/strcasecmp_l.S b/REORG.TODO/sysdeps/powerpc/powerpc32/power7/strcasecmp_l.S
new file mode 100644
index 0000000000..c13c4ebcb8
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc32/power7/strcasecmp_l.S
@@ -0,0 +1,5 @@
+#define USE_IN_EXTENDED_LOCALE_MODEL
+#define STRCMP   strcasecmp_l
+#define __STRCMP __strcasecmp_l
+
+#include "strcasecmp.S"
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc32/power7/strchr.S b/REORG.TODO/sysdeps/powerpc/powerpc32/power7/strchr.S
new file mode 100644
index 0000000000..75ca6acb98
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc32/power7/strchr.S
@@ -0,0 +1,225 @@
+/* Optimized strchr implementation for PowerPC32/POWER7 using cmpb insn.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Luis Machado <luisgpm@br.ibm.com>.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* char * [r3] strchr (const char *s [r3], int c [r4])  */
+	.machine  power7
+ENTRY (strchr)
+	CALL_MCOUNT
+	dcbt	0,r3
+	clrrwi	r8,r3,2	      /* Align the address to word boundary.  */
+	cmpwi	cr7,r4,0
+	lwz	r12,0(r8)     /* Load word from memory.  */
+	li	r0,0	      /* Word with null chars to use
+				 with cmpb.  */
+
+	rlwinm	r6,r3,3,27,28 /* Calculate padding.  */
+
+	beq	cr7,L(null_match)
+
+	/* Replicate byte to word.  */
+	insrwi	r4,r4,8,16
+	insrwi	r4,r4,16,0
+
+	/* Now r4 has a word of c bytes and r0 has
+	   a word of null bytes.  */
+
+	cmpb	r10,r12,r4     /* Compare each byte against c byte.  */
+	cmpb	r11,r12,r0     /* Compare each byte against null byte.  */
+
+	/* Move the words left and right to discard the bits that are
+	   not part of the string and to bring them back as zeros.  */
+#ifdef __LITTLE_ENDIAN__
+	srw	r10,r10,r6
+	srw	r11,r11,r6
+	slw	r10,r10,r6
+	slw	r11,r11,r6
+#else
+	slw	r10,r10,r6
+	slw	r11,r11,r6
+	srw	r10,r10,r6
+	srw	r11,r11,r6
+#endif
+	or	r5,r10,r11    /* OR the results to speed things up.  */
+	cmpwi	cr7,r5,0      /* If r5 == 0, no c or null bytes
+				 have been found.  */
+	bne	cr7,L(done)
+
+	mtcrf   0x01,r8
+
+	/* Are we now aligned to a doubleword boundary?  If so, skip to
+	   the main loop.  Otherwise, go through the alignment code.  */
+
+	bt	29,L(loop)
+
+	/* Handle WORD2 of pair.  */
+	lwzu	r12,4(r8)
+	cmpb	r10,r12,r4
+	cmpb	r11,r12,r0
+	or	r5,r10,r11
+	cmpwi	cr7,r5,0
+	bne	cr7,L(done)
+	b	L(loop)	      /* We branch here (rather than falling through)
+				 to skip the nops due to heavy alignment
+				 of the loop below.  */
+
+	.p2align  5
+L(loop):
+	/* Load two words, compare and merge in a
+	   single register for speed.  This is an attempt
+	   to speed up the null-checking process for bigger strings.  */
+	lwz	r12,4(r8)
+	lwzu	r9,8(r8)
+	cmpb	r10,r12,r4
+	cmpb	r11,r12,r0
+	cmpb	r6,r9,r4
+	cmpb	r7,r9,r0
+	or	r12,r10,r11
+	or	r9,r6,r7
+	or	r5,r12,r9
+	cmpwi	cr7,r5,0
+	beq	cr7,L(loop)
+
+	/* OK, one (or both) of the words contains a c/null byte.  Check
+	   the first word and decrement the address in case the first
+	   word really contains a c/null byte.  */
+
+	cmpwi	cr6,r12,0
+	addi	r8,r8,-4
+	bne	cr6,L(done)
+
+	/* The c/null byte must be in the second word.  Adjust the address
+	   again and move the result of cmpb to r10/r11 so we can calculate
+	   the pointer.  */
+
+	mr	r10,r6
+	mr	r11,r7
+	addi	r8,r8,4
+
+	/* r10/r11 have the output of the cmpb instructions, that is,
+	   0xff in the same position as the c/null byte in the original
+	   word from the string.  Use that to calculate the pointer.  */
+L(done):
+#ifdef __LITTLE_ENDIAN__
+	addi    r3,r10,-1
+	andc    r3,r3,r10
+	popcntw	r0,r3
+	addi    r4,r11,-1
+	andc    r4,r4,r11
+	cmplw	cr7,r3,r4
+	bgt	cr7,L(no_match)
+#else
+	cntlzw	r0,r10	      /* Count leading zeros before c matches.  */
+	cmplw	cr7,r11,r10
+	bgt	cr7,L(no_match)
+#endif
+	srwi	r0,r0,3	      /* Convert leading zeros to bytes.  */
+	add	r3,r8,r0      /* Return address of the matching c byte
+				 or null in case c was not found.  */
+	blr
+
+	.align	4
+L(no_match):
+	li	r3,0
+	blr
+
+/* We are here because strchr was called with a null byte.  */
+	.align	4
+L(null_match):
+	/* r0 has a word of null bytes.  */
+
+	cmpb	r5,r12,r0     /* Compare each byte against null bytes.  */
+
+	/* Move the words left and right to discard the bits that are
+	   not part of the string and bring them back as zeros.  */
+#ifdef __LITTLE_ENDIAN__
+	srw	r5,r5,r6
+	slw	r5,r5,r6
+#else
+	slw	r5,r5,r6
+	srw	r5,r5,r6
+#endif
+	cmpwi	cr7,r5,0      /* If r5 == 0, no null bytes
+				 have been found.  */
+	bne	cr7,L(done_null)
+
+	mtcrf   0x01,r8
+
+	/* Are we now aligned to a doubleword boundary?  If so, skip to
+	   the main loop.  Otherwise, go through the alignment code.  */
+
+	bt	29,L(loop_null)
+
+	/* Handle WORD2 of pair.  */
+	lwzu	r12,4(r8)
+	cmpb    r5,r12,r0
+	cmpwi	cr7,r5,0
+	bne	cr7,L(done_null)
+	b	L(loop_null)  /* We branch here (rather than falling through)
+				 to skip the nops due to heavy alignment
+				 of the loop below.  */
+
+	/* Main loop to look for the end of the string.  Since it's a
+	   small loop (< 8 instructions), align it to 32 bytes.  */
+	.p2align  5
+L(loop_null):
+	/* Load two words, compare and merge in a
+	   single register for speed.  This is an attempt
+	   to speed up the null-checking process for bigger strings.  */
+	lwz	r12,4(r8)
+	lwzu    r11,8(r8)
+	cmpb	r5,r12,r0
+	cmpb	r10,r11,r0
+	or	r6,r5,r10
+	cmpwi	cr7,r6,0
+	beq	cr7,L(loop_null)
+
+	/* OK, one (or both) of the words contains a null byte.  Check
+	   the first word and decrement the address in case the first
+	   word really contains a null byte.  */
+
+	cmpwi	cr6,r5,0
+	addi	r8,r8,-4
+	bne	cr6,L(done_null)
+
+	/* The null byte must be in the second word.  Adjust the address
+	   again and move the result of cmpb to r5 so we can calculate the
+	   pointer.  */
+
+	mr	r5,r10
+	addi	r8,r8,4
+
+	/* r5 has the output of the cmpb instruction, that is, it contains
+	   0xff in the same position as the null byte in the original
+	   word from the string.  Use that to calculate the pointer.  */
+L(done_null):
+#ifdef __LITTLE_ENDIAN__
+	addi    r0,r5,-1
+	andc    r0,r0,r5
+	popcntw	r0,r0
+#else
+	cntlzw	r0,r5	      /* Count leading zeros before the match.  */
+#endif
+	srwi	r0,r0,3	      /* Convert leading zeros to bytes.  */
+	add	r3,r8,r0      /* Return address of the matching null byte.  */
+	blr
+END (strchr)
+weak_alias (strchr, index)
+libc_hidden_builtin_def (strchr)
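
When the merged loop exits, L(done) holds one mask for 'c' matches (r10) and one for NUL matches (r11) and must return NULL if the terminator comes first. A sketch of that tie-break in terms of byte offsets (hypothetical helpers; the assembly instead compares the raw mask values or lowest-bit positions directly):

    #include <stddef.h>
    #include <stdint.h>

    /* Index of the first flagged byte, or 4 when the mask is empty.  */
    static unsigned first_or_4 (uint32_t mask, int little_endian)
    {
      if (mask == 0)
        return 4;
      return (unsigned) (little_endian ? __builtin_ctz (mask)
                                       : __builtin_clz (mask)) >> 3;
    }

    /* NULL when the NUL precedes the match, else the match address.  */
    static char *pick_match (char *word, uint32_t c_mask, uint32_t nul_mask,
                             int little_endian)
    {
      unsigned c_off = first_or_4 (c_mask, little_endian);
      unsigned n_off = first_or_4 (nul_mask, little_endian);

      return n_off < c_off ? NULL : word + c_off;
    }
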
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc32/power7/strchrnul.S b/REORG.TODO/sysdeps/powerpc/powerpc32/power7/strchrnul.S
new file mode 100644
index 0000000000..426137e11d
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc32/power7/strchrnul.S
@@ -0,0 +1,127 @@
+/* Optimized strchrnul implementation for PowerPC32/POWER7 using cmpb insn.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Luis Machado <luisgpm@br.ibm.com>.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* char * [r3] strchrnul (const char *s [r3], int c [r4])  */
+	.machine  power7
+ENTRY (__strchrnul)
+	CALL_MCOUNT
+	dcbt	0,r3
+	clrrwi	r8,r3,2	      /* Align the address to word boundary.  */
+
+	/* Replicate byte to word.  */
+	insrwi  r4,r4,8,16
+	insrwi  r4,r4,16,0
+
+	rlwinm	r6,r3,3,27,28 /* Calculate padding.  */
+	lwz	r12,0(r8)     /* Load word from memory.  */
+	li	r0,0	      /* Word with null chars to use
+				 with cmpb.  */
+
+	/* Now r4 has a word of c bytes and r0 has
+	   a word of null bytes.  */
+
+	cmpb	r10,r12,r0    /* Compare each byte against null byte.  */
+	cmpb	r9,r12,r4     /* Compare each byte against c byte.  */
+
+	/* Move the words left and right to discard the bits that are
+	   not part of the string and bring them back as zeros.  */
+#ifdef __LITTLE_ENDIAN__
+	srw	r10,r10,r6
+	srw	r9,r9,r6
+	slw	r10,r10,r6
+	slw	r9,r9,r6
+#else
+	slw	r10,r10,r6
+	slw	r9,r9,r6
+	srw	r10,r10,r6
+	srw	r9,r9,r6
+#endif
+	or	r5,r9,r10     /* OR the results to speed things up.  */
+	cmpwi	cr7,r5,0      /* If r5 == 0, no c or null bytes
+				 have been found.  */
+	bne	cr7,L(done)
+
+	mtcrf   0x01,r8
+
+	/* Are we now aligned to a doubleword boundary?  If so, skip to
+	   the main loop.  Otherwise, go through the alignment code.  */
+
+	bt	29,L(loop)
+
+	/* Handle WORD2 of pair.  */
+	lwzu	r12,4(r8)
+	cmpb	r10,r12,r0
+	cmpb	r9,r12,r4
+	or	r5,r9,r10
+	cmpwi	cr7,r5,0
+	bne	cr7,L(done)
+	b	L(loop)	      /* We branch here (rather than falling through)
+				 to skip the nops due to heavy alignment
+				 of the loop below.  */
+
+	.p2align  5
+L(loop):
+	/* Load two words, compare and merge in a
+	   single register for speed.  This is an attempt
+	   to speed up the null-checking process for bigger strings.  */
+	lwz	r12,4(r8)
+	lwzu	r11,8(r8)
+	cmpb	r10,r12,r0
+	cmpb	r9,r12,r4
+	cmpb	r6,r11,r0
+	cmpb	r7,r11,r4
+	or	r5,r9,r10
+	or	r10,r6,r7
+	or	r11,r5,r10
+	cmpwi	cr7,r11,0
+	beq	cr7,L(loop)
+
+	/* OK, one (or both) of the words contains a c/null byte.  Check
+	   the first word and decrement the address in case the first
+	   word really contains a c/null byte.  */
+
+	cmpwi	cr6,r5,0
+	addi	r8,r8,-4
+	bne	cr6,L(done)
+
+	/* The c/null byte must be in the second word.  Adjust the address
+	   again and move the result of cmpb to r5 so we can calculate the
+	   pointer.  */
+	mr	r5,r10
+	addi	r8,r8,4
+
+	/* r5 has the output of the cmpb instruction, that is, it contains
+	   0xff in the same position as the c/null byte in the original
+	   word from the string.  Use that to calculate the pointer.  */
+L(done):
+#ifdef __LITTLE_ENDIAN__
+	addi    r0,r5,-1
+	andc    r0,r0,r5
+	popcntw	r0,r0
+#else
+	cntlzw	r0,r5	      /* Count leading zeros before the match.  */
+#endif
+	srwi	r0,r0,3	      /* Convert leading zeros to bytes.  */
+	add	r3,r8,r0      /* Return address of matching c/null byte.  */
+	blr
+END (__strchrnul)
+weak_alias (__strchrnul,strchrnul)
+libc_hidden_builtin_def (__strchrnul)
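
Because strchrnul returns the terminator's address on a miss, the c-mask and NUL-mask can simply be ORed into one register with no later tie-break, which is the structural difference from strchr.S above. Reference semantics:

    /* Whichever stopping byte comes first, its address comes back.  */
    char *my_strchrnul (const char *s, int c)
    {
      while (*s != (char) c && *s != '\0')
        s++;
      return (char *) s;
    }
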
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc32/power7/strlen.S b/REORG.TODO/sysdeps/powerpc/powerpc32/power7/strlen.S
new file mode 100644
index 0000000000..3699791fa6
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc32/power7/strlen.S
@@ -0,0 +1,102 @@
+/* Optimized strlen implementation for PowerPC32/POWER7 using cmpb insn.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Luis Machado <luisgpm@br.ibm.com>.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* size_t [r3] strlen (const char *s [r3])  */
+	.machine  power7
+ENTRY (strlen)
+	CALL_MCOUNT
+	dcbt	0,r3
+	clrrwi	r4,r3,2	      /* Align the address to word boundary.  */
+	rlwinm	r6,r3,3,27,28 /* Calculate padding.  */
+	li	r0,0	      /* Word with null chars to use with cmpb.  */
+	li	r5,-1	      /* MASK = 0xffffffffffffffff.  */
+	lwz	r12,0(r4)     /* Load word from memory.  */
+#ifdef __LITTLE_ENDIAN__
+	slw	r5,r5,r6
+#else
+	srw	r5,r5,r6      /* MASK = MASK >> padding.  */
+#endif
+	orc	r9,r12,r5     /* Mask bits that are not part of the string.  */
+	cmpb	r10,r9,r0     /* Check for null bytes in WORD1.  */
+	cmpwi	cr7,r10,0     /* If r10 == 0, no null's have been found.  */
+	bne	cr7,L(done)
+
+	mtcrf   0x01,r4
+
+	/* Are we now aligned to a doubleword boundary?  If so, skip to
+	   the main loop.  Otherwise, go through the alignment code.  */
+
+	bt	29,L(loop)
+
+	/* Handle WORD2 of pair.  */
+	lwzu	r12,4(r4)
+	cmpb	r10,r12,r0
+	cmpwi	cr7,r10,0
+	bne	cr7,L(done)
+
+	/* Main loop to look for the end of the string.  Since it's a
+	   small loop (< 8 instructions), align it to 32 bytes.  */
+	.p2align  5
+L(loop):
+	/* Load two words, compare and merge in a
+	   single register for speed.  This is an attempt
+	   to speed up the null-checking process for bigger strings.  */
+
+	lwz	r12, 4(r4)
+	lwzu	r11, 8(r4)
+	cmpb	r10,r12,r0
+	cmpb	r9,r11,r0
+	or	r8,r9,r10     /* Merge everything in one word.  */
+	cmpwi	cr7,r8,0
+	beq	cr7,L(loop)
+
+	/* OK, one (or both) of the words contains a null byte.  Check
+	   the first word and decrement the address in case the first
+	   word really contains a null byte.  */
+
+	cmpwi	cr6,r10,0
+	addi	r4,r4,-4
+	bne	cr6,L(done)
+
+	/* The null byte must be in the second word.  Adjust the address
+	   again and move the result of cmpb to r10 so we can calculate the
+	   length.  */
+
+	mr	r10,r9
+	addi	r4,r4,4
+
+	/* r10 has the output of the cmpb instruction, that is, it contains
+	   0xff in the same position as the null byte in the original
+	   word from the string.  Use that to calculate the length.  */
+L(done):
+#ifdef __LITTLE_ENDIAN__
+	addi	r9, r10, -1   /* Form a mask from trailing zeros.  */
+	andc	r9, r9, r10
+	popcntw r0, r9	      /* Count the bits in the mask.  */
+#else
+	cntlzw	r0,r10	      /* Count leading zeros before the match.  */
+#endif
+	subf	r5,r3,r4
+	srwi	r0,r0,3	      /* Convert leading zeros to bytes.  */
+	add	r3,r5,r0      /* Compute final length.  */
+	blr
+END (strlen)
+libc_hidden_builtin_def (strlen)
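
Instead of shifting the first word, the prologue builds a mask (-1 shifted by the padding) and uses orc to force the bytes that precede s to non-zero, so the zero-byte test cannot fire on them. A sketch of that masking step:

    #include <stdint.h>

    /* What li r5,-1 / srw-or-slw / orc r9,r12,r5 do to the first aligned
       word; PAD_BYTES (0-3) is how far S sits past the word boundary.  */
    static uint32_t mask_leading_bytes (uint32_t word, unsigned pad_bytes,
                                        int little_endian)
    {
      uint32_t mask = 0xffffffffu;
      unsigned bits = pad_bytes * 8;

      mask = little_endian ? mask << bits : mask >> bits;
      return word | ~mask;  /* Garbage bytes become 0xff.  */
    }
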
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc32/power7/strncmp.S b/REORG.TODO/sysdeps/powerpc/powerpc32/power7/strncmp.S
new file mode 100644
index 0000000000..d4598e1930
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc32/power7/strncmp.S
@@ -0,0 +1,199 @@
+/* Optimized strcmp implementation for POWER7/PowerPC32.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* See strlen.S for comments on how the end-of-string testing works.  */
+
+/* int [r3] strncmp (const char *s1 [r3],
+		     const char *s2 [r4],
+		     size_t size [r5])  */
+
+EALIGN (strncmp,5,0)
+
+#define rTMP2	r0
+#define rRTN	r3
+#define rSTR1	r3	/* first string arg */
+#define rSTR2	r4	/* second string arg */
+#define rN	r5	/* max string length */
+#define rWORD1	r6	/* current word in s1 */
+#define rWORD2	r7	/* current word in s2 */
+#define rWORD3	r10
+#define rWORD4	r11
+#define rFEFE	r8	/* constant 0xfefefeff (-0x01010101) */
+#define r7F7F	r9	/* constant 0x7f7f7f7f */
+#define rNEG	r10	/* ~(word in s1 | 0x7f7f7f7f) */
+#define rBITDIF	r11	/* bits that differ in s1 & s2 words */
+#define rTMP	r12
+
+	dcbt	0,rSTR1
+	nop
+	or	rTMP,rSTR2,rSTR1
+	lis	r7F7F,0x7f7f
+	dcbt	0,rSTR2
+	nop
+	clrlwi.	rTMP,rTMP,30
+	cmplwi	cr1,rN,0
+	lis	rFEFE,-0x101
+	bne	L(unaligned)
+/* We are word aligned, so set up for two loops: first a word
+   loop, then fall into the byte loop for any residual.  */
+	srwi.	rTMP,rN,2
+	clrlwi	rN,rN,30
+	addi	rFEFE,rFEFE,-0x101
+	addi	r7F7F,r7F7F,0x7f7f
+	cmplwi	cr1,rN,0
+	beq	L(unaligned)
+
+	mtctr	rTMP
+	lwz	rWORD1,0(rSTR1)
+	lwz	rWORD2,0(rSTR2)
+	b	L(g1)
+
+L(g0):
+	lwzu	rWORD1,4(rSTR1)
+	bne	cr1,L(different)
+	lwzu	rWORD2,4(rSTR2)
+L(g1):	add	rTMP,rFEFE,rWORD1
+	nor	rNEG,r7F7F,rWORD1
+	bdz	L(tail)
+	and.	rTMP,rTMP,rNEG
+	cmpw	cr1,rWORD1,rWORD2
+	beq	L(g0)
+
+/* OK. We've hit the end of the string. We need to be careful that
+   we don't compare two strings as different because of gunk beyond
+   the end of the strings...  */
+#ifdef __LITTLE_ENDIAN__
+L(endstring):
+	slwi	rTMP, rTMP, 1
+	addi    rTMP2, rTMP, -1
+	andc    rTMP2, rTMP2, rTMP
+	and	rWORD2, rWORD2, rTMP2		/* Mask off gunk.  */
+	and	rWORD1, rWORD1, rTMP2
+	rlwinm	rTMP2, rWORD2, 8, 0xffffffff	/* Byte reverse word.  */
+	rlwinm	rTMP, rWORD1, 8, 0xffffffff
+	rldimi	rTMP2, rWORD2, 24, 32
+	rldimi	rTMP, rWORD1, 24, 32
+	rlwimi	rTMP2, rWORD2, 24, 16, 23
+	rlwimi	rTMP, rWORD1, 24, 16, 23
+	xor.	rBITDIF, rTMP, rTMP2
+	sub	rRTN, rTMP, rTMP2
+	bgelr
+	ori	rRTN, rTMP2, 1
+	blr
+
+L(different):
+	lwz	rWORD1, -4(rSTR1)
+	rlwinm	rTMP2, rWORD2, 8, 0xffffffff	/* Byte reverse word.  */
+	rlwinm	rTMP, rWORD1, 8, 0xffffffff
+	rldimi	rTMP2, rWORD2, 24, 32
+	rldimi	rTMP, rWORD1, 24, 32
+	rlwimi	rTMP2, rWORD2, 24, 16, 23
+	rlwimi	rTMP, rWORD1, 24, 16, 23
+	xor.	rBITDIF, rTMP, rTMP2
+	sub	rRTN, rTMP, rTMP2
+	bgelr
+	ori	rRTN, rTMP2, 1
+	blr
+
+#else
+L(endstring):
+	and	rTMP,r7F7F,rWORD1
+	beq	cr1,L(equal)
+	add	rTMP,rTMP,r7F7F
+	xor.	rBITDIF,rWORD1,rWORD2
+	andc	rNEG,rNEG,rTMP
+	blt	L(highbit)
+	cntlzw	rBITDIF,rBITDIF
+	cntlzw	rNEG,rNEG
+	addi	rNEG,rNEG,7
+	cmpw	cr1,rNEG,rBITDIF
+	sub	rRTN,rWORD1,rWORD2
+	bgelr	cr1
+L(equal):
+	li	rRTN,0
+	blr
+
+L(different):
+	lwz	rWORD1,-4(rSTR1)
+	xor.	rBITDIF,rWORD1,rWORD2
+	sub	rRTN,rWORD1,rWORD2
+	bgelr
+L(highbit):
+	ori	rRTN, rWORD2, 1
+	blr
+#endif
+
+/* Oh well. In this case, we just do a byte-by-byte comparison.  */
+	.align	4
+L(tail):
+	and.	rTMP,rTMP,rNEG
+	cmpw	cr1,rWORD1,rWORD2
+	bne	L(endstring)
+	addi	rSTR1,rSTR1,4
+	bne	cr1,L(different)
+	addi	rSTR2,rSTR2,4
+	cmplwi	cr1,rN,0
+L(unaligned):
+	mtctr	rN
+	ble	cr1,L(ux)
+L(uz):
+	lbz	rWORD1,0(rSTR1)
+	lbz	rWORD2,0(rSTR2)
+	.align	4
+L(u1):
+	cmpwi	cr1,rWORD1,0
+	bdz	L(u4)
+	cmpw	rWORD1,rWORD2
+	beq	cr1,L(u4)
+	bne	L(u4)
+	lbzu	rWORD3,1(rSTR1)
+	lbzu	rWORD4,1(rSTR2)
+	cmpwi	cr1,rWORD3,0
+	bdz	L(u3)
+	cmpw	rWORD3,rWORD4
+	beq	cr1,L(u3)
+	bne	L(u3)
+	lbzu	rWORD1,1(rSTR1)
+	lbzu	rWORD2,1(rSTR2)
+	cmpwi	cr1,rWORD1,0
+	bdz	L(u4)
+	cmpw	rWORD1,rWORD2
+	beq	cr1,L(u4)
+	bne	L(u4)
+	lbzu	rWORD3,1(rSTR1)
+	lbzu	rWORD4,1(rSTR2)
+	cmpwi	cr1,rWORD3,0
+	bdz	L(u3)
+	cmpw	rWORD3,rWORD4
+	beq	cr1,L(u3)
+	bne	L(u3)
+	lbzu	rWORD1,1(rSTR1)
+	lbzu	rWORD2,1(rSTR2)
+	b	L(u1)
+
+L(u3):  sub	rRTN,rWORD3,rWORD4
+	blr
+L(u4):	sub	rRTN,rWORD1,rWORD2
+	blr
+L(ux):
+	li	rRTN,0
+	blr
+END (strncmp)
+libc_hidden_builtin_def (strncmp)
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc32/power7/strnlen.S b/REORG.TODO/sysdeps/powerpc/powerpc32/power7/strnlen.S
new file mode 100644
index 0000000000..6019d5be5b
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc32/power7/strnlen.S
@@ -0,0 +1,176 @@
+/* Optimized strnlen implementation for PowerPC32/POWER7 using cmpb insn.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Luis Machado <luisgpm@br.ibm.com>.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* int [r3] strnlen (const char *s [r3], int size [r4])  */
+	.machine  power7
+ENTRY (__strnlen)
+	CALL_MCOUNT
+	dcbt	0,r3
+	clrrwi	r8,r3,2	      /* Align the address to word boundary.  */
+	add	r7,r3,r4      /* Calculate the last acceptable address.  */
+	cmplwi	r4,16
+	li	r0,0	      /* Word with null chars.  */
+	addi	r7,r7,-1
+	ble	L(small_range)
+
+	rlwinm	r6,r3,3,27,28 /* Calculate padding.  */
+	lwz	r12,0(r8)     /* Load word from memory.  */
+	cmpb	r10,r12,r0    /* Check for null bytes in WORD1.  */
+#ifdef __LITTLE_ENDIAN__
+	srw	r10,r10,r6
+	slw	r10,r10,r6
+#else
+	slw	r10,r10,r6
+	srw	r10,r10,r6
+#endif
+	cmplwi	cr7,r10,0     /* If r10 == 0, no nulls have been found.  */
+	bne	cr7,L(done)
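+
+	/* The cmpb above writes 0xff into each byte of r10 where the
+	   corresponding bytes of r12 and r0 (here zero) are equal; the
+	   shift pair then clears the bytes that precede s in the aligned
+	   word.  A rough C model (illustrative; little-endian order,
+	   null_bytes is ours):
+
+	     static unsigned int null_bytes (unsigned int w, unsigned int pad)
+	     {
+	       unsigned int m = 0;
+	       for (unsigned int i = pad; i < 4; i++)
+	         if (((w >> (8 * i)) & 0xff) == 0)
+	           m |= 0xffU << (8 * i);
+	       return m;
+	     }
+
+	   with pad = s & 3, the byte offset of s within its word; the
+	   big-endian variant skips bytes from the other end.  */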
+
+	clrrwi	r7,r7,2       /* Address of last word.  */
+	mtcrf   0x01,r8
+	/* Are we now aligned to a doubleword boundary?  If so, skip to
+	   the main loop.  Otherwise, go through the alignment code.  */
+
+	bt	29,L(loop_setup)
+
+	/* Handle WORD2 of pair.  */
+	lwzu	r12,4(r8)
+	cmpb	r10,r12,r0
+	cmplwi	cr7,r10,0
+	bne	cr7,L(done)
+
+L(loop_setup):
+	/* The last word we want to read in the loop below is the one
+	   containing the last byte of the string, i.e. the word at
+	   (s + size - 1) & ~3, or r7.  The first word read is at
+	   r8 + 4; we read 2 * cnt words, so the last word read will
+	   be at r8 + 4 + 8 * cnt - 4.  Solving for cnt gives
+	   cnt = (r7 - r8) / 8.  */
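+	/* E.g., with r7 - r8 = 24 the counter is 3, and the loop below
+	   reads the words at r8+4, r8+8, ..., r8+24 = r7.  */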
+	sub	r5,r7,r8
+	srwi	r6,r5,3	      /* Number of loop iterations.  */
+	mtctr	r6	      /* Setup the counter.  */
+
+	/* Main loop to look for the null byte in the string.  Since
+	   it's a small loop (< 8 instructions), align it to 32 bytes.  */
+	.p2align  5
+L(loop):
+	/* Load two words, compare each against zero, and merge the
+	   results into a single register, so the loop branches only
+	   once per eight bytes when scanning longer strings.  */
+
+	lwz	r12,4(r8)
+	lwzu	r11,8(r8)
+	cmpb	r10,r12,r0
+	cmpb	r9,r11,r0
+	or	r5,r9,r10     /* Merge everything in one word.  */
+	cmplwi	cr7,r5,0
+	bne	cr7,L(found)
+	bdnz	L(loop)
+
+	/* We may have one more word to read.  */
+	cmplw	cr6,r8,r7
+	beq	cr6,L(end_max)
+
+	lwzu	r12,4(r8)
+	cmpb	r10,r12,r0
+	cmplwi	cr6,r10,0
+	bne	cr6,L(done)
+
+L(end_max):
+	mr	r3,r4
+	blr
+
+	/* OK, one (or both) of the words contains a null byte.  Back up
+	   r8 to the first word and check whether the null byte is
+	   there.  */
+	.align	4
+L(found):
+	cmplwi	cr6,r10,0
+	addi	r8,r8,-4
+	bne	cr6,L(done)
+
+	/* The null byte must be in the second word.  Adjust the address
+	   again and move the result of cmpb to r10 so we can calculate the
+	   length.  */
+
+	mr	r10,r9
+	addi	r8,r8,4
+
+	/* r10 has the output of the cmpb instruction, that is, it contains
+	   0xff in the same position as the null byte in the original
+	   word from the string.  Use that to calculate the length.
+	   We need to make sure the null char is *before* the end of the
+	   range.  */
+L(done):
+#ifdef __LITTLE_ENDIAN__
+	addi	r0,r10,-1
+	andc	r0,r0,r10
+	popcntw	r0,r0
+#else
+	cntlzw	r0,r10	      /* Count leading zeros before the match.  */
+#endif
+	sub	r3,r8,r3
+	srwi	r0,r0,3	      /* Convert leading/trailing zeros to bytes.  */
+	add	r3,r3,r0      /* Length until the match.  */
+	cmplw	r3,r4
+	blelr
+	mr	r3,r4
+	blr
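+
+	/* In C terms (illustrative only; first_null is ours and m, the
+	   cmpb result, is nonzero):
+
+	     static unsigned int first_null (unsigned int m)
+	     {
+	     #ifdef __LITTLE_ENDIAN__
+	       return __builtin_popcount ((m - 1) & ~m) >> 3;
+	     #else
+	       return __builtin_clz (m) >> 3;
+	     #endif
+	     }
+
+	   On little-endian, (m - 1) & ~m isolates the bits below the
+	   first 0xff, so the popcount equals the number of trailing zero
+	   bits; dividing either zero count by 8 gives the byte index of
+	   the terminator, and the cmplw/blelr above clamps the result to
+	   the size argument.  */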
+
+/* Deals with size <= 16.  */
+	.align	4
+L(small_range):
+	cmplwi	r4,0
+	beq	L(end_max)
+
+	clrrwi	r7,r7,2       /* Address of last word.  */
+
+	rlwinm	r6,r3,3,27,28 /* Calculate padding.  */
+	lwz	r12,0(r8)     /* Load word from memory.  */
+	cmpb	r10,r12,r0    /* Check for null bytes in WORD1.  */
+#ifdef __LITTLE_ENDIAN__
+	srw	r10,r10,r6
+	slw	r10,r10,r6
+#else
+	slw	r10,r10,r6
+	srw	r10,r10,r6
+#endif
+	cmplwi	cr7,r10,0
+	bne	cr7,L(done)
+
+	cmplw	r8,r7
+	beq	L(end_max)
+
+	.p2align  5
+L(loop_small):
+	lwzu	r12,4(r8)
+	cmpb	r10,r12,r0
+	cmplwi	cr6,r10,0
+	bne	cr6,L(done)
+	cmplw	r8,r7
+	bne	L(loop_small)
+	mr	r3,r4
+	blr
+
+END (__strnlen)
+libc_hidden_def (__strnlen)
+weak_alias (__strnlen, strnlen)
+libc_hidden_builtin_def (strnlen)