author    Zack Weinberg <zackw@panix.com>   2017-06-08 15:39:03 -0400
committer Zack Weinberg <zackw@panix.com>   2017-06-08 15:39:03 -0400
commit    5046dbb4a7eba5eccfd258f92f4735c9ffc8d069 (patch)
tree      4470480d904b65cf14ca524f96f79eca818c3eaf /sysdeps/powerpc/powerpc64/power7/memcmp.S
parent    199fc19d3aaaf57944ef036e15904febe877fc93 (diff)
Prepare for radical source tree reorganization.  (branch: zack/build-layout-experiment)
All top-level files and directories are moved into a temporary storage
directory, REORG.TODO, except for files that will certainly still
exist in their current form at top level when we're done (COPYING,
COPYING.LIB, LICENSES, NEWS, README), all old ChangeLog files (which
are moved to the new directory OldChangeLogs, instead), and the
generated file INSTALL (which is just deleted; in the new order, there
will be no generated files checked into version control).
Diffstat (limited to 'sysdeps/powerpc/powerpc64/power7/memcmp.S')
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/memcmp.S | 1061 --------------------
1 file changed, 0 insertions(+), 1061 deletions(-)
diff --git a/sysdeps/powerpc/powerpc64/power7/memcmp.S b/sysdeps/powerpc/powerpc64/power7/memcmp.S
deleted file mode 100644
index 96ce8cee25..0000000000
--- a/sysdeps/powerpc/powerpc64/power7/memcmp.S
+++ /dev/null
@@ -1,1061 +0,0 @@
-/* Optimized memcmp implementation for POWER7/PowerPC64.
-   Copyright (C) 2010-2017 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-
-/* int [r3] memcmp (const char *s1 [r3],
-		    const char *s2 [r4],
-		    size_t size [r5])  */
-#ifndef MEMCMP
-# define MEMCMP memcmp
-#endif
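
For reference, a minimal portable C sketch of the semantics this file implements (a hypothetical memcmp_ref, not the glibc code): return the sign of the first differing byte pair, else 0.

#include <stddef.h>

/* Byte-by-byte reference: the assembly below must match this
   observable behavior while comparing a doubleword at a time.  */
int
memcmp_ref (const void *s1, const void *s2, size_t n)
{
  const unsigned char *p1 = s1, *p2 = s2;
  for (; n > 0; n--, p1++, p2++)
    if (*p1 != *p2)
      return *p1 < *p2 ? -1 : 1;
  return 0;
}
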
-	.machine power7
-EALIGN (MEMCMP, 4, 0)
-	CALL_MCOUNT 3
-
-#define rRTN		r3
-#define rSTR1		r3	/* first string arg */
-#define rSTR2		r4	/* second string arg */
-#define rN		r5	/* max string length */
-#define rWORD1		r6	/* current word in s1 */
-#define rWORD2		r7	/* current word in s2 */
-#define rWORD3		r8	/* next word in s1 */
-#define rWORD4		r9	/* next word in s2 */
-#define rWORD5		r10	/* next word in s1 */
-#define rWORD6		r11	/* next word in s2 */
-
-#define rOFF8		r20	/* 8 bytes offset.  */
-#define rOFF16  	r21	/* 16 bytes offset.  */
-#define rOFF24		r22	/* 24 bytes offset.  */
-#define rOFF32		r23	/* 32 bytes offset.  */
-#define rWORD6_SHIFT	r24	/* Left rotation temp for rWORD8.  */
-#define rWORD4_SHIFT	r25	/* Left rotation temp for rWORD6.  */
-#define rWORD2_SHIFT	r26	/* Left rotation temp for rWORD4.  */
-#define rWORD8_SHIFT	r27	/* Left rotation temp for rWORD2.  */
-#define rSHR		r28	/* Unaligned shift right count.  */
-#define rSHL		r29	/* Unaligned shift left count.  */
-#define rWORD7		r30	/* next word in s1 */
-#define rWORD8		r31	/* next word in s2 */
-
-#define rWORD8SAVE	(-8)
-#define rWORD7SAVE	(-16)
-#define rOFF8SAVE	(-24)
-#define rOFF16SAVE	(-32)
-#define rOFF24SAVE	(-40)
-#define rOFF32SAVE	(-48)
-#define rSHRSAVE	(-56)
-#define rSHLSAVE	(-64)
-#define rWORD8SHIFTSAVE	(-72)
-#define rWORD2SHIFTSAVE	(-80)
-#define rWORD4SHIFTSAVE	(-88)
-#define rWORD6SHIFTSAVE	(-96)
-
-#ifdef __LITTLE_ENDIAN__
-# define LD	ldbrx
-#else
-# define LD	ldx
-#endif
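
This LD selection is what lets a single unsigned doubleword compare (cmpld) order 8 bytes at once: lexicographic byte order coincides with unsigned integer order only when the first byte in memory lands in the most significant position, so little-endian uses the byte-reversed ldbrx while big-endian can use a plain ldx. A C sketch of the same idea (hypothetical helper names; __builtin_bswap64 and the __BYTE_ORDER__ macros are GCC/Clang features):

#include <stdint.h>
#include <string.h>

/* What LD yields: the 8 bytes at p, with the first byte in memory in
   the most significant position.  */
static uint64_t
load_be64 (const void *p)
{
  uint64_t w;
  memcpy (&w, p, 8);		/* native-endian aligned load (ldx) */
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  w = __builtin_bswap64 (w);	/* byte-reversed load (ldbrx) */
#endif
  return w;
}

/* One unsigned compare now orders all 8 bytes, as cmpld does.  */
static int
cmp_dw (uint64_t w1, uint64_t w2)
{
  return (w1 > w2) - (w1 < w2);
}
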
-
-	xor	r0, rSTR2, rSTR1
-	cmpldi	cr6, rN, 0
-	cmpldi	cr1, rN, 12
-	clrldi.	r0, r0, 61
-	clrldi	r12, rSTR1, 61
-	cmpldi	cr5, r12, 0
-	beq-	cr6, L(zeroLength)
-	dcbt	0, rSTR1
-	dcbt	0, rSTR2
-/* If the compare length is less than 12 bytes, use the byte-compare
-   loop (it makes no alignment assumptions).  */
-	blt	cr1, L(bytealigned)
-	std	rWORD8, rWORD8SAVE(r1)
-	std	rWORD7, rWORD7SAVE(r1)
-	std	rOFF8, rOFF8SAVE(r1)
-	std	rOFF16, rOFF16SAVE(r1)
-	std	rOFF24, rOFF24SAVE(r1)
-	std	rOFF32, rOFF32SAVE(r1)
-	cfi_offset(rWORD8, rWORD8SAVE)
-	cfi_offset(rWORD7, rWORD7SAVE)
-	cfi_offset(rOFF8, rOFF8SAVE)
-	cfi_offset(rOFF16, rOFF16SAVE)
-	cfi_offset(rOFF24, rOFF24SAVE)
-	cfi_offset(rOFF32, rOFF32SAVE)
-
-	li	rOFF8,8
-	li	rOFF16,16
-	li	rOFF24,24
-	li	rOFF32,32
-
-	bne	L(unaligned)
-/* At this point we know both strings have the same alignment and the
-   compare length is at least 8 bytes.  r12 contains the low order
-   3 bits of rSTR1 and cr5 contains the result of the logical compare
-   of r12 to 0.  If r12 == 0 then we are already double word
-   aligned and can perform the DW aligned loop.
-
-   Otherwise we know the two strings have the same alignment (but not
-   yet DW).  So we force the string addresses to the next lower DW
-   boundary and special case this first DW using shift left to
-   eliminate bits preceding the first byte.  Since we want to join the
-   normal (DW aligned) compare loop, starting at the second double word,
-   we need to adjust the length (rN) and special case the loop
-   versioning for the first DW. This ensures that the loop count is
-   correct and the first DW (shifted) is in the expected register pair.  */
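
The same trick in C terms, reusing the hypothetical load_be64 (and headers) sketched above — a sketch under the stated precondition, not the glibc code. With (s1 & 7) == (s2 & 7) == off, align both pointers down, load whole doublewords, and shift left by 8 * off bits so the bytes preceding the true start fall off the significant end of both words.

static int
compare_first_dw (const unsigned char *s1, const unsigned char *s2)
{
  unsigned off = (uintptr_t) s1 & 7;	/* r12: low 3 address bits */
  uint64_t w1 = load_be64 (s1 - off);	/* aligned-down loads */
  uint64_t w2 = load_be64 (s2 - off);
  w1 <<= 8 * off;			/* sld: discard leading bytes */
  w2 <<= 8 * off;
  return (w1 > w2) - (w1 < w2);		/* covers the first 8 - off bytes */
}
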
-	.align	4
-L(samealignment):
-	clrrdi	rSTR1, rSTR1, 3
-	clrrdi	rSTR2, rSTR2, 3
-	beq	cr5, L(DWaligned)
-	add	rN, rN, r12
-	sldi	rWORD6, r12, 3
-	srdi	r0, rN, 5	/* Divide by 32 */
-	andi.	r12, rN, 24	/* Get the DW remainder */
-	LD	rWORD1, 0, rSTR1
-	LD	rWORD2, 0, rSTR2
-	cmpldi	cr1, r12, 16
-	cmpldi	cr7, rN, 32
-	clrldi	rN, rN, 61
-	beq	L(dPs4)
-	mtctr	r0
-	bgt	cr1, L(dPs3)
-	beq	cr1, L(dPs2)
-
-/* Remainder is 8 */
-	.align	3
-L(dsP1):
-	sld	rWORD5, rWORD1, rWORD6
-	sld	rWORD6, rWORD2, rWORD6
-	cmpld	cr5, rWORD5, rWORD6
-	blt	cr7, L(dP1x)
-/* Do something useful in this cycle since we have to branch anyway.  */
-	LD	rWORD1, rOFF8, rSTR1
-	LD	rWORD2, rOFF8, rSTR2
-	cmpld	cr7, rWORD1, rWORD2
-	b	L(dP1e)
-/* Remainder is 16 */
-	.align	4
-L(dPs2):
-	sld	rWORD5, rWORD1, rWORD6
-	sld	rWORD6, rWORD2, rWORD6
-	cmpld	cr6, rWORD5, rWORD6
-	blt	cr7, L(dP2x)
-/* Do something useful in this cycle since we have to branch anyway.  */
-	LD	rWORD7, rOFF8, rSTR1
-	LD	rWORD8, rOFF8, rSTR2
-	cmpld	cr5, rWORD7, rWORD8
-	b	L(dP2e)
-/* Remainder is 24 */
-	.align	4
-L(dPs3):
-	sld	rWORD3, rWORD1, rWORD6
-	sld	rWORD4, rWORD2, rWORD6
-	cmpld	cr1, rWORD3, rWORD4
-	b	L(dP3e)
-/* Count is a multiple of 32, remainder is 0 */
-	.align	4
-L(dPs4):
-	mtctr	r0
-	sld	rWORD1, rWORD1, rWORD6
-	sld	rWORD2, rWORD2, rWORD6
-	cmpld	cr7, rWORD1, rWORD2
-	b	L(dP4e)
-
-/* At this point we know both strings are double word aligned and the
-   compare length is at least 8 bytes.  */
-	.align	4
-L(DWaligned):
-	andi.	r12, rN, 24	/* Get the DW remainder */
-	srdi	r0, rN, 5	/* Divide by 32 */
-	cmpldi	cr1, r12, 16
-	cmpldi	cr7, rN, 32
-	clrldi	rN, rN, 61
-	beq	L(dP4)
-	bgt	cr1, L(dP3)
-	beq	cr1, L(dP2)
-
-/* Remainder is 8 */
-	.align	4
-L(dP1):
-	mtctr	r0
-/* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
-   (8-15 byte compare), we want to use only volatile registers.  This
-   means we can avoid restoring non-volatile registers since we did not
-   change any on the early exit path.  The key here is the non-early
-   exit path only cares about the condition code (cr5), not about which
-   register pair was used.  */
-	LD	rWORD5, 0, rSTR1
-	LD	rWORD6, 0, rSTR2
-	cmpld	cr5, rWORD5, rWORD6
-	blt	cr7, L(dP1x)
-	LD	rWORD1, rOFF8, rSTR1
-	LD	rWORD2, rOFF8, rSTR2
-	cmpld	cr7, rWORD1, rWORD2
-L(dP1e):
-	LD	rWORD3, rOFF16, rSTR1
-	LD	rWORD4, rOFF16, rSTR2
-	cmpld	cr1, rWORD3, rWORD4
-	LD	rWORD5, rOFF24, rSTR1
-	LD	rWORD6, rOFF24, rSTR2
-	cmpld	cr6, rWORD5, rWORD6
-	bne	cr5, L(dLcr5x)
-	bne	cr7, L(dLcr7x)
-
-	LD	rWORD7, rOFF32, rSTR1
-	LD	rWORD8, rOFF32, rSTR2
-	addi	rSTR1, rSTR1, 32
-	addi	rSTR2, rSTR2, 32
-	bne	cr1, L(dLcr1)
-	cmpld	cr5, rWORD7, rWORD8
-	bdnz	L(dLoop)
-	bne	cr6, L(dLcr6)
-	ld	rWORD8, rWORD8SAVE(r1)
-	ld	rWORD7, rWORD7SAVE(r1)
-	.align	3
-L(dP1x):
-	sldi.	r12, rN, 3
-	bne	cr5, L(dLcr5x)
-	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
-	bne	L(d00)
-	ld	rOFF8,  rOFF8SAVE(r1)
-	ld	rOFF16, rOFF16SAVE(r1)
-	ld	rOFF24, rOFF24SAVE(r1)
-	ld	rOFF32, rOFF32SAVE(r1)
-	li	rRTN, 0
-	blr
-
-/* Remainder is 16 */
-	.align	4
-L(dP2):
-	mtctr	r0
-	LD	rWORD5, 0, rSTR1
-	LD	rWORD6, 0, rSTR2
-	cmpld	cr6, rWORD5, rWORD6
-	blt	cr7, L(dP2x)
-	LD	rWORD7, rOFF8, rSTR1
-	LD	rWORD8, rOFF8, rSTR2
-	cmpld	cr5, rWORD7, rWORD8
-L(dP2e):
-	LD	rWORD1, rOFF16, rSTR1
-	LD	rWORD2, rOFF16, rSTR2
-	cmpld	cr7, rWORD1, rWORD2
-	LD	rWORD3, rOFF24, rSTR1
-	LD	rWORD4, rOFF24, rSTR2
-	cmpld	cr1, rWORD3, rWORD4
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-	bne	cr6, L(dLcr6)
-	bne	cr5, L(dLcr5)
-	b	L(dLoop2)
-	.align	4
-L(dP2x):
-	LD	rWORD3, rOFF8, rSTR1
-	LD	rWORD4, rOFF8, rSTR2
-	cmpld	cr1, rWORD3, rWORD4
-	sldi.	r12, rN, 3
-	bne	cr6, L(dLcr6x)
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-	bne	cr1, L(dLcr1x)
-	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
-	bne	L(d00)
-	ld	rOFF8,  rOFF8SAVE(r1)
-	ld	rOFF16, rOFF16SAVE(r1)
-	ld	rOFF24, rOFF24SAVE(r1)
-	ld	rOFF32, rOFF32SAVE(r1)
-	li	rRTN, 0
-	blr
-
-/* Remainder is 24 */
-	.align	4
-L(dP3):
-	mtctr	r0
-	LD	rWORD3, 0, rSTR1
-	LD	rWORD4, 0, rSTR2
-	cmpld	cr1, rWORD3, rWORD4
-L(dP3e):
-	LD	rWORD5, rOFF8, rSTR1
-	LD	rWORD6, rOFF8, rSTR2
-	cmpld	cr6, rWORD5, rWORD6
-	blt	cr7, L(dP3x)
-	LD	rWORD7, rOFF16, rSTR1
-	LD	rWORD8, rOFF16, rSTR2
-	cmpld	cr5, rWORD7, rWORD8
-	LD	rWORD1, rOFF24, rSTR1
-	LD	rWORD2, rOFF24, rSTR2
-	cmpld	cr7, rWORD1, rWORD2
-	addi	rSTR1, rSTR1, 16
-	addi	rSTR2, rSTR2, 16
-	bne	cr1, L(dLcr1)
-	bne	cr6, L(dLcr6)
-	b	L(dLoop1)
-/* Again we are on an early exit path (24-31 byte compare); we want to
-   use only volatile registers and avoid restoring non-volatile
-   registers.  */
-	.align	4
-L(dP3x):
-	LD	rWORD1, rOFF16, rSTR1
-	LD	rWORD2, rOFF16, rSTR2
-	cmpld	cr7, rWORD1, rWORD2
-	sldi.	r12, rN, 3
-	bne	cr1, L(dLcr1x)
-	addi	rSTR1, rSTR1, 16
-	addi	rSTR2, rSTR2, 16
-	bne	cr6, L(dLcr6x)
-	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
-	bne	cr7, L(dLcr7x)
-	bne	L(d00)
-	ld	rOFF8,  rOFF8SAVE(r1)
-	ld	rOFF16, rOFF16SAVE(r1)
-	ld	rOFF24, rOFF24SAVE(r1)
-	ld	rOFF32, rOFF32SAVE(r1)
-	li	rRTN, 0
-	blr
-
-/* Count is a multiple of 32, remainder is 0 */
-	.align	4
-L(dP4):
-	mtctr	r0
-	LD	rWORD1, 0, rSTR1
-	LD	rWORD2, 0, rSTR2
-	cmpld	cr7, rWORD1, rWORD2
-L(dP4e):
-	LD	rWORD3, rOFF8, rSTR1
-	LD	rWORD4, rOFF8, rSTR2
-	cmpld	cr1, rWORD3, rWORD4
-	LD	rWORD5, rOFF16, rSTR1
-	LD	rWORD6, rOFF16, rSTR2
-	cmpld	cr6, rWORD5, rWORD6
-	LD	rWORD7, rOFF24, rSTR1
-	LD	rWORD8, rOFF24, rSTR2
-	addi	rSTR1, rSTR1, 24
-	addi	rSTR2, rSTR2, 24
-	cmpld	cr5, rWORD7, rWORD8
-	bne	cr7, L(dLcr7)
-	bne	cr1, L(dLcr1)
-	bdz-	L(d24)		/* Adjust CTR as we start with +4 */
-/* This is the primary loop */
-	.align	4
-L(dLoop):
-	LD	rWORD1, rOFF8, rSTR1
-	LD	rWORD2, rOFF8, rSTR2
-	cmpld	cr1, rWORD3, rWORD4
-	bne	cr6, L(dLcr6)
-L(dLoop1):
-	LD	rWORD3, rOFF16, rSTR1
-	LD	rWORD4, rOFF16, rSTR2
-	cmpld	cr6, rWORD5, rWORD6
-	bne	cr5, L(dLcr5)
-L(dLoop2):
-	LD	rWORD5, rOFF24, rSTR1
-	LD	rWORD6, rOFF24, rSTR2
-	cmpld	cr5, rWORD7, rWORD8
-	bne	cr7, L(dLcr7)
-L(dLoop3):
-	LD	rWORD7, rOFF32, rSTR1
-	LD	rWORD8, rOFF32, rSTR2
-	addi	rSTR1, rSTR1, 32
-	addi	rSTR2, rSTR2, 32
-	bne	cr1, L(dLcr1)
-	cmpld	cr7, rWORD1, rWORD2
-	bdnz	L(dLoop)
-
-L(dL4):
-	cmpld	cr1, rWORD3, rWORD4
-	bne	cr6, L(dLcr6)
-	cmpld	cr6, rWORD5, rWORD6
-	bne	cr5, L(dLcr5)
-	cmpld	cr5, rWORD7, rWORD8
-L(d44):
-	bne	cr7, L(dLcr7)
-L(d34):
-	bne	cr1, L(dLcr1)
-L(d24):
-	bne	cr6, L(dLcr6)
-L(d14):
-	sldi.	r12, rN, 3
-	bne	cr5, L(dLcr5)
-L(d04):
-	ld	rWORD8, rWORD8SAVE(r1)
-	ld	rWORD7, rWORD7SAVE(r1)
-	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
-	beq	L(duzeroLength)
-/* At this point we have a remainder of 1 to 7 bytes to compare.  Since
-   we are aligned it is safe to load the whole double word, and use
-   shift right double to eliminate bits beyond the compare length.  */
-L(d00):
-	LD	rWORD1, rOFF8, rSTR1
-	LD	rWORD2, rOFF8, rSTR2
-	srd	rWORD1, rWORD1, rN
-	srd	rWORD2, rWORD2, rN
-	cmpld	cr7, rWORD1, rWORD2
-	bne	cr7, L(dLcr7x)
-	ld	rOFF8,  rOFF8SAVE(r1)
-	ld	rOFF16, rOFF16SAVE(r1)
-	ld	rOFF24, rOFF24SAVE(r1)
-	ld	rOFF32, rOFF32SAVE(r1)
-	li	rRTN, 0
-	blr
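
The C-level shape of this tail handling, again with the hypothetical load_be64 (rem is the 1-to-7-byte remainder): a doubleword-aligned load cannot cross a page boundary, and shifting right by 64 - 8 * rem bits discards the bytes beyond the compare length before the final compare.

static int
compare_tail (const unsigned char *s1, const unsigned char *s2,
	      unsigned rem)	/* 1 <= rem <= 7; s1, s2 DW-aligned */
{
  unsigned sh = 64 - 8 * rem;		/* subfic rN, r12, 64 */
  uint64_t w1 = load_be64 (s1) >> sh;	/* srd: keep only rem bytes */
  uint64_t w2 = load_be64 (s2) >> sh;
  return (w1 > w2) - (w1 < w2);
}
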
-
-	.align	4
-L(dLcr7):
-	ld	rWORD8, rWORD8SAVE(r1)
-	ld	rWORD7, rWORD7SAVE(r1)
-L(dLcr7x):
-	ld	rOFF8,  rOFF8SAVE(r1)
-	ld	rOFF16, rOFF16SAVE(r1)
-	ld	rOFF24, rOFF24SAVE(r1)
-	ld	rOFF32, rOFF32SAVE(r1)
-	li	rRTN, 1
-	bgtlr	cr7
-	li	rRTN, -1
-	blr
-	.align	4
-L(dLcr1):
-	ld	rWORD8, rWORD8SAVE(r1)
-	ld	rWORD7, rWORD7SAVE(r1)
-L(dLcr1x):
-	ld	rOFF8,  rOFF8SAVE(r1)
-	ld	rOFF16, rOFF16SAVE(r1)
-	ld	rOFF24, rOFF24SAVE(r1)
-	ld	rOFF32, rOFF32SAVE(r1)
-	li	rRTN, 1
-	bgtlr	cr1
-	li	rRTN, -1
-	blr
-	.align	4
-L(dLcr6):
-	ld	rWORD8, rWORD8SAVE(r1)
-	ld	rWORD7, rWORD7SAVE(r1)
-L(dLcr6x):
-	ld	rOFF8,  rOFF8SAVE(r1)
-	ld	rOFF16, rOFF16SAVE(r1)
-	ld	rOFF24, rOFF24SAVE(r1)
-	ld	rOFF32, rOFF32SAVE(r1)
-	li	rRTN, 1
-	bgtlr	cr6
-	li	rRTN, -1
-	blr
-	.align	4
-L(dLcr5):
-	ld	rWORD8, rWORD8SAVE(r1)
-	ld	rWORD7, rWORD7SAVE(r1)
-L(dLcr5x):
-	ld	rOFF8,  rOFF8SAVE(r1)
-	ld	rOFF16, rOFF16SAVE(r1)
-	ld	rOFF24, rOFF24SAVE(r1)
-	ld	rOFF32, rOFF32SAVE(r1)
-	li	rRTN, 1
-	bgtlr	cr5
-	li	rRTN, -1
-	blr
-
-	.align	4
-L(bytealigned):
-	mtctr	rN
-
-/* We need to prime this loop.  This loop is swing modulo scheduled
-   to avoid pipe delays.  The dependent instruction latencies (load to
-   compare to conditional branch) are 2 to 3 cycles.  In this loop each
-   dispatch group ends in a branch and takes 1 cycle.  Effectively
-   the first iteration of the loop only serves to load operands, and
-   branches based on compares are delayed until the next iteration.
-
-   So we must precondition some registers and condition codes so that
-   we don't exit the loop early on the first iteration.  */
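
A rough C rendering of that schedule (hypothetical; assumes n >= 1, since the zero-length case was dispatched earlier): each iteration loads the next byte pair before testing the one loaded previously, so the load-to-compare latency is hidden behind the following loads, and the last pair is still pending when the loop exits.

static int
byte_compare_loop (const unsigned char *s1, const unsigned char *s2,
		   size_t n)
{
  unsigned a = *s1++, b = *s2++;	/* prime the pipeline */
  while (--n > 0)
    {
      unsigned a2 = *s1++, b2 = *s2++;	/* load the next pair first...  */
      if (a != b)			/* ...then test the previous one */
	break;
      a = a2, b = b2;
    }
  return (a > b) - (a < b);		/* resolve the pending pair */
}
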
-
-	lbz	rWORD1, 0(rSTR1)
-	lbz	rWORD2, 0(rSTR2)
-	bdz	L(b11)
-	cmpld	cr7, rWORD1, rWORD2
-	lbz	rWORD3, 1(rSTR1)
-	lbz	rWORD4, 1(rSTR2)
-	bdz	L(b12)
-	cmpld	cr1, rWORD3, rWORD4
-	lbzu	rWORD5, 2(rSTR1)
-	lbzu	rWORD6, 2(rSTR2)
-	bdz	L(b13)
-	.align	4
-L(bLoop):
-	lbzu	rWORD1, 1(rSTR1)
-	lbzu	rWORD2, 1(rSTR2)
-	bne	cr7, L(bLcr7)
-
-	cmpld	cr6, rWORD5, rWORD6
-	bdz	L(b3i)
-
-	lbzu	rWORD3, 1(rSTR1)
-	lbzu	rWORD4, 1(rSTR2)
-	bne	cr1, L(bLcr1)
-
-	cmpld	cr7, rWORD1, rWORD2
-	bdz	L(b2i)
-
-	lbzu	rWORD5, 1(rSTR1)
-	lbzu	rWORD6, 1(rSTR2)
-	bne	cr6, L(bLcr6)
-
-	cmpld	cr1, rWORD3, rWORD4
-	bdnz	L(bLoop)
-
-/* We speculatively load bytes before we have tested the previous
-   bytes.  But we must avoid overrunning the length (in the ctr) to
-   prevent these speculative loads from causing a segfault.  In that
-   case the loop exits early, before all pending bytes have been
-   tested, so we must complete the pending compares before
-   returning.  */
-L(b1i):
-	bne	cr7, L(bLcr7)
-	bne	cr1, L(bLcr1)
-	b	L(bx56)
-	.align	4
-L(b2i):
-	bne	cr6, L(bLcr6)
-	bne	cr7, L(bLcr7)
-	b	L(bx34)
-	.align	4
-L(b3i):
-	bne	cr1, L(bLcr1)
-	bne	cr6, L(bLcr6)
-	b	L(bx12)
-	.align	4
-L(bLcr7):
-	li	rRTN, 1
-	bgtlr	cr7
-	li	rRTN, -1
-	blr
-L(bLcr1):
-	li	rRTN, 1
-	bgtlr	cr1
-	li	rRTN, -1
-	blr
-L(bLcr6):
-	li	rRTN, 1
-	bgtlr	cr6
-	li	rRTN, -1
-	blr
-
-L(b13):
-	bne	cr7, L(bx12)
-	bne	cr1, L(bx34)
-L(bx56):
-	sub	rRTN, rWORD5, rWORD6
-	blr
-	nop
-L(b12):
-	bne	cr7, L(bx12)
-L(bx34):
-	sub	rRTN, rWORD3, rWORD4
-	blr
-L(b11):
-L(bx12):
-	sub	rRTN, rWORD1, rWORD2
-	blr
-
-	.align	4
-L(zeroLength):
-	li	rRTN, 0
-	blr
-
-	.align	4
-/* At this point we know the strings have different alignment and the
-   compare length is at least 8 bytes.  r12 contains the low order
-   3 bits of rSTR1 and cr5 contains the result of the logical compare
-   of r12 to 0.  If r12 == 0 then rSTR1 is already double word
-   aligned and we can use the DWunaligned loop.
-
-   Otherwise we know that rSTR1 is not yet DW aligned.  So we force
-   the string addresses to the next lower DW boundary and special case
-   this first DW using shift left to eliminate bits preceding the
-   first byte.  Since we want to join the normal (DWaligned) compare
-   loop, starting at the second double word, we need to adjust the
-   length (rN) and special case the loop versioning for the first DW.
-   This ensures that the loop count is correct and the first DW
-   (shifted) is in the expected register pair.  */
-L(unaligned):
-	std	rSHL, rSHLSAVE(r1)
-	cfi_offset(rSHL, rSHLSAVE)
-	clrldi	rSHL, rSTR2, 61
-	beq	cr6, L(duzeroLength)
-	std	rSHR, rSHRSAVE(r1)
-	cfi_offset(rSHR, rSHRSAVE)
-	beq	cr5, L(DWunaligned)
-	std	rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
-	cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE)
-/* Adjust the logical start of rSTR2 to compensate for the extra bits
-   in the 1st rSTR1 DW.  */
-	sub	rWORD8_SHIFT, rSTR2, r12
-/* But do not address the DW before the one that contains the actual
-   start of rSTR2.  */
-	clrrdi	rSTR2, rSTR2, 3
-	std	rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
-/* Compute the left/right shift counts for the unaligned rSTR2,
-   compensating for the logical (DW aligned) start of rSTR1.  */
-	clrldi	rSHL, rWORD8_SHIFT, 61
-	clrrdi	rSTR1, rSTR1, 3
-	std	rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
-	sldi	rSHL, rSHL, 3
-	cmpld	cr5, rWORD8_SHIFT, rSTR2
-	add	rN, rN, r12
-	sldi	rWORD6, r12, 3
-	std	rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
-	cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE)
-	cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE)
-	cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE)
-	subfic	rSHR, rSHL, 64
-	srdi	r0, rN, 5	/* Divide by 32 */
-	andi.	r12, rN, 24	/* Get the DW remainder */
-/* We normally need to load 2 DWs to start the unaligned rSTR2, but in
-   this special case the leading bits would be discarded anyway.  Also
-   we must avoid loading a DW in which none of the bits belong to
-   rSTR2, as that load may cross a page boundary and cause a page
-   fault.  */
-	li	rWORD8, 0
-	blt	cr5, L(dus0)
-	LD	rWORD8, 0, rSTR2
-	addi	rSTR2, rSTR2, 8
-	sld	rWORD8, rWORD8, rSHL
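
That guard in C terms (hypothetical names, reusing load_be64; shl_bits is nonzero on this path): the doubleword that would supply the leading bits of the merged first word is loaded only if it contains at least one byte of rSTR2; otherwise it lies entirely in the preceding, possibly unmapped, page, and a zero stands in for it.

static uint64_t
leading_bits_or_zero (const unsigned char *logical_start,
		      const unsigned char *s2, unsigned shl_bits)
{
  const unsigned char *first_dw =
    (const unsigned char *) ((uintptr_t) s2 & ~(uintptr_t) 7);
  if (logical_start < first_dw)	/* earlier DW has no byte of s2 */
    return 0;			/* li rWORD8, 0 */
  return load_be64 (first_dw) << shl_bits;	/* LD; sld */
}
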
-
-L(dus0):
-	LD	rWORD1, 0, rSTR1
-	LD	rWORD2, 0, rSTR2
-	cmpldi	cr1, r12, 16
-	cmpldi	cr7, rN, 32
-	srd	r12, rWORD2, rSHR
-	clrldi	rN, rN, 61
-	beq	L(duPs4)
-	mtctr	r0
-	or	rWORD8, r12, rWORD8
-	bgt	cr1, L(duPs3)
-	beq	cr1, L(duPs2)
-
-/* Remainder is 8 */
-	.align	4
-L(dusP1):
-	sld	rWORD8_SHIFT, rWORD2, rSHL
-	sld	rWORD7, rWORD1, rWORD6
-	sld	rWORD8, rWORD8, rWORD6
-	bge	cr7, L(duP1e)
-/* At this point we exit early with the first double word compare
-   complete and a remainder of 0 to 7 bytes.  See L(du14) for details
-   on how we handle the remaining bytes.  */
-	cmpld	cr5, rWORD7, rWORD8
-	sldi.	rN, rN, 3
-	bne	cr5, L(duLcr5)
-	cmpld	cr7, rN, rSHR
-	beq	L(duZeroReturn)
-	li	r0, 0
-	ble	cr7, L(dutrim)
-	LD	rWORD2, rOFF8, rSTR2
-	srd	r0, rWORD2, rSHR
-	b	L(dutrim)
-/* Remainder is 16 */
-	.align	4
-L(duPs2):
-	sld	rWORD6_SHIFT, rWORD2, rSHL
-	sld	rWORD5, rWORD1, rWORD6
-	sld	rWORD6, rWORD8, rWORD6
-	b	L(duP2e)
-/* Remainder is 24 */
-	.align	4
-L(duPs3):
-	sld	rWORD4_SHIFT, rWORD2, rSHL
-	sld	rWORD3, rWORD1, rWORD6
-	sld	rWORD4, rWORD8, rWORD6
-	b	L(duP3e)
-/* Count is a multiple of 32, remainder is 0 */
-	.align	4
-L(duPs4):
-	mtctr	r0
-	or	rWORD8, r12, rWORD8
-	sld	rWORD2_SHIFT, rWORD2, rSHL
-	sld	rWORD1, rWORD1, rWORD6
-	sld	rWORD2, rWORD8, rWORD6
-	b	L(duP4e)
-
-/* At this point we know rSTR1 is double word aligned and the
-   compare length is at least 8 bytes.  */
-	.align	4
-L(DWunaligned):
-	std	rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
-	clrrdi	rSTR2, rSTR2, 3
-	std	rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
-	srdi	r0, rN, 5	/* Divide by 32 */
-	std	rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
-	andi.	r12, rN, 24	/* Get the DW remainder */
-	std	rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
-	cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE)
-	cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE)
-	cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE)
-	cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE)
-	sldi	rSHL, rSHL, 3
-	LD	rWORD6, 0, rSTR2
-	LD	rWORD8, rOFF8, rSTR2
-	addi	rSTR2, rSTR2, 8
-	cmpldi	cr1, r12, 16
-	cmpldi	cr7, rN, 32
-	clrldi	rN, rN, 61
-	subfic	rSHR, rSHL, 64
-	sld	rWORD6_SHIFT, rWORD6, rSHL
-	beq	L(duP4)
-	mtctr	r0
-	bgt	cr1, L(duP3)
-	beq	cr1, L(duP2)
-
-/* Remainder is 8 */
-	.align	4
-L(duP1):
-	srd	r12, rWORD8, rSHR
-	LD	rWORD7, 0, rSTR1
-	sld	rWORD8_SHIFT, rWORD8, rSHL
-	or	rWORD8, r12, rWORD6_SHIFT
-	blt	cr7, L(duP1x)
-L(duP1e):
-	LD	rWORD1, rOFF8, rSTR1
-	LD	rWORD2, rOFF8, rSTR2
-	cmpld	cr5, rWORD7, rWORD8
-	srd	r0, rWORD2, rSHR
-	sld	rWORD2_SHIFT, rWORD2, rSHL
-	or	rWORD2, r0, rWORD8_SHIFT
-	LD	rWORD3, rOFF16, rSTR1
-	LD	rWORD4, rOFF16, rSTR2
-	cmpld	cr7, rWORD1, rWORD2
-	srd	r12, rWORD4, rSHR
-	sld	rWORD4_SHIFT, rWORD4, rSHL
-	bne	cr5, L(duLcr5)
-	or	rWORD4, r12, rWORD2_SHIFT
-	LD	rWORD5, rOFF24, rSTR1
-	LD	rWORD6, rOFF24, rSTR2
-	cmpld	cr1, rWORD3, rWORD4
-	srd	r0, rWORD6, rSHR
-	sld	rWORD6_SHIFT, rWORD6, rSHL
-	bne	cr7, L(duLcr7)
-	or	rWORD6, r0, rWORD4_SHIFT
-	cmpld	cr6, rWORD5, rWORD6
-	b	L(duLoop3)
-	.align	4
-/* At this point we exit early with the first double word compare
-   complete and a remainder of 0 to 7 bytes.  See L(du14) for details
-   on how we handle the remaining bytes.  */
-L(duP1x):
-	cmpld	cr5, rWORD7, rWORD8
-	sldi.	rN, rN, 3
-	bne	cr5, L(duLcr5)
-	cmpld	cr7, rN, rSHR
-	beq	L(duZeroReturn)
-	li	r0, 0
-	ble	cr7, L(dutrim)
-	LD	rWORD2, rOFF8, rSTR2
-	srd	r0, rWORD2, rSHR
-	b	L(dutrim)
-/* Remainder is 16 */
-	.align	4
-L(duP2):
-	srd	r0, rWORD8, rSHR
-	LD	rWORD5, 0, rSTR1
-	or	rWORD6, r0, rWORD6_SHIFT
-	sld	rWORD6_SHIFT, rWORD8, rSHL
-L(duP2e):
-	LD	rWORD7, rOFF8, rSTR1
-	LD	rWORD8, rOFF8, rSTR2
-	cmpld	cr6, rWORD5, rWORD6
-	srd	r12, rWORD8, rSHR
-	sld	rWORD8_SHIFT, rWORD8, rSHL
-	or	rWORD8, r12, rWORD6_SHIFT
-	blt	cr7, L(duP2x)
-	LD	rWORD1, rOFF16, rSTR1
-	LD	rWORD2, rOFF16, rSTR2
-	cmpld	cr5, rWORD7, rWORD8
-	bne	cr6, L(duLcr6)
-	srd	r0, rWORD2, rSHR
-	sld	rWORD2_SHIFT, rWORD2, rSHL
-	or	rWORD2, r0, rWORD8_SHIFT
-	LD	rWORD3, rOFF24, rSTR1
-	LD	rWORD4, rOFF24, rSTR2
-	cmpld	cr7, rWORD1, rWORD2
-	bne	cr5, L(duLcr5)
-	srd	r12, rWORD4, rSHR
-	sld	rWORD4_SHIFT, rWORD4, rSHL
-	or	rWORD4, r12, rWORD2_SHIFT
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-	cmpld	cr1, rWORD3, rWORD4
-	b	L(duLoop2)
-	.align	4
-L(duP2x):
-	cmpld	cr5, rWORD7, rWORD8
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-	bne	cr6, L(duLcr6)
-	sldi.	rN, rN, 3
-	bne	cr5, L(duLcr5)
-	cmpld	cr7, rN, rSHR
-	beq	L(duZeroReturn)
-	li	r0, 0
-	ble	cr7, L(dutrim)
-	LD	rWORD2, rOFF8, rSTR2
-	srd	r0, rWORD2, rSHR
-	b	L(dutrim)
-
-/* Remainder is 24 */
-	.align	4
-L(duP3):
-	srd	r12, rWORD8, rSHR
-	LD	rWORD3, 0, rSTR1
-	sld	rWORD4_SHIFT, rWORD8, rSHL
-	or	rWORD4, r12, rWORD6_SHIFT
-L(duP3e):
-	LD	rWORD5, rOFF8, rSTR1
-	LD	rWORD6, rOFF8, rSTR2
-	cmpld	cr1, rWORD3, rWORD4
-	srd	r0, rWORD6, rSHR
-	sld	rWORD6_SHIFT, rWORD6, rSHL
-	or	rWORD6, r0, rWORD4_SHIFT
-	LD	rWORD7, rOFF16, rSTR1
-	LD	rWORD8, rOFF16, rSTR2
-	cmpld	cr6, rWORD5, rWORD6
-	bne	cr1, L(duLcr1)
-	srd	r12, rWORD8, rSHR
-	sld	rWORD8_SHIFT, rWORD8, rSHL
-	or	rWORD8, r12, rWORD6_SHIFT
-	blt	cr7, L(duP3x)
-	LD	rWORD1, rOFF24, rSTR1
-	LD	rWORD2, rOFF24, rSTR2
-	cmpld	cr5, rWORD7, rWORD8
-	bne	cr6, L(duLcr6)
-	srd	r0, rWORD2, rSHR
-	sld	rWORD2_SHIFT, rWORD2, rSHL
-	or	rWORD2, r0, rWORD8_SHIFT
-	addi	rSTR1, rSTR1, 16
-	addi	rSTR2, rSTR2, 16
-	cmpld	cr7, rWORD1, rWORD2
-	b	L(duLoop1)
-	.align	4
-L(duP3x):
-	addi	rSTR1, rSTR1, 16
-	addi	rSTR2, rSTR2, 16
-	cmpld	cr5, rWORD7, rWORD8
-	bne	cr6, L(duLcr6)
-	sldi.	rN, rN, 3
-	bne	cr5, L(duLcr5)
-	cmpld	cr7, rN, rSHR
-	beq	L(duZeroReturn)
-	li	r0, 0
-	ble	cr7, L(dutrim)
-	LD	rWORD2, rOFF8, rSTR2
-	srd	r0, rWORD2, rSHR
-	b	L(dutrim)
-
-/* Count is a multiple of 32, remainder is 0 */
-	.align	4
-L(duP4):
-	mtctr	r0
-	srd	r0, rWORD8, rSHR
-	LD	rWORD1, 0, rSTR1
-	sld	rWORD2_SHIFT, rWORD8, rSHL
-	or	rWORD2, r0, rWORD6_SHIFT
-L(duP4e):
-	LD	rWORD3, rOFF8, rSTR1
-	LD	rWORD4, rOFF8, rSTR2
-	cmpld	cr7, rWORD1, rWORD2
-	srd	r12, rWORD4, rSHR
-	sld	rWORD4_SHIFT, rWORD4, rSHL
-	or	rWORD4, r12, rWORD2_SHIFT
-	LD	rWORD5, rOFF16, rSTR1
-	LD	rWORD6, rOFF16, rSTR2
-	cmpld	cr1, rWORD3, rWORD4
-	bne	cr7, L(duLcr7)
-	srd	r0, rWORD6, rSHR
-	sld	rWORD6_SHIFT, rWORD6, rSHL
-	or	rWORD6, r0, rWORD4_SHIFT
-	LD	rWORD7, rOFF24, rSTR1
-	LD	rWORD8, rOFF24, rSTR2
-	addi	rSTR1, rSTR1, 24
-	addi	rSTR2, rSTR2, 24
-	cmpld	cr6, rWORD5, rWORD6
-	bne	cr1, L(duLcr1)
-	srd	r12, rWORD8, rSHR
-	sld	rWORD8_SHIFT, rWORD8, rSHL
-	or	rWORD8, r12, rWORD6_SHIFT
-	cmpld	cr5, rWORD7, rWORD8
-	bdz	L(du24)		/* Adjust CTR as we start with +4 */
-/* This is the primary loop */
-	.align	4
-L(duLoop):
-	LD	rWORD1, rOFF8, rSTR1
-	LD	rWORD2, rOFF8, rSTR2
-	cmpld	cr1, rWORD3, rWORD4
-	bne	cr6, L(duLcr6)
-	srd	r0, rWORD2, rSHR
-	sld	rWORD2_SHIFT, rWORD2, rSHL
-	or	rWORD2, r0, rWORD8_SHIFT
-L(duLoop1):
-	LD	rWORD3, rOFF16, rSTR1
-	LD	rWORD4, rOFF16, rSTR2
-	cmpld	cr6, rWORD5, rWORD6
-	bne	cr5, L(duLcr5)
-	srd	r12, rWORD4, rSHR
-	sld	rWORD4_SHIFT, rWORD4, rSHL
-	or	rWORD4, r12, rWORD2_SHIFT
-L(duLoop2):
-	LD	rWORD5, rOFF24, rSTR1
-	LD	rWORD6, rOFF24, rSTR2
-	cmpld	cr5, rWORD7, rWORD8
-	bne	cr7, L(duLcr7)
-	srd	r0, rWORD6, rSHR
-	sld	rWORD6_SHIFT, rWORD6, rSHL
-	or	rWORD6, r0, rWORD4_SHIFT
-L(duLoop3):
-	LD	rWORD7, rOFF32, rSTR1
-	LD	rWORD8, rOFF32, rSTR2
-	addi	rSTR1, rSTR1, 32
-	addi	rSTR2, rSTR2, 32
-	cmpld	cr7, rWORD1, rWORD2
-	bne	cr1, L(duLcr1)
-	srd	r12, rWORD8, rSHR
-	sld	rWORD8_SHIFT, rWORD8, rSHL
-	or	rWORD8, r12, rWORD6_SHIFT
-	bdnz	L(duLoop)
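
The steady-state merge performed by each srd/sld/or triple above, as a C sketch (hypothetical name; assumes 0 < shl < 64, which holds whenever this unaligned path is taken): each aligned load contributes its leading bytes to the current compare word and banks its trailing bytes (the *_SHIFT registers) for the next one.

static uint64_t
merge_unaligned_dw (uint64_t prev_dw, uint64_t cur_dw, unsigned shl)
{
  unsigned shr = 64 - shl;	/* subfic rSHR, rSHL, 64 */
  return (prev_dw << shl)	/* banked trailing bytes of prev */
	 | (cur_dw >> shr);	/* leading bytes of cur */
}
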
-
-L(duL4):
-	cmpld	cr1, rWORD3, rWORD4
-	bne	cr6, L(duLcr6)
-	cmpld	cr6, rWORD5, rWORD6
-	bne	cr5, L(duLcr5)
-	cmpld	cr5, rWORD7, rWORD8
-L(du44):
-	bne	cr7, L(duLcr7)
-L(du34):
-	bne	cr1, L(duLcr1)
-L(du24):
-	bne	cr6, L(duLcr6)
-L(du14):
-	sldi.	rN, rN, 3
-	bne	cr5, L(duLcr5)
-/* At this point we have a remainder of 1 to 7 bytes to compare.  We use
-   shift right double to eliminate bits beyond the compare length.
-
-   However, it may not be safe to load rWORD2, which may lie beyond the
-   end of the string.  So we compare the bit length of the remainder to
-   the right shift count (rSHR); if it is less than or equal, all the
-   significant bits are already in rWORD8_SHIFT and we do not need to
-   load rWORD2.  */
-	cmpld	cr7, rN, rSHR
-	beq	L(duZeroReturn)
-	li	r0, 0
-	ble	cr7, L(dutrim)
-	LD	rWORD2, rOFF8, rSTR2
-	srd	r0, rWORD2, rSHR
-	.align	4
-L(dutrim):
-	LD	rWORD1, rOFF8, rSTR1
-	ld	rWORD8, rWORD8SAVE(r1)
-	subfic	rN, rN, 64	/* Shift count is 64 - (rN * 8).  */
-	or	rWORD2, r0, rWORD8_SHIFT
-	ld	rWORD7, rWORD7SAVE(r1)
-	ld	rSHL, rSHLSAVE(r1)
-	srd	rWORD1, rWORD1, rN
-	srd	rWORD2, rWORD2, rN
-	ld	rSHR, rSHRSAVE(r1)
-	ld	rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
-	li	rRTN, 0
-	cmpld	cr7, rWORD1, rWORD2
-	ld	rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
-	ld	rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
-	beq	cr7, L(dureturn24)
-	li	rRTN, 1
-	ld	rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
-	ld	rOFF8,  rOFF8SAVE(r1)
-	ld	rOFF16, rOFF16SAVE(r1)
-	ld	rOFF24, rOFF24SAVE(r1)
-	ld	rOFF32, rOFF32SAVE(r1)
-	bgtlr	cr7
-	li	rRTN, -1
-	blr
-	.align	4
-L(duLcr7):
-	ld	rWORD8, rWORD8SAVE(r1)
-	ld	rWORD7, rWORD7SAVE(r1)
-	li	rRTN, 1
-	bgt	cr7, L(dureturn29)
-	ld	rSHL, rSHLSAVE(r1)
-	ld	rSHR, rSHRSAVE(r1)
-	li	rRTN, -1
-	b	L(dureturn27)
-	.align	4
-L(duLcr1):
-	ld	rWORD8, rWORD8SAVE(r1)
-	ld	rWORD7, rWORD7SAVE(r1)
-	li	rRTN, 1
-	bgt	cr1, L(dureturn29)
-	ld	rSHL, rSHLSAVE(r1)
-	ld	rSHR, rSHRSAVE(r1)
-	li	rRTN, -1
-	b	L(dureturn27)
-	.align	4
-L(duLcr6):
-	ld	rWORD8, rWORD8SAVE(r1)
-	ld	rWORD7, rWORD7SAVE(r1)
-	li	rRTN, 1
-	bgt	cr6, L(dureturn29)
-	ld	rSHL, rSHLSAVE(r1)
-	ld	rSHR, rSHRSAVE(r1)
-	li	rRTN, -1
-	b	L(dureturn27)
-	.align	4
-L(duLcr5):
-	ld	rWORD8, rWORD8SAVE(r1)
-	ld	rWORD7, rWORD7SAVE(r1)
-	li	rRTN, 1
-	bgt	cr5, L(dureturn29)
-	ld	rSHL, rSHLSAVE(r1)
-	ld	rSHR, rSHRSAVE(r1)
-	li	rRTN, -1
-	b	L(dureturn27)
-
-	.align	3
-L(duZeroReturn):
-	li	rRTN, 0
-	.align	4
-L(dureturn):
-	ld	rWORD8, rWORD8SAVE(r1)
-	ld	rWORD7, rWORD7SAVE(r1)
-L(dureturn29):
-	ld	rSHL, rSHLSAVE(r1)
-	ld	rSHR, rSHRSAVE(r1)
-L(dureturn27):
-	ld	rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
-	ld	rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
-	ld	rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
-L(dureturn24):
-	ld	rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
-	ld	rOFF8,  rOFF8SAVE(r1)
-	ld	rOFF16, rOFF16SAVE(r1)
-	ld	rOFF24, rOFF24SAVE(r1)
-	ld	rOFF32, rOFF32SAVE(r1)
-	blr
-
-L(duzeroLength):
-	ld	rOFF8,  rOFF8SAVE(r1)
-	ld	rOFF16, rOFF16SAVE(r1)
-	ld	rOFF24, rOFF24SAVE(r1)
-	ld	rOFF32, rOFF32SAVE(r1)
-	li	rRTN, 0
-	blr
-
-END (MEMCMP)
-libc_hidden_builtin_def (memcmp)
-weak_alias (memcmp, bcmp)