Diffstat (limited to 'sysdeps/powerpc/powerpc64/power8/strlen.S')
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/strlen.S  301
1 file changed, 0 insertions, 301 deletions
diff --git a/sysdeps/powerpc/powerpc64/power8/strlen.S b/sysdeps/powerpc/powerpc64/power8/strlen.S
deleted file mode 100644
index 8f4a1fc1dc..0000000000
--- a/sysdeps/powerpc/powerpc64/power8/strlen.S
+++ /dev/null
@@ -1,301 +0,0 @@
-/* Optimized strlen implementation for PowerPC64/POWER8 using a vectorized
-   loop.
-   Copyright (C) 2016-2017 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-
-/* TODO: change these to the actual instructions when the minimum required
-   binutils allows it.  */
-#define MFVRD(r,v)	.long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
-#define VBPERMQ(t,a,b)	.long (0x1000054c \
-			       | ((t)<<(32-11))	\
-			       | ((a)<<(32-16))	\
-			       | ((b)<<(32-21)) )
-
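-/* For reference (assuming a newer binutils than the minimum required
-   above, with ".machine power8" in effect), the encodings generated by
-   these macros correspond to the ISA 2.07 mnemonics "mfvrd r,v", which
-   copies doubleword 0 of VR v into GPR r, and "vbpermq t,a,b", which
-   gathers the 16 bits of VR a selected by the byte indices in VR b into
-   bits 48-63 of VR t.  */
-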
-/* int [r3] strlen (char *s [r3])  */
-
-#ifndef STRLEN
-# define STRLEN strlen
-#endif
-
-/* TODO: change this to .machine power8 when the minimum required binutils
-   allows it.  */
-	.machine  power7
-EALIGN (STRLEN, 4, 0)
-	CALL_MCOUNT 1
-	dcbt	0,r3
-	clrrdi	r4,r3,3	      /* Align the address to doubleword boundary.  */
-	rlwinm	r6,r3,3,26,28 /* Calculate padding.  */
-	li	r0,0	      /* Doubleword with null chars to use
-				 with cmpb.  */
-	li	r5,-1	      /* MASK = 0xffffffffffffffff.  */
-	ld	r12,0(r4)     /* Load doubleword from memory.  */
-#ifdef __LITTLE_ENDIAN__
-	sld	r5,r5,r6
-#else
-	srd	r5,r5,r6      /* MASK = MASK >> padding.  */
-#endif
-	orc	r9,r12,r5     /* Mask bits that are not part of the string.  */
-	cmpb	r10,r9,r0     /* Check for null bytes in DWORD1.  */
-	cmpdi	cr7,r10,0     /* If r10 == 0, no nulls have been found.  */
-	bne	cr7,L(done)
-
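-	/* Worked example with a hypothetical value: had r9 held
-	   0x6162006364656667, the cmpb above would have set r10 to
-	   0x0000ff0000000000, i.e. 0xff in exactly the byte position holding
-	   the null, and the branch to L(done) would have been taken.  */
-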
-	/* For shorter strings (< 64 bytes), we will not use vector registers,
-	   as the overhead isn't worth it.  So, let's use GPRs instead.  This
-	   will be done the same way as we do in the POWER7 implementation.
-	   Let's see if we are aligned to a quadword boundary.  If so, we can
-	   jump to the first (non-vectorized) loop.  Otherwise, we have to
-	   handle the next DWORD first.  */
-	mtcrf	0x01,r4
-	mr	r9,r4
-	addi	r9,r9,8
-	bt	28,L(align64)
-
-	/* Handle the next 8 bytes so we are aligned to a quadword
-	   boundary.  */
-	ldu	r5,8(r4)
-	cmpb	r10,r5,r0
-	cmpdi	cr7,r10,0
-	addi	r9,r9,8
-	bne	cr7,L(done)
-
-L(align64):
-	/* Proceed to the old (POWER7) implementation, checking two doublewords
-	   per iteration.  For the first 56 bytes, we will just check for null
-	   characters.  After that, we will also check if we are 64-byte aligned
-	   so we can jump to the vectorized implementation.  We will unroll
-	   these loops to avoid excessive branching.  */
-	ld	r6,8(r4)
-	ldu	r5,16(r4)
-	cmpb	r10,r6,r0
-	cmpb	r11,r5,r0
-	or	r5,r10,r11
-	cmpdi	cr7,r5,0
-	addi	r9,r9,16
-	bne	cr7,L(dword_zero)
-
-	ld	r6,8(r4)
-	ldu	r5,16(r4)
-	cmpb	r10,r6,r0
-	cmpb	r11,r5,r0
-	or	r5,r10,r11
-	cmpdi	cr7,r5,0
-	addi	r9,r9,16
-	bne	cr7,L(dword_zero)
-
-	ld	r6,8(r4)
-	ldu	r5,16(r4)
-	cmpb	r10,r6,r0
-	cmpb	r11,r5,r0
-	or	r5,r10,r11
-	cmpdi	cr7,r5,0
-	addi	r9,r9,16
-	bne	cr7,L(dword_zero)
-
-	/* Are we 64-byte aligned? If so, jump to the vectorized loop.
-	   Note: aligning to a 64-byte boundary necessarily slows down performance for
-	   strings around 64 bytes in length due to the extra comparisons
-	   required to check alignment for the vectorized loop.  This is a
-	   necessary tradeoff we are willing to take in order to speed up the
-	   calculation for larger strings.  */
-	andi.	r10,r9,63
-	beq	cr0,L(preloop)
-	ld	r6,8(r4)
-	ldu	r5,16(r4)
-	cmpb	r10,r6,r0
-	cmpb	r11,r5,r0
-	or	r5,r10,r11
-	cmpdi	cr7,r5,0
-	addi	r9,r9,16
-	bne	cr7,L(dword_zero)
-
-	andi.	r10,r9,63
-	beq	cr0,L(preloop)
-	ld	r6,8(r4)
-	ldu	r5,16(r4)
-	cmpb	r10,r6,r0
-	cmpb	r11,r5,r0
-	or	r5,r10,r11
-	cmpdi	cr7,r5,0
-	addi	r9,r9,16
-	bne	cr7,L(dword_zero)
-
-	andi.	r10,r9,63
-	beq	cr0,L(preloop)
-	ld	r6,8(r4)
-	ldu	r5,16(r4)
-	cmpb	r10,r6,r0
-	cmpb	r11,r5,r0
-	or	r5,r10,r11
-	cmpdi	cr7,r5,0
-	addi	r9,r9,16
-	bne	cr7,L(dword_zero)
-
-	andi.	r10,r9,63
-	beq	cr0,L(preloop)
-	ld	r6,8(r4)
-	ldu	r5,16(r4)
-	cmpb	r10,r6,r0
-	cmpb	r11,r5,r0
-	or	r5,r10,r11
-	cmpdi	cr7,r5,0
-	addi	r9,r9,16
-
-	/* At this point, we are necessarily 64-byte aligned.  If no zeroes were
-	   found, jump to the vectorized loop.  */
-	beq	cr7,L(preloop)
-
-L(dword_zero):
-	/* OK, one (or both) of the doublewords contains a null byte.  Point
-	   the address back at the first doubleword and check it; if the null
-	   byte is there, we are done.  */
-
-	cmpdi	cr6,r10,0
-	addi	r4,r4,-8
-	bne	cr6,L(done)
-
-	/* The null byte must be in the second doubleword.  Adjust the address
-	   again and move the result of cmpb to r10 so we can calculate the
-	   length.  */
-
-	mr	r10,r11
-	addi	r4,r4,8
-
-	/* If the null byte was found in the non-vectorized code, compute the
-	   final length.  r10 has the output of the cmpb instruction, that is,
-	   it contains 0xff in the same position as the null byte in the
-	   original doubleword from the string.  Use that to calculate the
-	   length.  */
-L(done):
-#ifdef __LITTLE_ENDIAN__
-	addi	r9, r10,-1    /* Form a mask from trailing zeros.  */
-	andc	r9, r9,r10
-	popcntd	r0, r9	      /* Count the bits in the mask.  */
-#else
-	cntlzd	r0,r10	      /* Count leading zeros before the match.  */
-#endif
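-	/* Continuing the hypothetical r10 = 0x0000ff0000000000 from above:
-	   cntlzd yields 16 on big-endian, while on little-endian the mask of
-	   trailing zeros has 40 bits set, so popcntd yields 40.  Shifting
-	   either count right by 3 gives the byte index of the null within
-	   its doubleword (2 or 5 respectively), which is added below to the
-	   doubleword's offset r4 - r3.  */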
-	subf	r5,r3,r4
-	srdi	r0,r0,3	      /* Convert leading/trailing zeros to bytes.  */
-	add	r3,r5,r0      /* Compute final length.  */
-	blr
-
-	/* Vectorized implementation starts here.  */
-	.p2align  4
-L(preloop):
-	/* Set up for the loop.  */
-	mr	r4,r9
-	li	r7, 16	      /* Load required offsets.  */
-	li	r8, 32
-	li	r9, 48
-	li	r12, 8
-	vxor	v0,v0,v0      /* VR with null chars to use with
-				 vcmpequb.  */
-
-	/* Main loop to look for the end of the string.  We will read in
-	   64-byte chunks.  Align it to 32 bytes and unroll it 3 times to
-	   leverage the icache performance.  */
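-	/* The merge below relies on vminub: treating bytes as unsigned, the
-	   byte-wise minimum of the four quadwords is zero in a given
-	   position iff at least one of them holds a null there, so a single
-	   vcmpequb. against v0 is enough to test the whole 64-byte block.  */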
-	.p2align  5
-L(loop):
-	lvx	  v1,r4,r0  /* Load 4 quadwords.  */
-	lvx	  v2,r4,r7
-	lvx	  v3,r4,r8
-	lvx	  v4,r4,r9
-	vminub	  v5,v1,v2  /* Compare and merge into one VR for speed.  */
-	vminub	  v6,v3,v4
-	vminub	  v7,v5,v6
-	vcmpequb. v7,v7,v0  /* Check for NULLs.  */
-	addi	  r4,r4,64  /* Adjust address for the next iteration.  */
-	bne	  cr6,L(vmx_zero)
-
-	lvx	  v1,r4,r0  /* Load 4 quadwords.  */
-	lvx	  v2,r4,r7
-	lvx	  v3,r4,r8
-	lvx	  v4,r4,r9
-	vminub	  v5,v1,v2  /* Compare and merge into one VR for speed.  */
-	vminub	  v6,v3,v4
-	vminub	  v7,v5,v6
-	vcmpequb. v7,v7,v0  /* Check for NULLs.  */
-	addi	  r4,r4,64  /* Adjust address for the next iteration.  */
-	bne	  cr6,L(vmx_zero)
-
-	lvx	  v1,r4,r0  /* Load 4 quadwords.  */
-	lvx	  v2,r4,r7
-	lvx	  v3,r4,r8
-	lvx	  v4,r4,r9
-	vminub	  v5,v1,v2  /* Compare and merge into one VR for speed.  */
-	vminub	  v6,v3,v4
-	vminub	  v7,v5,v6
-	vcmpequb. v7,v7,v0  /* Check for NULLs.  */
-	addi	  r4,r4,64  /* Adjust address for the next iteration.  */
-	beq	  cr6,L(loop)
-
-L(vmx_zero):
-	/* OK, we found a null byte.  Let's look for it in the current 64-byte
-	   block and mark it in its corresponding VR.  */
-	vcmpequb  v1,v1,v0
-	vcmpequb  v2,v2,v0
-	vcmpequb  v3,v3,v0
-	vcmpequb  v4,v4,v0
-
-	/* We will now 'compress' the result into a single doubleword, so it
-	   can be moved to a GPR for the final calculation.  First, we
-	   generate an appropriate mask for vbpermq, so we can permute bits into
-	   the first halfword.  */
-	vspltisb  v10,3
-	lvsl	  v11,r0,r0
-	vslb	  v10,v11,v10
-
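-	/* The mask in v10 ends up as 0, 8, 16, ..., 120: lvsl with zero
-	   arguments loads the byte indices 0..15 and vslb shifts each left
-	   by 3, i.e. multiplies it by 8, giving the bit index of the first
-	   bit of every byte for vbpermq to select.  */
-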
-	/* Permute the first bit of each byte into bits 48-63.  */
-	VBPERMQ(v1,v1,v10)
-	VBPERMQ(v2,v2,v10)
-	VBPERMQ(v3,v3,v10)
-	VBPERMQ(v4,v4,v10)
-
-	/* Shift each component into its correct position for merging.  */
-#ifdef __LITTLE_ENDIAN__
-	vsldoi  v2,v2,v2,2
-	vsldoi  v3,v3,v3,4
-	vsldoi  v4,v4,v4,6
-#else
-	vsldoi	v1,v1,v1,6
-	vsldoi	v2,v2,v2,4
-	vsldoi	v3,v3,v3,2
-#endif
-
-	/* Merge the results and move to a GPR.  */
-	vor	v1,v2,v1
-	vor	v2,v3,v4
-	vor	v4,v1,v2
-	MFVRD(r10,v4)
-
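-	/* r10 now holds one bit per byte of the 64-byte block, so the
-	   leading-zero (big-endian) or trailing-zero (little-endian) count
-	   below is directly the byte offset of the first null within the
-	   block.  */
-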
-	/* Adjust address to the beginning of the current 64-byte block.  */
-	addi	r4,r4,-64
-
-#ifdef __LITTLE_ENDIAN__
-	addi	r9, r10,-1    /* Form a mask from trailing zeros.  */
-	andc	r9, r9,r10
-	popcntd	r0, r9	      /* Count the bits in the mask.  */
-#else
-	cntlzd	r0,r10	      /* Count leading zeros before the match.  */
-#endif
-	subf	r5,r3,r4
-	add	r3,r5,r0      /* Compute final length.  */
-	blr
-
-END (STRLEN)
-libc_hidden_builtin_def (strlen)