about summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r--ChangeLog15
-rw-r--r--NEWS3
-rw-r--r--sysdeps/powerpc/powerpc64/multiarch/Makefile5
-rw-r--r--sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c6
-rw-r--r--sysdeps/powerpc/powerpc64/multiarch/stpncpy-power8.S39
-rw-r--r--sysdeps/powerpc/powerpc64/multiarch/stpncpy.c7
-rw-r--r--sysdeps/powerpc/powerpc64/multiarch/strncpy-power8.S40
-rw-r--r--sysdeps/powerpc/powerpc64/multiarch/strncpy.c7
-rw-r--r--sysdeps/powerpc/powerpc64/power8/stpncpy.S20
-rw-r--r--sysdeps/powerpc/powerpc64/power8/strncpy.S424
10 files changed, 559 insertions, 7 deletions
diff --git a/ChangeLog b/ChangeLog
index 16199e3130..20aded41ca 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,20 @@
 2015-01-13  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
 
+	* sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]:
+	Add strncpy-power8 and stpncpy-power8 objects.
+	* sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+	(__libc_ifunc_impl_list): Add __strncpy_power8 and stpncpy_power8
+	implementations.
+	* sysdeps/powerpc/powerpc64/multiarch/stpncpy-power8.S: New file.
+	* sysdeps/powerpc/powerpc64/multiarch/stpncpy.c (__stpncpy): Add
+	__stpncpy_power8 implementation.
+	* sysdeps/powerpc/powerpc64/multiarch/strncpy-power8.S: New file.
+	* sysdeps/powerpc/powerpc64/multiarch/strncpy.c (strncpy): Add
+	__strncpy_power8 implementation.
+	* sysdeps/powerpc/powerpc64/power8/stpncpy.S: New file.
+	* sysdeps/powerpc/powerpc64/power8/strncpy.S: New file.
+	* NEWS: Update.
+
 	* sysdeps/powerpc/powerpc64/multiarch/strncat-power7.c: New file.
 	* sysdeps/powerpc/powerpc64/multiarch/strncat-power7.S: Remove file.
 	* sysdeps/powerpc/powerpc64/power7/strncat.S: Likewise.
diff --git a/NEWS b/NEWS
index e02091802e..08b3daacd5 100644
--- a/NEWS
+++ b/NEWS
@@ -19,7 +19,8 @@ Version 2.21
   17744, 17745, 17746, 17747, 17748, 17775, 17777, 17780, 17781, 17782,
   17791, 17793, 17796, 17797, 17803, 17806, 17834
 
-* Optimized strcpy and stpcpy implementations for powerpc64/powerpc64le.
+* Optimized strcpy, stpcpy, strncpy, stpncpy implementations for
+  powerpc64/powerpc64le.
 
 * Added support for TSX lock elision of pthread mutexes on powerpc32, powerpc64
   and powerpc64le.  This may improve lock scaling of existing programs on
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index 74b2daac1d..18d337843c 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -17,9 +17,10 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
 		   stpcpy-power7 stpcpy-ppc64 \
 		   strrchr-power7 strrchr-ppc64 strncat-power7 strncat-ppc64 \
 		   strncpy-power7 strncpy-ppc64 \
-		   stpncpy-power7 stpncpy-ppc64 strcmp-power7 strcmp-ppc64 \
+		   stpncpy-power8 stpncpy-power7 stpncpy-ppc64 \
+		   strcmp-power7 strcmp-ppc64 \
 		   strcat-power8 strcat-power7 strcat-ppc64 memmove-power7 \
-		   memmove-ppc64 bcopy-ppc64
+		   memmove-ppc64 bcopy-ppc64 strncpy-power8
 
 CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
 CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index dbb21fddb9..132cb13eac 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -279,6 +279,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/powerpc/powerpc64/multiarch/strncpy.c.  */
   IFUNC_IMPL (i, name, strncpy,
 	      IFUNC_IMPL_ADD (array, i, strncpy,
+			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
+			      __strncpy_power8)
+	      IFUNC_IMPL_ADD (array, i, strncpy,
 			      hwcap & PPC_FEATURE_HAS_VSX,
 			      __strncpy_power7)
 	      IFUNC_IMPL_ADD (array, i, strncpy, 1,
@@ -287,6 +290,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/powerpc/powerpc64/multiarch/stpncpy.c.  */
   IFUNC_IMPL (i, name, stpncpy,
 	      IFUNC_IMPL_ADD (array, i, stpncpy,
+			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
+			      __stpncpy_power8)
+	      IFUNC_IMPL_ADD (array, i, stpncpy,
 			      hwcap & PPC_FEATURE_HAS_VSX,
 			      __stpncpy_power7)
 	      IFUNC_IMPL_ADD (array, i, stpncpy, 1,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power8.S b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power8.S
new file mode 100644
index 0000000000..d5d835de91
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power8.S
@@ -0,0 +1,39 @@
+/* Optimized stpncpy implementation for POWER8.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#define USE_AS_STPNCPY
+
+#undef EALIGN
+#define EALIGN(name, alignt, words)				\
+  .section ".text";						\
+  ENTRY_2(__stpncpy_power8)					\
+  .align ALIGNARG(alignt);					\
+  EALIGN_W_##words;						\
+  BODY_LABEL(__stpncpy_power8):					\
+  cfi_startproc;						\
+  LOCALENTRY(__stpncpy_power8)
+
+#undef END
+#define END(name)						\
+  cfi_endproc;							\
+  TRACEBACK(__stpncpy_power8)					\
+  END_2(__stpncpy_power8)
+
+#include <sysdeps/powerpc/powerpc64/power8/stpncpy.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
index 9e5a270303..0f4072f982 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
@@ -23,10 +23,13 @@
 
 extern __typeof (__stpncpy) __stpncpy_ppc attribute_hidden;
 extern __typeof (__stpncpy) __stpncpy_power7 attribute_hidden;
+extern __typeof (__stpncpy) __stpncpy_power8 attribute_hidden;
 
 libc_ifunc (__stpncpy,
-            (hwcap & PPC_FEATURE_HAS_VSX)
-            ? __stpncpy_power7
+            (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+            ? __stpncpy_power8 :
+              (hwcap & PPC_FEATURE_HAS_VSX)
+              ? __stpncpy_power7
             : __stpncpy_ppc);
 
 weak_alias (__stpncpy, stpncpy)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy-power8.S b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power8.S
new file mode 100644
index 0000000000..ed906a4394
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power8.S
@@ -0,0 +1,40 @@
+/* Optimized strncpy implementation for POWER8.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#undef EALIGN
+#define EALIGN(name, alignt, words)				\
+  .section ".text";						\
+  ENTRY_2(__strncpy_power8)					\
+  .align ALIGNARG(alignt);					\
+  EALIGN_W_##words;						\
+  BODY_LABEL(__strncpy_power8):					\
+  cfi_startproc;						\
+  LOCALENTRY(__strncpy_power8)
+
+#undef END
+#define END(name)						\
+  cfi_endproc;							\
+  TRACEBACK(__strncpy_power8)					\
+  END_2(__strncpy_power8)
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/power8/strncpy.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
index ae4e97a265..ffb0f23643 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
@@ -24,12 +24,15 @@
 
 extern __typeof (strncpy) __strncpy_ppc attribute_hidden;
 extern __typeof (strncpy) __strncpy_power7 attribute_hidden;
+extern __typeof (strncpy) __strncpy_power8 attribute_hidden;
 
 /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
  ifunc symbol properly. */
 libc_ifunc (strncpy,
-            (hwcap & PPC_FEATURE_HAS_VSX)
-            ? __strncpy_power7
+            (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+            ? __strncpy_power8 :
+              (hwcap & PPC_FEATURE_HAS_VSX)
+              ? __strncpy_power7
             : __strncpy_ppc);
 
 #endif
diff --git a/sysdeps/powerpc/powerpc64/power8/stpncpy.S b/sysdeps/powerpc/powerpc64/power8/stpncpy.S
new file mode 100644
index 0000000000..76a146609f
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/stpncpy.S
@@ -0,0 +1,20 @@
+/* Optimized stpncpy implementation for PowerPC64/POWER8.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define USE_AS_STPNCPY
+#include <sysdeps/powerpc/powerpc64/power8/strncpy.S>
diff --git a/sysdeps/powerpc/powerpc64/power8/strncpy.S b/sysdeps/powerpc/powerpc64/power8/strncpy.S
new file mode 100644
index 0000000000..5fda953526
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/strncpy.S
@@ -0,0 +1,424 @@
+/* Optimized strncpy/stpncpy implementation for PowerPC64/POWER8.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#ifdef USE_AS_STPNCPY
+# define FUNC_NAME __stpncpy
+#else
+# define FUNC_NAME strncpy
+#endif
+
+/* Implements the function
+
+   char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
+
+   or
+
+   char * [r3] stpncpy (char *dest [r3], const char *src [r4], size_t n [r5])
+
+   if USE_AS_STPCPY is defined.
+
+   The implementation uses unaligned doubleword access to avoid specialized
+   code paths depending of data alignment.  Although recent powerpc64 uses
+   64K as default, the page cross handling assumes minimum page size of
+   4k.  */
+
+	.machine  power7
+EALIGN (FUNC_NAME, 4, 0)
+
+        /* Check if the [src]+15 will cross a 4K page by checking if the bit
+           indicating the page size changes.  Basically:
+
+           uint64_t srcin = (uint64_t)src;
+           uint64_t ob = srcin & 4096UL;
+           uint64_t nb = (srcin+15UL) & 4096UL;
+           if (ob ^ nb)
+             goto pagecross;  */
+
+	addi	r10,r4,16
+	rlwinm	r9,r4,0,19,19
+
+	/* Since it is a leaf function, save some non-volatile registers on the
+	   protected/red zone.  */
+	std	r26,-48(r1)
+	std	r27,-40(r1)
+
+	rlwinm	r8,r10,0,19,19
+
+	std	r28,-32(r1)
+	std	r29,-24(r1)
+
+	cmpld	r7,r9,r8
+
+	std	r30,-16(r1)
+	std	r31,-8(r1)
+
+	beq	cr7,L(unaligned_lt_16)
+	rldicl	r9,r4,0,61
+	subfic	r8,r9,8
+	cmpld	cr7,r5,r8
+	bgt 	cr7,L(pagecross)
+
+	/* At this points there is 1 to 15 bytes to check and write.  Since it could
+	   be either from first unaligned 16 bytes access or from bulk copy, the code
+	   uses an unrolled byte read/write instead of trying to analyze the cmpb
+	   results.  */
+L(short_path):
+	mr	r9,r3
+L(short_path_1):
+	cmpdi	cr7,r5,0
+	beq	cr7,L(short_path_loop_end_1)
+L(short_path_2):
+	lbz	r10,0(r4)
+	cmpdi	cr7,r10,0
+	stb	r10,0(r9)
+	beq	cr7,L(zero_pad_start_1)
+	cmpdi	cr0,r5,1
+	addi	r8,r9,1
+	addi	r6,r5,-1
+	beq	cr0,L(short_path_loop_end_0)
+	lbz	r10,1(r4)
+	cmpdi	cr7,r10,0
+	stb	r10,1(r9)
+	beq	cr7,L(zero_pad_start_prepare_1)
+	addi	r10,r5,-3
+	b	L(short_path_loop_1)
+
+	.align	4
+L(short_path_loop):
+	lbz	r8,0(r4)
+	addi	r7,r10,-2
+	cmpdi	cr5,r8,0
+	stb	r8,0(r9)
+	beq	cr5,L(zero_pad_start_1)
+	beq	r7,L(short_path_loop_end_0)
+	lbz	r8,1(r4)
+	cmpdi	cr7,r8,0
+	stb	r8,1(r9)
+	beq	cr7,L(zero_pad_start)
+	mr	r10,r7
+L(short_path_loop_1):
+	addic.	r5,r5,-2
+	addi	r9,r9,2
+	cmpdi	cr7,r10,0
+	addi	r4,r4,2
+	addi	r6,r9,1
+	bne	cr0,L(short_path_loop)
+#ifdef USE_AS_STPNCPY
+	mr	r3,r9
+	b	L(short_path_loop_end)
+#endif
+
+L(short_path_loop_end_0):
+#ifdef USE_AS_STPNCPY
+	addi	r3,r9,1
+	b	L(short_path_loop_end)
+#endif
+L(short_path_loop_end_1):
+#ifdef USE_AS_STPNCPY
+	mr	r3,r9
+#endif
+L(short_path_loop_end):
+	/* Restore non-volatile registers.  */
+	ld	r26,-48(r1)
+	ld	r27,-40(r1)
+	ld	r28,-32(r1)
+	ld	r29,-24(r1)
+	ld	r30,-16(r1)
+	ld	r31,-8(r1)
+	blr
+
+	/* This code pads the remainder dest with NULL bytes.  The algorithm
+	   calculate the remanining size and issues a doubleword unrolled
+	   loops followed by a byte a byte set.  */
+	.align	4
+L(zero_pad_start):
+	mr	r5,r10
+	mr	r9,r6
+L(zero_pad_start_1):
+	srdi.	r8,r5,r3
+	mr	r10,r9
+#ifdef USE_AS_STPNCPY
+	mr	r3,r9
+#endif
+	beq-	cr0,L(zero_pad_loop_b_start)
+	cmpldi	cr7,r8,1
+	li	cr7,0
+	std	r7,0(r9)
+	beq	cr7,L(zero_pad_loop_b_prepare)
+	addic.	r8,r8,-2
+	addi	r10,r9,r16
+	std	r7,8(r9)
+	beq	cr0,L(zero_pad_loop_dw_2)
+	std	r7,16(r9)
+	li	r9,0
+	b	L(zero_pad_loop_dw_1)
+
+	.align	4
+L(zero_pad_loop_dw):
+	addi	r10,r10,16
+	std	r9,-8(r10)
+	beq	cr0,L(zero_pad_loop_dw_2)
+	std	r9,0(r10)
+L(zero_pad_loop_dw_1):
+	cmpldi	cr7,r8,1
+	std	r9,0(r10)
+	addic.	r8,r8,-2
+	bne	cr7,L(zero_pad_loop_dw)
+	addi	r10,r10,8
+L(zero_pad_loop_dw_2):
+	rldicl	r5,r5,0,61
+L(zero_pad_loop_b_start):
+	cmpdi	cr7,r5,0
+	addi	r5,r5,-1
+	addi	r9,r10,-1
+	add	r10,r10,5
+	subf	r10,r9,r10
+	li	r8,0
+	beq-	cr7,L(short_path_loop_end)
+
+	/* Write remaining 1-8 bytes.  */
+        .align  4
+	addi	r9,r9,1
+	mtocrf	0x1,r10
+	bf	29,4f
+        stw     r8,0(r9)
+        addi	r9,r9,4
+
+        .align  4
+4:      bf      30,2f
+        sth     r8,0(r9)
+        addi	r9,r9,2
+
+        .align  4
+2:      bf	31,1f
+        stb	r8,0(r9)
+
+	/* Restore non-volatile registers.  */
+1:	ld	r26,-48(r1)
+	ld	r27,-40(r1)
+	ld	r28,-32(r1)
+	ld	r29,-24(r1)
+	ld	r30,-16(r1)
+	ld	r31,-8(r1)
+	blr
+
+	/* The common case where [src]+16 will not cross a 4K page boundary.
+	   In this case the code fast check the first 16 bytes by using doubleword
+	   read/compares and update destiny if neither total size or null byte
+	   is found in destiny. */
+	.align	4
+L(unaligned_lt_16):
+	cmpldi	cr7,r5,7
+	ble	cr7,L(short_path)
+	ld	r7,0(r4)
+	li	r8,0
+	cmpb	r8,r7,r8
+	cmpdi	cr7,r8,0
+	bne	cr7,L(short_path_prepare_2)
+	addi	r6,r5,-8
+	std	r7,0(r3)
+	addi	r9,r3,r8
+	cmpldi	cr7,r6,7
+	addi	r7,r4,8
+	ble	cr7,L(short_path_prepare_1_1)
+	ld	r4,8(r4)
+	cmpb	r8,r4,r8
+	cmpdi	cr7,r8,0
+	bne	cr7,L(short_path_prepare_2_1)
+	std	r4,8(r3)
+	addi	r29,r3,16
+	addi	r5,r5,-16
+	/* Neither the null byte was found or total length was reached,
+	   align to 16 bytes and issue a bulk copy/compare.  */
+	b	L(align_to_16b)
+
+	/* In the case of 4k page boundary cross, the algorithm first align
+	   the address to a doubleword, calculate a mask based on alignment
+	   to ignore the bytes and continue using doubleword.  */
+	.align	4
+L(pagecross):
+	rldicr	r11,r4,0,59	/* Align the address to 8 bytes boundary.  */
+	li	r6,-1		/* MASK = 0xffffffffffffffffUL.  */
+	sldi	r9,r9,3		/* Calculate padding.  */
+	ld	r7,0(r11)	/* Load doubleword from memory.  */
+#ifdef __LITTLE_ENDIAN__
+	sld	r9,r6,r9	/* MASK = MASK << padding.  */
+#else
+	srd	r9,r6,r9	/* MASK = MASK >> padding.  */
+#endif
+	orc	r9,r7,r9	/* Mask bits that are not part of the
+				   string.  */
+	li	cr7,0
+	cmpb	r9,r9,r7	/* Check for null bytes in DWORD1.  */
+	cmpdi	cr7,r9,0
+	bne	cr7,L(short_path_prepare_2)
+	subf	r8,r8,r5	/* Adjust total length.  */
+	cmpldi	cr7,r8,8	/* Check if length was reached.  */
+	ble	cr7,L(short_path_prepare_2)
+
+	/* For next checks we have aligned address, so we check for more
+	   three doublewords to make sure we can read 16 unaligned bytes
+	   to start the bulk copy with 16 aligned addresses.  */
+	ld	cr7,8(r11)
+	cmpb	r9,r7,r9
+	cmpdi	cr7,r9,0
+	bne	cr7,L(short_path_prepare_2)
+	addi	cr7,r8,-8
+	cmpldi	cr7,r7,8
+	ble	cr7,L(short_path_prepare_2)
+	ld	cr7,16(r11)
+	cmpb	r9,r7,r9
+	cmpdi	cr7,r9,0
+	bne	cr7,L(short_path_prepare_2)
+	addi	r8,r8,-16
+	cmpldi	r7,r8,8
+	ble	cr7,L(short_path_prepare_2)
+	ld	r8,24(r11)
+	cmpb	r9,r8,r9
+	cmpdi	r7,r9,0
+	bne	cr7,L(short_path_prepare_2)
+
+	/* No null byte found in the 32 bytes readed and length not reached,
+	   read source again using unaligned loads and store them.  */
+	ld	r9,0(r4)
+	addi	r29,r3,16
+	addi	r5,r5,-16
+	std	r9,0(r3)
+	ld	r9,8(r4)
+	std	r9,8(r3)
+
+	/* Align source to 16 bytes and adjust destiny and size.  */
+L(align_to_16b):
+	rldicl	r9,r10,0,60
+	rldicr	r28,r10,0,59
+	add	r12,r5,r9
+	subf	r29,r9,r29
+
+	/* The bulk read/compare/copy loads two doublewords, compare and merge
+	   in a single register for speed.  This is an attempt to speed up the
+	   null-checking process for bigger strings.  */
+
+	cmpldi	cr7,r12,15
+	ble	cr7,L(short_path_prepare_1_2)
+
+	/* Main loop for large sizes, unrolled 2 times to get better use of
+	   pipeline.  */
+	ld	r8,0(28)
+	ld	r10,8(28)
+	li	r9,0
+	cmpb	r7,r8,r9
+	cmpb	r9,r10,r9
+	or.	r6,r9,r7
+	bne	cr0,L(short_path_prepare_2_3)
+	addi	r5,r12,-16
+	addi	r4,r28,16
+	std	r8,0(r29)
+	std	r10,8(r29)
+	cmpldi	cr7,r5,15
+	addi	r9,r29,16
+	ble	cr7,L(short_path_1)
+	mr	r11,r28
+	mr	r6,r29
+	li	r30,0
+	subfic	r26,r4,48
+	subfic	r27,r9,48
+
+	b	L(loop_16b)
+
+	.align	4
+L(loop_start):
+	ld	r31,0(r11)
+	ld	r10,8(r11)
+	cmpb	r0,r31,r7
+	cmpb	r8,r10,r7
+	or.	r7,r0,r8
+	addi	r5,r5,-32
+	cmpldi	cr7,r5,15
+	add	r4,r4,r26
+	add	r9,r9,r27
+	bne	cr0,L(short_path_prepare_2_2)
+	add	r4,r28,r4
+	std	r31,0(r6)
+	add	r9,r29,r9
+	std	r10,8(r6)
+	ble	cr7,L(short_path_1)
+
+L(loop_16b):
+	ld	r10,16(r11)
+	ld	r0,24(r11)
+	cmpb	r8,r10,r30
+	cmpb	r7,r0,r30
+	or.	r7,r8,r7
+	addi	r12,r12,-32
+	cmpldi	r7,r12,15
+	addi	r11,r11,32
+	bne	cr0,L(short_path_2)
+	std	r10,16(r6)
+	addi	r6,r6,32
+	std	r0,-8(r6)
+	bgt	cr7,L(loop_start)
+
+	mr	r5,r12
+	mr	r4,r11
+	mr	r9,r6
+	b	L(short_path_1)
+
+	.align	4
+L(short_path_prepare_1_1):
+	mr	r5,r6
+	mr	r4,r7
+	b	L(short_path_1)
+L(short_path_prepare_1_2):
+	mr	r5,r12
+	mr	r4,r28
+	mr	r9,r29
+	b	L(short_path_1)
+L(short_path_prepare_2):
+	mr	r9,r3
+	b	L(short_path_2)
+L(short_path_prepare_2_1):
+	mr	r5,r6
+	mr	r4,r7
+	b	L(short_path_2)
+L(short_path_prepare_2_2):
+	mr	r5,r12
+	mr	r4,r11
+	mr	r9,r6
+	b	L(short_path_2)
+L(short_path_prepare_2_3):
+	mr	r5,r12
+	mr	r4,r28
+	mr	r9,r29
+	b	L(short_path_2)
+L(zero_pad_loop_b_prepare):
+	addi	r10,r9,8
+	rldicl	r5,r5,0,61
+	b	L(zero_pad_loop_b_start)
+L(zero_pad_start_prepare_1):
+	mr	r5,r6
+	mr	r9,r8
+	b	L(zero_pad_start_1)
+END (FUNC_NAME)
+
+#ifdef USE_AS_STPNCPY
+libc_hidden_def (__stpncpy)
+#else
+libc_hidden_builtin_def (strncpy)
+#endif