From 9f2f36e5a91c2ce6edba5415e176155eb1008ae1 Mon Sep 17 00:00:00 2001 From: Adhemerval Zanella Date: Tue, 23 Dec 2014 13:39:23 -0500 Subject: powerpc: Optimized strncat for POWER7/PPC64 With 3eb38795dbbbd816 (Simplify strncat) the generic algorithm uses strlen, strnlen, and memcpy. This is faster than the current POWER7 implementation, especially for unaligned strings (where the POWER7 code uses byte-by-byte operations). This patch removes the assembly implementation and uses a multiarch specialization based on the default algorithm calling optimized POWER7 symbols. --- .../powerpc/powerpc64/multiarch/strncat-power7.S | 42 ---- .../powerpc/powerpc64/multiarch/strncat-power7.c | 31 +++ sysdeps/powerpc/powerpc64/power7/strncat.S | 228 --------------------- 3 files changed, 31 insertions(+), 270 deletions(-) delete mode 100644 sysdeps/powerpc/powerpc64/multiarch/strncat-power7.S create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strncat-power7.c delete mode 100644 sysdeps/powerpc/powerpc64/power7/strncat.S (limited to 'sysdeps/powerpc') diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncat-power7.S b/sysdeps/powerpc/powerpc64/multiarch/strncat-power7.S deleted file mode 100644 index 6216284d6f..0000000000 --- a/sysdeps/powerpc/powerpc64/multiarch/strncat-power7.S +++ /dev/null @@ -1,42 +0,0 @@ -/* Optimized strncat implementation for POWER7. - Copyright (C) 2014-2015 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. 
- - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - . */ - -#include - -#undef EALIGN -#define EALIGN(name, alignt, words) \ - .section ".text"; \ - ENTRY_2(__strncat_power7) \ - .align ALIGNARG(alignt); \ - EALIGN_W_##words; \ - BODY_LABEL(__strncat_power7): \ - cfi_startproc; \ - LOCALENTRY(__strncat_power7) - -#undef END -#define END(name) \ - cfi_endproc; \ - TRACEBACK(__strncat_power7) \ - END_2(__strncat_power7) - -#undef libc_hidden_builtin_def -#define libc_hidden_builtin_def(name) - -#define STRLEN __strlen_power7 - -#include diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncat-power7.c b/sysdeps/powerpc/powerpc64/multiarch/strncat-power7.c new file mode 100644 index 0000000000..39b1aebe9b --- /dev/null +++ b/sysdeps/powerpc/powerpc64/multiarch/strncat-power7.c @@ -0,0 +1,31 @@ +/* Copyright (C) 2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . 
*/ + +#include + +#define STRNCAT __strncat_power7 + +extern __typeof (strncat) __strncat_power7 attribute_hidden; +extern __typeof (strlen) __strlen_power7 attribute_hidden; +extern __typeof (strnlen) __strnlen_power7 attribute_hidden; +extern __typeof (memcpy) __memcpy_power7 attribute_hidden; + +#define strlen __strlen_power7 +#define __strnlen __strnlen_power7 +#define memcpy __memcpy_power7 + +#include diff --git a/sysdeps/powerpc/powerpc64/power7/strncat.S b/sysdeps/powerpc/powerpc64/power7/strncat.S deleted file mode 100644 index 05502acbbf..0000000000 --- a/sysdeps/powerpc/powerpc64/power7/strncat.S +++ /dev/null @@ -1,228 +0,0 @@ -/* Optimized strncat implementation for PowerPC64/POWER7. - - Copyright (C) 2014-2015 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - . */ - -/* The algorithm is as follows for aligned memory access : - - if address of s2 is divisible by 0x7UL, - perform aligned doubleword catenation - else - perform unaligned catenation - - The aligned comparison are made using cmpb instructions. 
*/ - -/* char* [r3] strncat (const char *s1 [r3], - const char *s2 [r4], - size_t size [r5]) */ - -#include - -#ifndef STRNCAT -# undef strncat -# define STRNCAT strncat -#endif - -#ifndef STRLEN -/* For builds with no IFUNC support, local calls should be made to internal - GLIBC symbol (created by libc_hidden_builtin_def). */ -# ifdef SHARED -# define STRLEN __GI_strlen -# else -# define STRLEN strlen -# endif -#endif - -#define FRAMESIZE (FRAME_MIN_SIZE+32) - - .machine power7 -EALIGN(STRNCAT, 4, 0) - CALL_MCOUNT 3 - - mflr r0 /* Load link register LR to r0. */ - -/* We shall use r29, r30 and r31 non volatile register for retention. - Save all the callee registers in the GPR save area. */ - std r29, -24(r1) /* Save callers register r29. */ - std r30, -16(r1) /* Save callers register r30. */ - std r31, -8(r1) /* Save callers register r31. */ - - std r0, 16(r1) /* Store the link register. */ - stdu r1, -FRAMESIZE(r1) /* Create the stack frame. */ - -/* Improve performance with CPU pre-fetch. */ - dcbt 0, r3 /* Pre-fetch str to avoid cache - miss. */ - dcbt 0, r4 /* Pre-fetch accept to avoid cache - miss. */ - - mr. r29, r5 /* Save "n" in r29. */ - mr r30, r3 /* Save "s1" in r30 from r3. */ - beq cr0,L(done) - - mr r31, r4 /* Save "s2" in r31 from r4. */ - bl STRLEN /* Call optimized strlen on s1; goto - end of s1. */ - nop - cmpldi cr7, r29, 7 /* If s2 is <=7 process - byte-by-byte. */ - add r3, r30, r3 /* Grab the last character of s1. */ - bgt cr7,L(alignment) /* Process by aligned strings. */ - - cmpldi cr7, r29, 3 /* If n is >= 4, we can - byte-unroll. */ - addi r9, r3, -1 /* Make "s1" point before next - character, increment when read. */ - bgt cr7, L(bytes_unroll) /* Process each byte. */ - -L(byte_by_byte): - lbz r10, 0(r31) - addi r8, r9, 1 - cmpdi cr7, r10, 0 /* Check for NULL in "s2". 
*/ - stb r10, 1(r9) - beq cr7, L(done) - add r9, r9, r29 - subf r9, r8, r9 - addi r9, r9, 1 - mtctr r9 - b L(branch2) - .p2align 4 -L(branch1): - lbzu r10, 1(r31) - cmpdi cr7, r10, 0 - stbu r10, 1(r8) - beq cr7,L(done) -L(branch2): - mr r9, r8 - bdnz L(branch1) - beq cr7,L(done) -L(nullTerminate): - li r10, 0 /* Load NULL for termination. */ - stb r10, 1(r9) /* Append or terminate s1 with - NULL. */ - .p2align 4 /* A small section here. */ -L(done): /* We return now. */ - addi r1, r1, FRAMESIZE /* Restore stack pointer. */ - mr r3, r30 /* Set the return value length of - string. */ - ld r0, 16(r1) /* Read the saved link register. */ - ld r29, -24(r1) /* Restore save register r29. */ - ld r30, -16(r1) /* Restore save register r30. */ - ld r31, -8(r1) /* Restore save register r31. */ - mtlr r0 /* Restore link register. */ - blr /* Branch to link register. */ - - .p2align 4 -L(alignment): - rldicl. r9, r31, 0, 61 /* Check if s2 is 8byte aligned */ - beq cr0,L(dwordAligned) - - .p2align 4 -/* Unaligned bytes in string, so process byte by byte. - POWER7 has performance gains over loop unroll. */ -L(bytes_unroll): - addi r9, r3, -1 - srdi r10, r29, 2 - mtctr r10 - b L(L10) - .p2align 4 -L(L44): - lbz r10, 1(r31) /* Load byte. */ - cmpdi cr7, r10, 0 /* Compare ; if byte not zero, - continue. */ - stb r10, 2(r9) /* Store byte */ - beq cr7, L(done) - addi r31, r31, 4 - - lbz r10, -2(r31) /* Perform loop unroll here on byte - load and store. */ - cmpdi cr7, r10, 0 - stb r10, 3(r9) - beq cr7, L(done) - - lbz r10, -1(r31) /* Loop unroll here. */ - cmpdi cr7, r10, 0 - stbu r10, 4(r9) - beq cr7, L(done) - - bdz L(leftNbytes) - -L(L10): - lbz r10, 0(r31) /* Loop unroll here. */ - cmpdi cr7, r10, 0 - stb r10, 1(r9) - bne cr7,L(L44) - b L(done) - .p2align 4 -/* If s2 is double word aligned, we load and store double word. */ -L(dwordAligned): -/* read, write 8 bytes at a time */ - srdi r8, r29, 3 /* Compute count for CTR to loop; - count = n/8. */ - li r7, 0 /* Load r7 with NULL. 
*/ - li r10, 0 /* Load r10 with MASK '0'. */ - - mtctr r8 /* Move count to CTR. */ -L(loop8): - ld r9, 0(r31) /* Read double word from s2. */ - cmpb r6, r9, r10 /* Compare bytes in s2 we read - just now. */ - cmpdi r6, 0 /* If cmpb returned NULL, - we continue. */ - bne+ L(a8) - std r9, 0(r3) /* Append double word from s2 - with s1. */ - addi r3, r3, 8 /* Increment s1. */ - addi r31, r31, 8 /* Increment s2. */ - subi r29, r29, 8 /* Decrement count by 8. */ - bdnz L(loop8) /* Continue until "count" is - non zero. */ - -L(a8): - cmpdi r29, 0 /* If "n" is already zero, we skip. */ - beq+ L(align8align) - - mtctr r29 /* Process left over bytes in "n". */ -L(unaligned0): - lbz r9, 0(r31) /* Read a byte from s2. */ - cmpw r9, r7 /* If byte is NULL, we stop here . */ - beq+ L(align8align) /* Skip processing further if NULL. */ - stb r9, 0(r3) /* If not NULL, store byte into s1. */ - addi r3, r3, 1 /* Increment s1 by 1. */ - addi r31, r31, 1 /* Increment s2 by 1. */ - bdnz L(unaligned0) /* Decrement counter "n" and loop - until non zero. */ -L(align8align): - stb r7, 0(r3) /* Terminate s1 with NULL. */ - - addi r1, r1, FRAMESIZE /* Restore stack pointer. */ - mr r3, r30 /* Set the return value, length of - string. */ - ld r0, 16(r1) /* Read the saved link register. */ - ld r29, -24(r1) /* Restore save register r29. */ - ld r30, -16(r1) /* Restore save register r30. */ - ld r31, -8(r1) /* Restore save register r31. */ - mtlr r0 /* Restore link register. */ - blr /* Branch to link register */ - - .p2align 4 -L(leftNbytes): - rldicl. r29, r29, 0, 62 /* Check if n>0 and n < 4 bytes. */ - bne cr0,L(byte_by_byte) /* Process bytes one by one. */ - b L(nullTerminate) /* Now, finish catenation with - NULL termination. */ -END(STRNCAT) -- cgit 1.4.1