diff options
Diffstat (limited to 'sysdeps/powerpc/powerpc64/power7/strncpy.S')
-rw-r--r-- | sysdeps/powerpc/powerpc64/power7/strncpy.S | 722 |
1 files changed, 0 insertions, 722 deletions
diff --git a/sysdeps/powerpc/powerpc64/power7/strncpy.S b/sysdeps/powerpc/powerpc64/power7/strncpy.S deleted file mode 100644 index 0224f74898..0000000000 --- a/sysdeps/powerpc/powerpc64/power7/strncpy.S +++ /dev/null @@ -1,722 +0,0 @@ -/* Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> - -/* Implements the functions - - char * [r3] strncpy (char *dst [r3], const char *src [r4], size_t n [r5]) - - AND - - char * [r3] stpncpy (char *dst [r3], const char *src [r4], size_t n [r5]) - - The algorithm is as follows: - > if src and dest are 8 byte aligned, perform double word copy - else - > copy byte by byte on unaligned addresses. - - The aligned comparison are made using cmpb instructions. */ - -/* The focus on optimization for performance improvements are as follows: - 1. data alignment [gain from aligned memory access on read/write] - 2. POWER7 gains performance with loop unrolling/unwinding - [gain by reduction of branch penalty]. - 3. The final pad with null bytes is done by calling an optimized - memset. */ - -#ifdef USE_AS_STPNCPY -# ifndef STPNCPY -# define FUNC_NAME __stpncpy -# else -# define FUNC_NAME STPNCPY -# endif -#else -# ifndef STRNCPY -# define FUNC_NAME strncpy -# else -# define FUNC_NAME STRNCPY -# endif -#endif /* !USE_AS_STPNCPY */ - -#define FRAMESIZE (FRAME_MIN_SIZE+32) - -#ifndef MEMSET -/* For builds with no IFUNC support, local calls should be made to internal - GLIBC symbol (created by libc_hidden_builtin_def). */ -# ifdef SHARED -# define MEMSET __GI_memset -# else -# define MEMSET memset -# endif -#endif - - .machine power7 -EALIGN(FUNC_NAME, 4, 0) - CALL_MCOUNT 3 - - mflr r0 /* load link register LR to r0 */ - or r10, r3, r4 /* to verify source and destination */ - rldicl. r8, r10, 0, 61 /* is double word aligned .. ? */ - - std r19, -8(r1) /* save callers register , r19 */ - std r18, -16(r1) /* save callers register , r18 */ - std r0, 16(r1) /* store the link register */ - stdu r1, -FRAMESIZE(r1) /* create the stack frame */ - - mr r9, r3 /* save r3 into r9 for use */ - mr r18, r3 /* save r3 for retCode of strncpy */ - bne 0, L(unaligned) - -L(aligned): - srdi r11, r5, 3 /* compute count for CTR ; count = n/8 */ - cmpldi cr7, r11, 3 /* if count > 4 ; perform unrolling 4 times */ - ble 7, L(update1) - - ld r10, 0(r4) /* load doubleWord from src */ - cmpb r8, r10, r8 /* compare src with NULL ,we read just now */ - cmpdi cr7, r8, 0 /* if cmpb returned NULL ; we continue */ - bne cr7, L(update3) - - std r10, 0(r3) /* copy doubleword at offset=0 */ - ld r10, 8(r4) /* load next doubleword from offset=8 */ - cmpb r8, r10, r8 /* compare src with NULL , we read just now */ - cmpdi cr7, r8, 0 /* if cmpb returned NULL ; we continue */ - bne 7,L(HopBy8) - - addi r8, r11, -4 - mr r7, r3 - srdi r8, r8, 2 - mr r6, r4 - addi r8, r8, 1 - li r12, 0 - mtctr r8 - b L(dwordCopy) - - .p2align 4 -L(dWordUnroll): - std r8, 16(r9) - ld r8, 24(r4) /* load dword,perform loop unrolling again */ - cmpb r10, r8, r10 - cmpdi cr7, r10, 0 - bne cr7, L(HopBy24) - - std r8, 24(r7) /* copy dword at offset=24 */ - addi r9, r9, 32 - addi r4, r4, 32 - bdz L(leftDwords) /* continue with loop on counter */ - - ld r3, 32(r6) - cmpb r8, r3, r10 - cmpdi cr7, r8, 0 - bne cr7, L(update2) - - std r3, 32(r7) - ld r10, 40(r6) - cmpb r8, r10, r8 - cmpdi cr7, r8, 0 - bne cr7, L(HopBy40) - - mr r6, r4 /* update values */ - mr r7, r9 - mr r11, r0 - mr r5, r19 - -L(dwordCopy): - std r10, 8(r9) /* copy dword at offset=8 */ - addi r19, r5, -32 - addi r0, r11, -4 - ld r8, 16(r4) - cmpb r10, r8, r12 - cmpdi cr7, r10, 0 - beq cr7, L(dWordUnroll) - - addi r9, r9, 16 /* increment dst by 16 */ - addi r4, r4, 16 /* increment src by 16 */ - addi r5, r5, -16 /* decrement length 'n' by 16 */ - addi r0, r11, -2 /* decrement loop counter */ - -L(dWordUnrollOFF): - ld r10, 0(r4) /* load first dword */ - li r8, 0 /* load mask */ - cmpb r8, r10, r8 - cmpdi cr7, r8, 0 - bne cr7, L(byte_by_byte) - mtctr r0 - li r7, 0 - b L(CopyDword) - - .p2align 4 -L(loadDWordandCompare): - ld r10, 0(r4) - cmpb r8, r10, r7 - cmpdi cr7, r8, 0 - bne cr7, L(byte_by_byte) - -L(CopyDword): - addi r9, r9, 8 - std r10, -8(r9) - addi r4, r4, 8 - addi r5, r5, -8 - bdnz L(loadDWordandCompare) - -L(byte_by_byte): - cmpldi cr7, r5, 3 - ble cr7, L(verifyByte) - srdi r10, r5, 2 - mr r19, r9 - mtctr r10 - b L(firstByteUnroll) - - .p2align 4 -L(bytes_unroll): - lbz r10, 1(r4) /* load byte from src */ - cmpdi cr7, r10, 0 /* compare for NULL */ - stb r10, 1(r19) /* store byte to dst */ - beq cr7, L(updtDestComputeN2ndByte) - - addi r4, r4, 4 /* advance src */ - - lbz r10, -2(r4) /* perform loop unrolling for byte r/w */ - cmpdi cr7, r10, 0 - stb r10, 2(r19) - beq cr7, L(updtDestComputeN3rdByte) - - lbz r10, -1(r4) /* perform loop unrolling for byte r/w */ - addi r19, r19, 4 - cmpdi cr7, r10, 0 - stb r10, -1(r19) - beq cr7, L(ComputeNByte) - - bdz L(update0) - -L(firstByteUnroll): - lbz r10, 0(r4) /* perform loop unrolling for byte r/w */ - cmpdi cr7, 10, 0 - stb r10, 0(r19) - bne cr7, L(bytes_unroll) - addi r19, r19, 1 - -L(ComputeNByte): - subf r9, r19, r9 /* compute 'n'n bytes to fill */ - add r8, r9, r5 - -L(zeroFill): - cmpdi cr7, r8, 0 /* compare if length is zero */ - beq cr7, L(update3return) - - mr r3, r19 /* fill buffer with */ - li r4, 0 /* zero fill buffer */ - mr r5, r8 /* how many bytes to fill buffer with */ - bl MEMSET /* call optimized memset */ - nop - -L(update3return): -#ifdef USE_AS_STPNCPY - addi r3, r19, -1 /* update return value */ -#endif - -L(hop2return): -#ifndef USE_AS_STPNCPY - mr r3, r18 /* set return value */ -#endif - addi r1, r1, FRAMESIZE /* restore stack pointer */ - ld r0, 16(r1) /* read the saved link register */ - ld r18, -16(r1) /* restore callers save register, r18 */ - ld r19, -8(r1) /* restore callers save register, r19 */ - mtlr r0 /* branch to link register */ - blr /* return */ - - .p2align 4 -L(update0): - mr r9, r19 - - .p2align 4 -L(verifyByte): - rldicl. r8, r5, 0, 62 -#ifdef USE_AS_STPNCPY - mr r3, r9 -#endif - beq cr0, L(hop2return) - mtctr r8 - addi r4, r4, -1 - mr r19, r9 - b L(oneBYone) - - .p2align 4 -L(proceed): - bdz L(done) - -L(oneBYone): - lbzu r10, 1(r4) /* copy byte */ - addi r19, r19, 1 - addi r8, r8, -1 - cmpdi cr7, r10, 0 - stb r10, -1(r19) - bne cr7, L(proceed) - b L(zeroFill) - - .p2align 4 -L(done): - addi r1, r1, FRAMESIZE /* restore stack pointer */ -#ifdef USE_AS_STPNCPY - mr r3, r19 /* set the return value */ -#else - mr r3, r18 /* set the return value */ -#endif - ld r0, 16(r1) /* read the saved link register */ - ld r18, -16(r1) /* restore callers save register, r18 */ - ld r19, -8(r1) /* restore callers save register, r19 */ - mtlr r0 /* branch to link register */ - blr /* return */ - -L(update1): - mr r0, r11 - mr r19, r5 - - .p2align 4 -L(leftDwords): - cmpdi cr7, r0, 0 - mr r5, r19 - bne cr7, L(dWordUnrollOFF) - b L(byte_by_byte) - - .p2align 4 -L(updtDestComputeN2ndByte): - addi r19, r19, 2 /* update dst by 2 */ - subf r9, r19, r9 /* compute distance covered */ - add r8, r9, r5 - b L(zeroFill) - - .p2align 4 -L(updtDestComputeN3rdByte): - addi r19, r19, 3 /* update dst by 3 */ - subf r9, r19, r9 /* compute distance covered */ - add r8, r9, r5 - b L(zeroFill) - - .p2align 4 -L(HopBy24): - addi r9, r9, 24 /* increment dst by 24 */ - addi r4, r4, 24 /* increment src by 24 */ - addi r5, r5, -24 /* decrement length 'n' by 24 */ - addi r0, r11, -3 /* decrement loop counter */ - b L(dWordUnrollOFF) - - .p2align 4 -L(update2): - mr r5, r19 - b L(dWordUnrollOFF) - - .p2align 4 -L(HopBy40): - addi r9, r7, 40 /* increment dst by 40 */ - addi r4, r6, 40 /* increment src by 40 */ - addi r5, r5, -40 /* decrement length 'n' by 40 */ - addi r0, r11, -5 /* decrement loop counter */ - b L(dWordUnrollOFF) - -L(update3): - mr r0, r11 - b L(dWordUnrollOFF) - -L(HopBy8): - addi r9, r3, 8 /* increment dst by 8 */ - addi r4, r4, 8 /* increment src by 8 */ - addi r5, r5, -8 /* decrement length 'n' by 8 */ - addi r0, r11, -1 /* decrement loop counter */ - b L(dWordUnrollOFF) - -L(unaligned): - cmpdi r5, 16 /* Proceed byte by byte for less than 16 */ - ble L(byte_by_byte) - rldicl r7, r3, 0, 61 - rldicl r6, r4, 0, 61 - cmpdi r6, 0 /* Check src alignment */ - beq L(srcaligndstunalign) - /* src is unaligned */ - rlwinm r10, r4, 3,26,28 /* Calculate padding. */ - clrrdi r4, r4, 3 /* Align the addr to dw boundary */ - ld r8, 0(r4) /* Load doubleword from memory. */ - li r0, 0 - /* Discard bits not part of the string */ -#ifdef __LITTLE_ENDIAN__ - srd r7, r8, r10 -#else - sld r7, r8, r10 -#endif - cmpb r0, r7, r0 /* Compare each byte against null */ - /* Discard bits not part of the string */ -#ifdef __LITTLE_ENDIAN__ - sld r0, r0, r10 -#else - srd r0, r0, r10 -#endif - cmpdi r0, 0 - bne L(bytebybyte) /* if it has null, copy byte by byte */ - subfic r6, r6, 8 - rlwinm r12, r3, 3,26,28 /* Calculate padding in bits. */ - rldicl r9, r3, 0, 61 /* Calculate padding in bytes. */ - addi r3, r3, -1 - - cmpdi r12, 0 /* check dest alignment */ - beq L(srcunaligndstalign) - - /* both src and dst unaligned */ -#ifdef __LITTLE_ENDIAN__ - sld r8, r7, r10 - mr r11, r10 - addi r11, r11, -8 /* Adjust byte pointer on loaded dw */ -#else - srd r8, r7, r10 - subfic r11, r10, 64 -#endif - /* dst alignment is greater then src alignment? */ - cmpd cr7, r12, r10 - ble cr7, L(dst_align_small) - /* src alignment is less than dst */ - - /* Calculate the dst alignment difference */ - subfic r7, r9, 8 - mtctr r7 - - /* Write until dst is aligned */ - cmpdi r0, r7, 4 - blt L(storebyte1) /* less than 4, store byte by byte */ - beq L(equal1) /* if its 4, store word */ - addi r0, r7, -4 /* greater than 4, so stb and stw */ - mtctr r0 -L(storebyte1): -#ifdef __LITTLE_ENDIAN__ - addi r11, r11, 8 /* Adjust byte pointer on loaded dw */ -#else - addi r11, r11, -8 -#endif - srd r7, r8, r11 - stbu r7, 1(r3) - addi r5, r5, -1 - bdnz L(storebyte1) - - subfic r7, r9, 8 /* Check the remaining bytes */ - cmpdi r0, r7, 4 - blt L(proceed1) - - .align 4 -L(equal1): -#ifdef __LITTLE_ENDIAN__ - addi r11, r11, 8 /* Adjust byte pointer on loaded dw */ - srd r7, r8, r11 -#else - subfic r11, r11, 64 - sld r7, r8, r11 - srdi r7, r7, 32 -#endif - stw r7, 1(r3) - addi r3, r3, 4 - addi r5, r5, -4 - -L(proceed1): - mr r7, r8 - /* calculate the Left over bytes to be written */ - subfic r11, r10, 64 - subfic r12, r12, 64 - subf r12, r12, r11 /* remaining bytes on second dw */ - subfic r10, r12, 64 /* remaining bytes on first dw */ - subfic r9, r9, 8 - subf r6, r9, r6 /* recalculate padding */ -L(srcunaligndstalign): - addi r3, r3, 1 - subfic r12, r10, 64 /* remaining bytes on second dw */ - addi r4, r4, 8 - li r0,0 - b L(storedouble) - - .align 4 -L(dst_align_small): - mtctr r6 - /* Write until src is aligned */ -L(storebyte2): -#ifdef __LITTLE_ENDIAN__ - addi r11, r11, 8 /* Adjust byte pointer on dw */ -#else - addi r11, r11, -8 -#endif - srd r7, r8, r11 - stbu r7, 1(r3) - addi r5, r5, -1 - bdnz L(storebyte2) - - addi r4, r4, 8 /* Increment src pointer */ - addi r3, r3, 1 /* Increment dst pointer */ - mr r9, r3 - li r8, 0 - cmpd cr7, r12, r10 - beq cr7, L(aligned) - rldicl r6, r3, 0, 61 /* Recalculate padding */ - mr r7, r6 - - /* src is algined */ -L(srcaligndstunalign): - mr r9, r3 - mr r6, r7 - ld r8, 0(r4) - subfic r10, r7, 8 - mr r7, r8 - li r0, 0 /* Check null */ - cmpb r0, r8, r0 - cmpdi r0, 0 - bne L(byte_by_byte) /* Do byte by byte if there is NULL */ - rlwinm r12, r3, 3,26,28 /* Calculate padding */ - addi r3, r3, -1 - /* write byte by byte until aligned */ -#ifdef __LITTLE_ENDIAN__ - li r11, -8 -#else - li r11, 64 -#endif - mtctr r10 - cmpdi r0, r10, 4 - blt L(storebyte) - beq L(equal) - addi r0, r10, -4 - mtctr r0 -L(storebyte): -#ifdef __LITTLE_ENDIAN__ - addi r11, r11, 8 /* Adjust byte pointer on dw */ -#else - addi r11, r11, -8 -#endif - srd r7, r8, r11 - stbu r7, 1(r3) - addi r5, r5, -1 - bdnz L(storebyte) - - cmpdi r0, r10, 4 - blt L(align) - - .align 4 -L(equal): -#ifdef __LITTLE_ENDIAN__ - addi r11, r11, 8 - srd r7, r8, r11 -#else - subfic r11, r11, 64 - sld r7, r8, r11 - srdi r7, r7, 32 -#endif - stw r7, 1(r3) - addi r5, r5, -4 - addi r3, r3, 4 -L(align): - addi r3, r3, 1 - addi r4, r4, 8 /* Increment src pointer */ - subfic r10, r12, 64 - li r0, 0 - /* dst addr aligned to 8 */ -L(storedouble): - cmpdi r5, 8 - ble L(null1) - ld r7, 0(r4) /* load next dw */ - cmpb r0, r7, r0 - cmpdi r0, 0 /* check for null on each new dw */ - bne L(null) -#ifdef __LITTLE_ENDIAN__ - srd r9, r8, r10 /* bytes from first dw */ - sld r11, r7, r12 /* bytes from second dw */ -#else - sld r9, r8, r10 - srd r11, r7, r12 -#endif - or r11, r9, r11 /* make as a single dw */ - std r11, 0(r3) /* store as std on aligned addr */ - mr r8, r7 /* still few bytes left to be written */ - addi r3, r3, 8 /* increment dst addr */ - addi r4, r4, 8 /* increment src addr */ - addi r5, r5, -8 - b L(storedouble) /* Loop until NULL */ - - .align 4 - -/* We've hit the end of the string. Do the rest byte-by-byte. */ -L(null): - addi r3, r3, -1 - mr r10, r12 - mtctr r6 -#ifdef __LITTLE_ENDIAN__ - subfic r10, r10, 64 - addi r10, r10, -8 -#endif - cmpdi r0, r5, 4 - blt L(loop) - cmpdi r0, r6, 4 - blt L(loop) - - /* we can still use stw if leftover >= 4 */ -#ifdef __LITTLE_ENDIAN__ - addi r10, r10, 8 - srd r11, r8, r10 -#else - subfic r10, r10, 64 - sld r11, r8, r10 - srdi r11, r11, 32 -#endif - stw r11, 1(r3) - addi r5, r5, -4 - addi r3, r3, 4 - cmpdi r0, r5, 0 - beq L(g1) - cmpdi r0, r6, 4 - beq L(bytebybyte1) - addi r10, r10, 32 -#ifdef __LITTLE_ENDIAN__ - addi r10, r10, -8 -#else - subfic r10, r10, 64 -#endif - addi r0, r6, -4 - mtctr r0 - /* remaining byte by byte part of first dw */ -L(loop): -#ifdef __LITTLE_ENDIAN__ - addi r10, r10, 8 -#else - addi r10, r10, -8 -#endif - srd r0, r8, r10 - stbu r0, 1(r3) - addi r5, r5, -1 - cmpdi r0, r5, 0 - beq L(g1) - bdnz L(loop) -L(bytebybyte1): - addi r3, r3, 1 - /* remaining byte by byte part of second dw */ -L(bytebybyte): - addi r3, r3, -8 - addi r4, r4, -1 - -#ifdef __LITTLE_ENDIAN__ - extrdi. r0, r7, 8, 56 - stbu r7, 8(r3) - addi r5, r5, -1 - beq L(g2) - cmpdi r5, 0 - beq L(g1) - extrdi. r0, r7, 8, 48 - stbu r0, 1(r3) - addi r5, r5, -1 - beq L(g2) - cmpdi r5, 0 - beq L(g1) - extrdi. r0, r7, 8, 40 - stbu r0, 1(r3) - addi r5, r5, -1 - beq L(g2) - cmpdi r5, 0 - beq L(g1) - extrdi. r0, r7, 8, 32 - stbu r0, 1(r3) - addi r5, r5, -1 - beq L(g2) - cmpdi r5, 0 - beq L(g1) - extrdi. r0, r7, 8, 24 - stbu r0, 1(r3) - addi r5, r5, -1 - beq L(g2) - cmpdi r5, 0 - beq L(g1) - extrdi. r0, r7, 8, 16 - stbu r0, 1(r3) - addi r5, r5, -1 - beq L(g2) - cmpdi r5, 0 - beq L(g1) - extrdi. r0, r7, 8, 8 - stbu r0, 1(r3) - addi r5, r5, -1 - beq L(g2) - cmpdi r5, 0 - beq L(g1) - extrdi r0, r7, 8, 0 - stbu r0, 1(r3) - addi r5, r5, -1 - b L(g2) -#else - extrdi. r0, r7, 8, 0 - stbu r0, 8(r3) - addi r5, r5, -1 - beq L(g2) - cmpdi r5, 0 - beq L(g1) - extrdi. r0, r7, 8, 8 - stbu r0, 1(r3) - addi r5, r5, -1 - beq L(g2) - cmpdi r5, 0 - beq L(g1) - extrdi. r0, r7, 8, 16 - stbu r0, 1(r3) - addi r5, r5, -1 - beq L(g2) - cmpdi r5, 0 - beq L(g1) - extrdi. r0, r7, 8, 24 - stbu r0, 1(r3) - addi r5, r5, -1 - beq L(g2) - cmpdi r5, 0 - beq L(g1) - extrdi. r0, r7, 8, 32 - stbu r0, 1(r3) - addi r5, r5, -1 - beq L(g2) - cmpdi r5, 0 - beq L(g1) - extrdi. r0, r7, 8, 40 - stbu r0, 1(r3) - addi r5, r5, -1 - beq L(g2) - cmpdi r5, 0 - beq L(g1) - extrdi. r0, r7, 8, 48 - stbu r0, 1(r3) - addi r5, r5, -1 - beq L(g2) - cmpdi r5, 0 - beq L(g1) - stbu r7, 1(r3) - addi r5, r5, -1 - b L(g2) -#endif -L(g1): -#ifdef USE_AS_STPNCPY - addi r3, r3, 1 -#endif -L(g2): - addi r3, r3, 1 - mr r19, r3 - mr r8, r5 - b L(zeroFill) -L(null1): - mr r9, r3 - subf r4, r6, r4 - b L(byte_by_byte) -END(FUNC_NAME) -#ifndef USE_AS_STPNCPY -libc_hidden_builtin_def (strncpy) -#endif |