/* Copyright (C) 2014 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

/* NOTE(review): the header name on this directive was missing from the
   text under review; <sysdep.h> is assumed because this file relies on
   EALIGN, END, L(), CALL_MCOUNT, FRAME_MIN_SIZE and the rN/crN register
   names -- confirm against the build.  */
#include <sysdep.h>

/* Implements the functions

   char * [r3] strncpy (char *dst [r3], const char *src [r4], size_t n [r5])

   AND

   char * [r3] stpncpy (char *dst [r3], const char *src [r4], size_t n [r5])

   The algorithm is as follows:
   > if src and dest are 8 byte aligned, perform double word copy
     else
   > copy byte by byte on unaligned addresses.

   The aligned comparisons are made using cmpb instructions: cmpb against
   a zero register yields a non-zero mask exactly when the loaded
   doubleword contains a NUL byte.  */

/* The optimizations for performance focus on the following:
   1. data alignment [gain from aligned memory access on read/write]
   2. POWER7 gains performance with loop unrolling/unwinding
      [gain by reduction of branch penalty].
   3. The final pad with null bytes is done by calling an optimized
      memset.  */

/* Register usage, as established by the code below:
   r3       dst on entry, return value on exit; scratch inside the
            unrolled doubleword loop.
   r4       src cursor.
   r5       remaining length 'n' (brought up to date between phases).
   r9       dst cursor.
   r18      original dst, kept for the strncpy return value.
   r19      dst cursor in the byte loops; provisional 'n' (r5 - 32) in
            the unrolled doubleword loop.
   r11      number of whole doublewords in 'n' (n / 8).
   r0       LR on entry (spilled to the stack), then the number of
            doublewords still to be copied.
   r6, r7   src / dst base of the current chunk in the unrolled loop.
   r8, r10  loaded data and cmpb masks.
   r12      constant zero comparand for cmpb.
   CTR      loop counter.  */

#ifdef USE_AS_STPNCPY
# define FUNC_NAME __stpncpy
#else
# define FUNC_NAME strncpy
#endif

#define FRAMESIZE (FRAME_MIN_SIZE+32)

#ifndef MEMSET
/* For builds with no IFUNC support, local calls should be made to internal
   GLIBC symbol (created by libc_hidden_builtin_def).  */
# ifdef SHARED
#  define MEMSET __GI_memset
# else
#  define MEMSET memset
# endif
#endif

	.machine power7
EALIGN(FUNC_NAME, 4, 0)
	CALL_MCOUNT 3

	mflr r0			/* load link register LR to r0 */
	or r10, r3, r4		/* combine src and dst to test both at once */
	rldicl. r8, r10, 0, 61	/* r8 = low 3 bits; zero iff both are
				   doubleword aligned (sets cr0) */
	std r19, -8(r1)		/* save caller's register r19 */
	std r18, -16(r1)	/* save caller's register r18 */
	std r0, 16(r1)		/* store the link register */
	stdu r1, -FRAMESIZE(r1)	/* create the stack frame */

	mr r9, r3		/* r9 = running dst pointer */
	mr r18, r3		/* keep dst for the strncpy return value */
	bne cr0, L(byte_by_byte)	/* unaligned: copy bytewise */

	/* Aligned path.  r8 is zero here (result of rldicl. above), so it
	   can serve as the zero comparand of the first cmpb.  */
	srdi r11, r5, 3		/* compute count for CTR ; count = n/8 */
	cmpldi cr7, r11, 3	/* unroll only if count > 3 (4 or more
				   doublewords) */
	ble cr7, L(update1)

	ld r10, 0(r4)		/* load doubleword from src */
	cmpb r8, r10, r8	/* look for a NUL in what we just read */
	cmpdi cr7, r8, 0	/* mask is zero: no NUL, continue */
	bne cr7, L(update3)

	std r10, 0(r3)		/* copy doubleword at offset=0 */
	ld r10, 8(r4)		/* load next doubleword from offset=8 */
	cmpb r8, r10, r8	/* r8 is still zero; look for a NUL */
	cmpdi cr7, r8, 0	/* mask is zero: no NUL, continue */
	bne cr7, L(HopBy8)

	/* CTR = ((count - 4) / 4) + 1 passes over 32-byte chunks.  */
	addi r8, r11, -4
	mr r7, r3		/* r7 = dst base of the current chunk */
	srdi r8, r8, 2
	mr r6, r4		/* r6 = src base of the current chunk */
	addi r8, r8, 1
	li r12, 0		/* zero comparand for cmpb */
	mtctr r8
	b L(dwordCopy)

	.p2align 4
L(dWordUnroll):
	/* Reached only with r10 == 0 (no NUL at offset 16), so r10 is
	   reused as the zero comparand below.  */
	std r8, 16(r9)		/* copy dword at offset=16 */
	ld r8, 24(r4)		/* load dword, perform loop unrolling again */
	cmpb r10, r8, r10
	cmpdi cr7, r10, 0
	bne cr7, L(HopBy24)

	std r8, 24(r7)		/* copy dword at offset=24 */
	addi r9, r9, 32		/* advance dst past this chunk */
	addi r4, r4, 32		/* advance src past this chunk */
	bdz L(leftDwords)	/* CTR exhausted: leave the unrolled loop */

	/* Start the next chunk: offsets 32 and 40 relative to the old
	   bases r6/r7.  r10 and then r8 are known to be zero here.  */
	ld r3, 32(r6)
	cmpb r8, r3, r10
	cmpdi cr7, r8, 0
	bne cr7, L(update2)

	std r3, 32(r7)
	ld r10, 40(r6)
	cmpb r8, r10, r8
	cmpdi cr7, r8, 0
	bne cr7, L(HopBy40)

	mr r6, r4		/* commit the provisional values */
	mr r7, r9
	mr r11, r0
	mr r5, r19

L(dwordCopy):
	std r10, 8(r9)		/* copy dword at offset=8 */
	addi r19, r5, -32	/* provisional 'n' once the chunk is done */
	addi r0, r11, -4	/* provisional dword count likewise */
	ld r8, 16(r4)
	cmpb r10, r8, r12
	cmpdi cr7, r10, 0
	beq cr7, L(dWordUnroll)

	/* NUL found at offset 16: account for the two dwords copied.  */
	addi r9, r9, 16		/* increment dst by 16 */
	addi r4, r4, 16		/* increment src by 16 */
	addi r5, r5, -16	/* decrement length 'n' by 16 */
	addi r0, r11, -2	/* decrement loop counter */

L(dWordUnrollOFF):
	/* Non-unrolled doubleword loop; r0 holds the dwords left.  */
	ld r10, 0(r4)		/* load first dword */
	li r8, 0		/* zero comparand */
	cmpb r8, r10, r8
	cmpdi cr7, r8, 0
	bne cr7, L(byte_by_byte)	/* NUL inside: finish bytewise */
	mtctr r0
	li r7, 0		/* zero comparand for the loop */
	b L(CopyDword)

	.p2align 4
L(loadDWordandCompare):
	ld r10, 0(r4)
	cmpb r8, r10, r7
	cmpdi cr7, r8, 0
	bne cr7, L(byte_by_byte)

L(CopyDword):
	addi r9, r9, 8
	std r10, -8(r9)		/* store the NUL-free dword */
	addi r4, r4, 8
	addi r5, r5, -8
	bdnz L(loadDWordandCompare)

L(byte_by_byte):
	/* Bytewise copy, four bytes per CTR iteration.  r5 is not
	   adjusted inside the loop; the distance covered is derived
	   afterwards from r19 - r9.  */
	cmpldi cr7, r5, 3
	ble cr7, L(verifyByte)	/* fewer than 4 bytes left */
	srdi r10, r5, 2
	mr r19, r9		/* r19 = byte-loop dst cursor */
	mtctr r10		/* CTR = n / 4 */
	b L(firstByteUnroll)

	.p2align 4
L(bytes_unroll):
	lbz r10, 1(r4)		/* load byte from src */
	cmpdi cr7, r10, 0	/* compare for NUL */
	stb r10, 1(r19)		/* store byte to dst */
	beq cr7, L(updtDestComputeN2ndByte)

	addi r4, r4, 4		/* advance src */

	lbz r10, -2(r4)		/* third byte of the group */
	cmpdi cr7, r10, 0
	stb r10, 2(r19)
	beq cr7, L(updtDestComputeN3rdByte)

	lbz r10, -1(r4)		/* fourth byte of the group */
	addi r19, r19, 4
	cmpdi cr7, r10, 0
	stb r10, -1(r19)
	beq cr7, L(ComputeNByte)

	bdz L(update0)

L(firstByteUnroll):
	lbz r10, 0(r4)		/* first byte of the group */
	cmpdi cr7, r10, 0
	stb r10, 0(r19)
	bne cr7, L(bytes_unroll)
	addi r19, r19, 1	/* step past the NUL just stored */

L(ComputeNByte):
	subf r9, r19, r9	/* r9 = -(bytes written by the byte loop) */
	add r8, r9, r5		/* r8 = bytes left to zero-fill */

L(zeroFill):
	cmpdi cr7, r8, 0	/* nothing left to pad? */
	beq cr7, L(update3return)

	mr r3, r19		/* start of the area to pad */
	li r4, 0		/* fill value is zero */
	mr r5, r8		/* how many bytes to fill buffer with */
	bl MEMSET		/* call optimized memset */
	nop

L(update3return):
#ifdef USE_AS_STPNCPY
	addi r3, r19, -1	/* r19 is one past the stored NUL */
#endif

L(hop2return):
#ifndef USE_AS_STPNCPY
	mr r3, r18		/* set return value */
#endif
	addi r1, r1, FRAMESIZE	/* restore stack pointer */
	ld r0, 16(r1)		/* read the saved link register */
	ld r18, -16(r1)		/* restore caller's register r18 */
	ld r19, -8(r1)		/* restore caller's register r19 */
	mtlr r0			/* restore the link register */
	blr			/* return */

	.p2align 4
L(update0):
	mr r9, r19		/* byte loop ran out: resync dst cursor */

	.p2align 4
L(verifyByte):
	/* Copy the last n % 4 bytes one at a time.  */
	rldicl. r8, r5, 0, 62	/* r8 = r5 & 3 (sets cr0) */
#ifdef USE_AS_STPNCPY
	mr r3, r9		/* return value if nothing is left */
#endif
	beq cr0, L(hop2return)
	mtctr r8
	addi r4, r4, -1		/* pre-bias src for lbzu */
	mr r19, r9
	b L(oneBYone)

	.p2align 4
L(proceed):
	bdz L(done)

L(oneBYone):
	lbzu r10, 1(r4)		/* copy byte */
	addi r19, r19, 1
	addi r8, r8, -1		/* one byte fewer left to pad */
	cmpdi cr7, r10, 0
	stb r10, -1(r19)
	bne cr7, L(proceed)
	b L(zeroFill)

	.p2align 4
L(done):
	addi r1, r1, FRAMESIZE	/* restore stack pointer */
#ifdef USE_AS_STPNCPY
	mr r3, r19		/* set the return value */
#else
	mr r3, r18		/* set the return value */
#endif
	ld r0, 16(r1)		/* read the saved link register */
	ld r18, -16(r1)		/* restore caller's register r18 */
	ld r19, -8(r1)		/* restore caller's register r19 */
	mtlr r0			/* restore the link register */
	blr			/* return */

L(update1):
	/* Fewer than four doublewords: skip the unrolled loop.  */
	mr r0, r11
	mr r19, r5

	.p2align 4
L(leftDwords):
	cmpdi cr7, r0, 0	/* any whole doublewords left? */
	mr r5, r19		/* commit the provisional 'n' */
	bne cr7, L(dWordUnrollOFF)
	b L(byte_by_byte)

	.p2align 4
L(updtDestComputeN2ndByte):
	addi r19, r19, 2	/* update dst by 2 */
	subf r9, r19, r9	/* compute distance covered */
	add r8, r9, r5
	b L(zeroFill)

	.p2align 4
L(updtDestComputeN3rdByte):
	addi r19, r19, 3	/* update dst by 3 */
	subf r9, r19, r9	/* compute distance covered */
	add r8, r9, r5
	b L(zeroFill)

	.p2align 4
L(HopBy24):
	addi r9, r9, 24		/* increment dst by 24 */
	addi r4, r4, 24		/* increment src by 24 */
	addi r5, r5, -24	/* decrement length 'n' by 24 */
	addi r0, r11, -3	/* decrement loop counter */
	b L(dWordUnrollOFF)

	.p2align 4
L(update2):
	mr r5, r19		/* commit the provisional 'n' */
	b L(dWordUnrollOFF)

	.p2align 4
L(HopBy40):
	addi r9, r7, 40		/* increment dst by 40 */
	addi r4, r6, 40		/* increment src by 40 */
	addi r5, r5, -40	/* decrement length 'n' by 40 */
	addi r0, r11, -5	/* decrement loop counter */
	b L(dWordUnrollOFF)

L(update3):
	mr r0, r11
	b L(dWordUnrollOFF)

L(HopBy8):
	addi r9, r3, 8		/* increment dst by 8 */
	addi r4, r4, 8		/* increment src by 8 */
	addi r5, r5, -8		/* decrement length 'n' by 8 */
	addi r0, r11, -1	/* decrement loop counter */
	b L(dWordUnrollOFF)
END(FUNC_NAME)
#ifndef USE_AS_STPNCPY
libc_hidden_builtin_def (strncpy)
#endif