/* Optimized strcpy implementation for PowerPC64/POWER9.
   Copyright (C) 2020-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#ifdef USE_AS_STPCPY
# ifndef STPCPY
#  define FUNC_NAME __stpcpy
# else
#  define FUNC_NAME STPCPY
# endif
#else
# ifndef STRCPY
#  define FUNC_NAME strcpy
# else
#  define FUNC_NAME STRCPY
# endif
#endif  /* !USE_AS_STPCPY  */

/* Implements the function

   char * [r3] strcpy (char *dest [r3], const char *src [r4])

   or

   char * [r3] stpcpy (char *dest [r3], const char *src [r4])

   if USE_AS_STPCPY is defined.

   This implementation never reads across a page boundary, but may
   read beyond the NUL terminator.  */

/* Load 4 quadwords, merge them into one VR for speed, check for NUL bytes,
   and branch to LABEL if any are found.  */
#define CHECK_64B(offset,addr,label)		\
	lxv	32+v4,(offset+0)(addr);		\
	lxv	32+v5,(offset+16)(addr);	\
	lxv	32+v6,(offset+32)(addr);	\
	lxv	32+v7,(offset+48)(addr);	\
	vminub	v14,v4,v5;			\
	vminub	v15,v6,v7;			\
	vminub	v16,v14,v15;			\
	vcmpequb.	v0,v16,v18;		\
	beq	cr6,$+12;			\
	li	r7,offset;			\
	b	L(label);			\
	stxv	32+v4,(offset+0)(r11);		\
	stxv	32+v5,(offset+16)(r11);		\
	stxv	32+v6,(offset+32)(r11);		\
	stxv	32+v7,(offset+48)(r11)

/* Load the quadword at ADDR+OFFSET into VREG, check for NUL bytes,
   and branch to LABEL if any are found.  */
#define CHECK_16B(vreg,offset,addr,label)	\
	lxv	vreg+32,offset(addr);		\
	vcmpequb.	v15,vreg,v18;		\
	bne	cr6,L(label);

/* A NUL byte was found: store VREG2 with the length derived from VREG1.  */
#define STORE_WITH_LEN(vreg1,vreg2,reg)		\
	vctzlsbb	r8,vreg1;		\
	addi	r9,r8,1;			\
	sldi	r9,r9,56;			\
	stxvl	32+vreg2,reg,r9;

	.machine power9
ENTRY_TOCLESS (FUNC_NAME, 4)
	CALL_MCOUNT 2

	vspltisb	v18,0		/* Zeroes in v18.  */
	vspltisb	v19,-1		/* 0xFF bytes in v19.  */

	/* Next 16B-aligned address.  Prepare address for L(aligned).  */
	addi	r5,r4,16
	clrrdi	r5,r5,4
	subf	r8,r4,r5
	add	r11,r3,r8

	/* Align data and fill bytes not loaded with a non-matching char.  */
	lvx	v0,0,r4
	lvsr	v1,0,r4
	vperm	v0,v19,v0,v1

	vcmpequb.	v6,v0,v18	/* 0xff if byte is NUL, 0x00 otherwise.  */
	beq	cr6,L(no_null)

	/* There's a NUL byte.  */
	STORE_WITH_LEN(v6,v0,r3)
#ifdef USE_AS_STPCPY
	/* stpcpy returns the dest address plus the size, not counting the
	   final '\0'.  */
	add	r3,r3,r8
#endif
	blr

L(no_null):
	sldi	r10,r8,56	/* stxvl wants the size in the top 8 bits.  */
	stxvl	32+v0,r3,r10	/* Partial store.  */

	/* The main loop is optimized for longer strings (> 512 bytes),
	   so checking the first bytes in 16B chunks benefits shorter
	   strings a lot.  */
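/* As an illustration of the NUL detection used below: CHECK_16B compares one
   quadword against the zero vector in v18 with vcmpequb., while CHECK_64B
   first folds its four quadwords together with vminub so that a single
   compare covers 64 bytes; the byte-wise unsigned minimum of the four chunks
   has a zero lane exactly when one of the 64 bytes is zero.  A rough C sketch
   of that reduction, using a hypothetical helper name and no vector
   intrinsics:

     static int block64_has_nul (const unsigned char *p)
     {
       for (int i = 0; i < 16; i++)
	 {
	   unsigned char m = p[i];
	   if (p[i + 16] < m) m = p[i + 16];
	   if (p[i + 32] < m) m = p[i + 32];
	   if (p[i + 48] < m) m = p[i + 48];
	   if (m == 0)
	     return 1;
	 }
       return 0;
     }

   The assembly performs the same test in parallel and, on a hit, branches to
   the tail code, which uses vctzlsbb to get the byte index of the first NUL
   and stxvl to store only index+1 bytes.  */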
	.p2align 4
L(aligned):
	CHECK_16B(v0,0,r5,tail1)
	CHECK_16B(v1,16,r5,tail2)
	CHECK_16B(v2,32,r5,tail3)
	CHECK_16B(v3,48,r5,tail4)
	CHECK_16B(v4,64,r5,tail5)
	CHECK_16B(v5,80,r5,tail6)
	CHECK_16B(v6,96,r5,tail7)
	CHECK_16B(v7,112,r5,tail8)
	stxv	32+v0,0(r11)
	stxv	32+v1,16(r11)
	stxv	32+v2,32(r11)
	stxv	32+v3,48(r11)
	stxv	32+v4,64(r11)
	stxv	32+v5,80(r11)
	stxv	32+v6,96(r11)
	stxv	32+v7,112(r11)

	addi	r11,r11,128

	CHECK_16B(v0,128,r5,tail1)
	CHECK_16B(v1,128+16,r5,tail2)
	CHECK_16B(v2,128+32,r5,tail3)
	CHECK_16B(v3,128+48,r5,tail4)
	CHECK_16B(v4,128+64,r5,tail5)
	CHECK_16B(v5,128+80,r5,tail6)
	CHECK_16B(v6,128+96,r5,tail7)
	CHECK_16B(v7,128+112,r5,tail8)
	stxv	32+v0,0(r11)
	stxv	32+v1,16(r11)
	stxv	32+v2,32(r11)
	stxv	32+v3,48(r11)
	stxv	32+v4,64(r11)
	stxv	32+v5,80(r11)
	stxv	32+v6,96(r11)
	stxv	32+v7,112(r11)

	addi	r11,r11,128

	CHECK_16B(v0,256,r5,tail1)
	CHECK_16B(v1,256+16,r5,tail2)
	CHECK_16B(v2,256+32,r5,tail3)
	CHECK_16B(v3,256+48,r5,tail4)
	CHECK_16B(v4,256+64,r5,tail5)
	CHECK_16B(v5,256+80,r5,tail6)
	CHECK_16B(v6,256+96,r5,tail7)
	CHECK_16B(v7,256+112,r5,tail8)
	stxv	32+v0,0(r11)
	stxv	32+v1,16(r11)
	stxv	32+v2,32(r11)
	stxv	32+v3,48(r11)
	stxv	32+v4,64(r11)
	stxv	32+v5,80(r11)
	stxv	32+v6,96(r11)
	stxv	32+v7,112(r11)

	addi	r11,r11,128

	CHECK_16B(v0,384,r5,tail1)
	CHECK_16B(v1,384+16,r5,tail2)
	CHECK_16B(v2,384+32,r5,tail3)
	CHECK_16B(v3,384+48,r5,tail4)
	CHECK_16B(v4,384+64,r5,tail5)
	CHECK_16B(v5,384+80,r5,tail6)
	CHECK_16B(v6,384+96,r5,tail7)
	CHECK_16B(v7,384+112,r5,tail8)
	stxv	32+v0,0(r11)
	stxv	32+v1,16(r11)
	stxv	32+v2,32(r11)
	stxv	32+v3,48(r11)
	stxv	32+v4,64(r11)
	stxv	32+v5,80(r11)
	stxv	32+v6,96(r11)
	stxv	32+v7,112(r11)

	/* Align src pointer down to a 64B boundary.  */
	addi	r5,r4,512
	clrrdi	r5,r5,6
	subf	r7,r4,r5
	add	r11,r3,r7

	/* Switch to a more aggressive approach checking 64B each time.  */
	.p2align 5
L(strcpy_loop):
	CHECK_64B(0,r5,tail_64b)
	CHECK_64B(64,r5,tail_64b)
	CHECK_64B(128,r5,tail_64b)
	CHECK_64B(192,r5,tail_64b)

	CHECK_64B(256,r5,tail_64b)
	CHECK_64B(256+64,r5,tail_64b)
	CHECK_64B(256+128,r5,tail_64b)
	CHECK_64B(256+192,r5,tail_64b)

	addi	r5,r5,512
	addi	r11,r11,512

	b	L(strcpy_loop)

	.p2align 5
L(tail_64b):
	/* OK, we found a NUL byte.  Let's look for it in the current 64-byte
	   block and mark it in its corresponding VR.  */
	add	r11,r11,r7
	vcmpequb.	v8,v4,v18
	beq	cr6,L(no_null_16B)
	/* There's a NUL byte.  */
	STORE_WITH_LEN(v8,v4,r11)
#ifdef USE_AS_STPCPY
	add	r3,r11,r8
#endif
	blr

L(no_null_16B):
	stxv	32+v4,0(r11)
	vcmpequb.	v8,v5,v18
	beq	cr6,L(no_null_32B)
	/* There's a NUL byte.  */
	addi	r11,r11,16
	STORE_WITH_LEN(v8,v5,r11)
#ifdef USE_AS_STPCPY
	add	r3,r11,r8
#endif
	blr

L(no_null_32B):
	stxv	32+v5,16(r11)
	vcmpequb.	v8,v6,v18
	beq	cr6,L(no_null_48B)
	/* There's a NUL byte.  */
	addi	r11,r11,32
	STORE_WITH_LEN(v8,v6,r11)
#ifdef USE_AS_STPCPY
	add	r3,r11,r8
#endif
	blr

L(no_null_48B):
	stxv	32+v6,32(r11)
	vcmpequb.	v8,v7,v18
	/* There's a NUL byte.  */
	addi	r11,r11,48
	STORE_WITH_LEN(v8,v7,r11)
#ifdef USE_AS_STPCPY
	add	r3,r11,r8
#endif
	blr

	.p2align 4
L(tail1):
	/* There's a NUL byte.  */
	STORE_WITH_LEN(v15,v0,r11)
#ifdef USE_AS_STPCPY
	add	r3,r11,r8
#endif
	blr

	.p2align 4
L(tail2):
	stxv	32+v0,0(r11)
	/* There's a NUL byte.  */
	addi	r11,r11,16
	STORE_WITH_LEN(v15,v1,r11)
#ifdef USE_AS_STPCPY
	add	r3,r11,r8
#endif
	blr

	.p2align 4
L(tail3):
	stxv	32+v0,0(r11)
	stxv	32+v1,16(r11)
	addi	r11,r11,32
	STORE_WITH_LEN(v15,v2,r11)
#ifdef USE_AS_STPCPY
	add	r3,r11,r8
#endif
	blr

	.p2align 4
L(tail4):
	stxv	32+v0,0(r11)
	stxv	32+v1,16(r11)
	stxv	32+v2,32(r11)
	addi	r11,r11,48
	STORE_WITH_LEN(v15,v3,r11)
#ifdef USE_AS_STPCPY
	add	r3,r11,r8
#endif
	blr

	.p2align 4
L(tail5):
	stxv	32+v0,0(r11)
	stxv	32+v1,16(r11)
	stxv	32+v2,32(r11)
	stxv	32+v3,48(r11)
	addi	r11,r11,64
	STORE_WITH_LEN(v15,v4,r11)
#ifdef USE_AS_STPCPY
	add	r3,r11,r8
#endif
	blr

	.p2align 4
L(tail6):
	stxv	32+v0,0(r11)
	stxv	32+v1,16(r11)
	stxv	32+v2,32(r11)
	stxv	32+v3,48(r11)
	stxv	32+v4,64(r11)
	addi	r11,r11,80
	STORE_WITH_LEN(v15,v5,r11)
#ifdef USE_AS_STPCPY
	add	r3,r11,r8
#endif
	blr

	.p2align 4
L(tail7):
	stxv	32+v0,0(r11)
	stxv	32+v1,16(r11)
	stxv	32+v2,32(r11)
	stxv	32+v3,48(r11)
	stxv	32+v4,64(r11)
	stxv	32+v5,80(r11)
	addi	r11,r11,96
	STORE_WITH_LEN(v15,v6,r11)
#ifdef USE_AS_STPCPY
	add	r3,r11,r8
#endif
	blr

	.p2align 4
L(tail8):
	stxv	32+v0,0(r11)
	stxv	32+v1,16(r11)
	stxv	32+v2,32(r11)
	stxv	32+v3,48(r11)
	stxv	32+v4,64(r11)
	stxv	32+v5,80(r11)
	stxv	32+v6,96(r11)
	addi	r11,r11,112
	STORE_WITH_LEN(v15,v7,r11)
#ifdef USE_AS_STPCPY
	add	r3,r11,r8
#endif
	blr

END (FUNC_NAME)
#ifndef USE_AS_STPCPY
libc_hidden_builtin_def (strcpy)
#endif