/* Optimized memset aligned implementation using basic LoongArch instructions.
   Copyright (C) 2023 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include <sys/regdef.h>
#include <sys/asm.h>

#if IS_IN (libc)
# define MEMSET_NAME __memset_aligned
#else
# define MEMSET_NAME memset
#endif

/* void *MEMSET_NAME (void *dst, int c, size_t n)

   Fill N bytes starting at DST with the low byte of C using only
   naturally aligned stores.

   In:   a0 = dst, a1 = c (only bits 7:0 are significant), a2 = n.
   Out:  a0 = dst; a0 is never written, all stores go through t0
	 or use a0 as an unmodified base.
   Registers written: t0, t1, t2, t3, t6, t8, a1, a2, a3, a4, a5.
   The stack is not used.

   Strategy:
     - dst unaligned and n < 16: computed jump into a table of byte
       stores (L(short_data)).
     - dst unaligned and n >= 16: computed jump into a table of byte
       stores that brings the pointer to an 8-byte boundary
       (L(make_align)), then continue as the aligned case.
     - dst 8-byte aligned: replicate the fill byte across a1, store
       64 bytes per loop iteration, then finish the tail with
       32/16/8/4/2/1-byte steps.

   Both computed jumps depend on the exact number of instructions
   between the pcaddi and the end of its table; see the notes at each
   site before inserting or removing instructions there.

   NOTE(review): the second LEAF argument (6) is presumably the log2
   entry alignment; confirm against the LEAF macro definition.  */

LEAF(MEMSET_NAME, 6)
    move        t0, a0              /* t0 = running store pointer.  */
    andi        a3, a0, 0x7         /* a3 = dst & 7 (misalignment).  */
    li.w        t6, 16              /* t6 = 16, reused as a threshold.  */
    beqz        a3, L(align)        /* Already 8-byte aligned.  */
    bltu        a2, t6, L(short_data)   /* Unaligned and n < 16.  */

/* Here dst is unaligned and n >= 16.  Store t2 = 8 - a3 (1..7) bytes
   so that t0 becomes 8-byte aligned.

   pcaddi yields the address of the pcaddi itself plus 11 instructions,
   which is L(al0): pcaddi, slli.d, sub.d, jr (4) + 7 table entries.
   Each table entry is exactly one 4-byte instruction, so subtracting
   t2 * 4 lands on L(al<t2>), which falls through t2 byte stores.  */
L(make_align):
    li.w        t8, 8
    sub.d       t2, t8, a3          /* t2 = bytes up to the boundary.  */
    pcaddi      t1, 11              /* t1 = &L(al0).  */
    slli.d      t3, t2, 2           /* t3 = t2 * 4 (bytes of code).  */
    sub.d       t1, t1, t3          /* t1 = &L(al<t2>).  */
    jr          t1

L(al7):
    st.b        a1, t0, 6
L(al6):
    st.b        a1, t0, 5
L(al5):
    st.b        a1, t0, 4
L(al4):
    st.b        a1, t0, 3
L(al3):
    st.b        a1, t0, 2
L(al2):
    st.b        a1, t0, 1
L(al1):
    st.b        a1, t0, 0
L(al0):
    add.d       t0, t0, t2          /* t0 is now 8-byte aligned.  */
    sub.d       a2, a2, t2          /* a2 = bytes still to store.  */

/* t0 is 8-byte aligned, a2 = remaining byte count (any value when
   entered directly from the top).  */
L(align):
    /* Replicate a1[7:0] into every byte of a1; any upper bits passed
       in by the caller are overwritten.  */
    bstrins.d   a1, a1, 15, 8       /* a1[15:8]  = a1[7:0].  */
    bstrins.d   a1, a1, 31, 16      /* a1[31:16] = a1[15:0].  */
    bstrins.d   a1, a1, 63, 32      /* a1[63:32] = a1[31:0].  */
    bltu        a2, t6, L(less_16bytes)

    andi        a4, a2, 0x3f        /* a4 = a2 % 64 (tail length).  */
    beq         a4, a2, L(less_64bytes) /* a2 < 64: skip the loop.  */

    sub.d       t1, a2, a4          /* t1 = bytes covered by the loop.  */
    move        a2, a4              /* a2 = tail length, 0..63.  */
    add.d       a5, t0, t1          /* a5 = loop end address.  */

/* Store 64 bytes per iteration.  The pointer is advanced first and
   the stores use negative offsets from the updated pointer.  */
L(loop_64bytes):
    addi.d      t0, t0, 64
    st.d        a1, t0, -64
    st.d        a1, t0, -56
    st.d        a1, t0, -48
    st.d        a1, t0, -40
    st.d        a1, t0, -32
    st.d        a1, t0, -24
    st.d        a1, t0, -16
    st.d        a1, t0, -8
    bne         t0, a5, L(loop_64bytes)

/* a2 < 64 here.  */
L(less_64bytes):
    srai.d      a4, a2, 5           /* Non-zero iff a2 >= 32.  */
    beqz        a4, L(less_32bytes)
    addi.d      a2, a2, -32
    st.d        a1, t0, 0
    st.d        a1, t0, 8
    st.d        a1, t0, 16
    st.d        a1, t0, 24
    addi.d      t0, t0, 32

/* a2 < 32 here.  */
L(less_32bytes):
    bltu        a2, t6, L(less_16bytes)
    addi.d      a2, a2, -16
    st.d        a1, t0, 0
    st.d        a1, t0, 8
    addi.d      t0, t0, 16

/* a2 < 16 here; t0 is still 8-byte aligned.  */
L(less_16bytes):
    srai.d      a4, a2, 3           /* Non-zero iff a2 >= 8.  */
    beqz        a4, L(less_8bytes)
    addi.d      a2, a2, -8
    st.d        a1, t0, 0
    addi.d      t0, t0, 8

/* a2 < 8 here.  */
L(less_8bytes):
    beqz        a2, L(less_1byte)   /* Nothing left.  */
    srai.d      a4, a2, 2           /* Non-zero iff a2 >= 4.  */
    beqz        a4, L(less_4bytes)
    addi.d      a2, a2, -4
    st.w        a1, t0, 0
    addi.d      t0, t0, 4

/* a2 < 4 here.  */
L(less_4bytes):
    srai.d      a3, a2, 1           /* Non-zero iff a2 >= 2.  */
    beqz        a3, L(less_2bytes)
    addi.d      a2, a2, -2
    st.h        a1, t0, 0
    addi.d      t0, t0, 2

/* a2 is 0 or 1 here.  */
L(less_2bytes):
    beqz        a2, L(less_1byte)
    st.b        a1, t0, 0
L(less_1byte):
    jr          ra

/* dst is unaligned and n (a2) is 0..15: store byte by byte.

   pcaddi yields the address of the pcaddi itself plus 19 instructions,
   which is L(short_0): pcaddi, slli.d, sub.d, jr (4) + 15 table
   entries.  Each table entry is exactly one 4-byte instruction, so
   subtracting a2 * 4 lands on L(short_<a2>), which falls through a2
   byte stores and then returns.  */
L(short_data):
    pcaddi      t1, 19              /* t1 = &L(short_0).  */
    slli.d      t3, a2, 2           /* t3 = a2 * 4 (bytes of code).  */
    sub.d       t1, t1, t3          /* t1 = &L(short_<a2>).  */
    jr          t1
L(short_15):
    st.b        a1, a0, 14
L(short_14):
    st.b        a1, a0, 13
L(short_13):
    st.b        a1, a0, 12
L(short_12):
    st.b        a1, a0, 11
L(short_11):
    st.b        a1, a0, 10
L(short_10):
    st.b        a1, a0, 9
L(short_9):
    st.b        a1, a0, 8
L(short_8):
    st.b        a1, a0, 7
L(short_7):
    st.b        a1, a0, 6
L(short_6):
    st.b        a1, a0, 5
L(short_5):
    st.b        a1, a0, 4
L(short_4):
    st.b        a1, a0, 3
L(short_3):
    st.b        a1, a0, 2
L(short_2):
    st.b        a1, a0, 1
L(short_1):
    st.b        a1, a0, 0
L(short_0):
    jr          ra
END(MEMSET_NAME)

libc_hidden_builtin_def (MEMSET_NAME)