From d626a24f235dbd4c446b241211a9a264a1eedb9e Mon Sep 17 00:00:00 2001 From: Stefan Liebler Date: Wed, 26 Aug 2015 10:26:21 +0200 Subject: S390: Optimize strcat and wcscat. This patch provides optimized versions of strcat and wcscat with the z13 vector instructions. ChangeLog: * sysdeps/s390/multiarch/strcat-c.c: New File. * sysdeps/s390/multiarch/strcat-vx.S: Likewise. * sysdeps/s390/multiarch/strcat.c: Likewise. * sysdeps/s390/multiarch/wcscat-c.c: Likewise. * sysdeps/s390/multiarch/wcscat-vx.S: Likewise. * sysdeps/s390/multiarch/wcscat.c: Likewise. * sysdeps/s390/multiarch/Makefile (sysdep_routines): Add strcat and wcscat functions. * sysdeps/s390/multiarch/ifunc-impl-list.c (__libc_ifunc_impl_list): Add ifunc test for strcat, wcscat. * string/strcat.c (STRCAT): Define and use macro. * wcsmbs/wcscat.c: Use WCSCAT if defined. * string/test-strcat.c: Add wcscat support. * wcsmbs/test-wcscat.c: New File. * wcsmbs/Makefile (strop-tests): Add wcscat. * benchtests/bench-strcat.c: Add wcscat support. * benchtests/bench-wcscat.c: New File. * benchtests/Makefile (wcsmbs-bench): Add wcscat. --- sysdeps/s390/multiarch/Makefile | 6 +- sysdeps/s390/multiarch/ifunc-impl-list.c | 3 + sysdeps/s390/multiarch/strcat-c.c | 28 +++++ sysdeps/s390/multiarch/strcat-vx.S | 161 ++++++++++++++++++++++++++++ sysdeps/s390/multiarch/strcat.c | 27 +++++ sysdeps/s390/multiarch/wcscat-c.c | 25 +++++ sysdeps/s390/multiarch/wcscat-vx.S | 175 +++++++++++++++++++++++++++++++ sysdeps/s390/multiarch/wcscat.c | 28 +++++ 8 files changed, 451 insertions(+), 2 deletions(-) create mode 100644 sysdeps/s390/multiarch/strcat-c.c create mode 100644 sysdeps/s390/multiarch/strcat-vx.S create mode 100644 sysdeps/s390/multiarch/strcat.c create mode 100644 sysdeps/s390/multiarch/wcscat-c.c create mode 100644 sysdeps/s390/multiarch/wcscat-vx.S create mode 100644 sysdeps/s390/multiarch/wcscat.c (limited to 'sysdeps') diff --git a/sysdeps/s390/multiarch/Makefile b/sysdeps/s390/multiarch/Makefile index 98b588f1f2..6283999faa 100644 --- a/sysdeps/s390/multiarch/Makefile +++ b/sysdeps/s390/multiarch/Makefile @@ -4,7 +4,8 @@ sysdep_routines += strlen strlen-vx strlen-c \ strcpy strcpy-vx \ stpcpy stpcpy-vx stpcpy-c \ strncpy strncpy-vx \ - stpncpy stpncpy-vx stpncpy-c + stpncpy stpncpy-vx stpncpy-c \ + strcat strcat-vx strcat-c endif ifeq ($(subdir),wcsmbs) @@ -13,5 +14,6 @@ sysdep_routines += wcslen wcslen-vx wcslen-c \ wcscpy wcscpy-vx wcscpy-c \ wcpcpy wcpcpy-vx wcpcpy-c \ wcsncpy wcsncpy-vx wcsncpy-c \ - wcpncpy wcpncpy-vx wcpncpy-c + wcpncpy wcpncpy-vx wcpncpy-c \ + wcscat wcscat-vx wcscat-c endif diff --git a/sysdeps/s390/multiarch/ifunc-impl-list.c b/sysdeps/s390/multiarch/ifunc-impl-list.c index ca69983466..ccf4dea198 100644 --- a/sysdeps/s390/multiarch/ifunc-impl-list.c +++ b/sysdeps/s390/multiarch/ifunc-impl-list.c @@ -97,6 +97,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_VX_IMPL (stpncpy); IFUNC_VX_IMPL (wcpncpy); + IFUNC_VX_IMPL (strcat); + IFUNC_VX_IMPL (wcscat); + #endif /* HAVE_S390_VX_ASM_SUPPORT */ return i; diff --git a/sysdeps/s390/multiarch/strcat-c.c b/sysdeps/s390/multiarch/strcat-c.c new file mode 100644 index 0000000000..51c12e375f --- /dev/null +++ b/sysdeps/s390/multiarch/strcat-c.c @@ -0,0 +1,28 @@ +/* Default strcat implementation for S/390. + Copyright (C) 2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#if defined HAVE_S390_VX_ASM_SUPPORT && IS_IN (libc) +# define STRCAT __strcat_c +# ifdef SHARED +# undef libc_hidden_builtin_def +# define libc_hidden_builtin_def(name) \ + __hidden_ver1 (__strcat_c, __GI_strcat, __strcat_c); +# endif /* SHARED */ + +# include +#endif /* HAVE_S390_VX_ASM_SUPPORT && IS_IN (libc) */ diff --git a/sysdeps/s390/multiarch/strcat-vx.S b/sysdeps/s390/multiarch/strcat-vx.S new file mode 100644 index 0000000000..ba553bec26 --- /dev/null +++ b/sysdeps/s390/multiarch/strcat-vx.S @@ -0,0 +1,161 @@ +/* Vector optimized 32/64 bit S/390 version of strcat. + Copyright (C) 2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#if defined HAVE_S390_VX_ASM_SUPPORT && IS_IN (libc) + +# include "sysdep.h" +# include "asm-syntax.h" + + .text + +/* char * strcat (const char *dest, const char *src) + Concatenate two strings. + + Register usage: + -r0=saved dest pointer for return + -r1=tmp + -r2=dest + -r3=src + -r4=tmp + -r5=current_len + -v16=part of src + -v17=index of zero + -v18=part of src +*/ +ENTRY(__strcat_vx) + .machine "z13" + .machinemode "zarch_nohighgprs" + + lgr %r0,%r2 /* Save destination pointer for return. */ + + /* STRLEN + r1 = loaded bytes (tmp) + r4 = zero byte index (tmp) + r2 = dst + */ + vlbb %v16,0(%r2),6 /* Load s until next 4k-byte boundary. */ + lcbb %r1,0(%r2),6 /* Get bytes to 4k-byte boundary or 16. */ + + vfenezb %v16,%v16,%v16 /* Find element not equal with zero search. */ + vlgvb %r5,%v16,7 /* Load zero index or 16 if not found. */ + clrjl %r5,%r1,.Llen_end /* Found zero within loaded bytes, end. */ + + /* Align s to 16 byte. */ + risbgn %r1,%r2,60,128+63,0 /* %r3 = bits 60-63 of %r2 'and' 15. */ + lghi %r5,16 /* current_len = 16. */ + slr %r5,%r1 /* Compute bytes to 16bytes boundary. */ + + /* Find zero in 16byte aligned loop. */ +.Llen_loop: + vl %v16,0(%r5,%r2) /* Load s. */ + vfenezbs %v16,%v16,%v16 /* Find element not equal with zero search. */ + je .Llen_found /* Jump away if zero was found. */ + vl %v16,16(%r5,%r2) + vfenezbs %v16,%v16,%v16 + je .Llen_found16 + vl %v16,32(%r5,%r2) + vfenezbs %v16,%v16,%v16 + je .Llen_found32 + vl %v16,48(%r5,%r2) + vfenezbs %v16,%v16,%v16 + je .Llen_found48 + + aghi %r5,64 + j .Llen_loop /* No zero -> loop. */ + +.Llen_found48: + aghi %r5,16 +.Llen_found32: + aghi %r5,16 +.Llen_found16: + aghi %r5,16 +.Llen_found: + vlgvb %r4,%v16,7 /* Load byte index of zero. */ + algr %r5,%r4 + +.Llen_end: + /* STRCPY + %r1 = loaded bytes (tmp) + %r4 = zero byte index (tmp) + %r3 = curr src pointer + %r2 = curr dst pointer + */ + la %r2,0(%r5,%r2) /* strcpy at end of dst-string. */ + + vlbb %v16,0(%r3),6 /* Load s until next 4k-byte boundary. */ + lcbb %r1,0(%r3),6 /* Get bytes to 4k-byte boundary or 16. */ + + vfenezb %v17,%v16,%v16 /* Find element not equal with zero search. */ + vlgvb %r5,%v17,7 /* Load zero index or 16 if not found. */ + clrjl %r5,%r1,.Lcpy_found_align /* If found zero within loaded bytes, + copy bytes before and return. */ + + /* Align s to 16 byte. */ + risbgn %r4,%r3,60,128+63,0 /* %r3 = bits 60-63 of %r2 'and' 15. */ + lghi %r5,15 /* current_len = 15. */ + slr %r5,%r4 /* Compute highest index to 16byte boundary. */ + + vstl %v16,%r5,0(%r2) /* Copy loaded characters - no zero. */ + ahi %r5,1 /* Start loop at next character. */ + + /* Find zero in 16byte aligned loop. */ +.Lcpy_loop: + vl %v16,0(%r5,%r3) /* Load s. */ + vfenezbs %v17,%v16,%v16 /* Find element not equal with zero search. */ + je .Lcpy_found_v16_0 /* Jump away if zero was found. */ + vl %v18,16(%r5,%r3)/* Load next part of s. */ + vst %v16,0(%r5,%r2) /* Store previous part without zero to dst. */ + vfenezbs %v17,%v18,%v18 + je .Lcpy_found_v18_16 + vl %v16,32(%r5,%r3) + vst %v18,16(%r5,%r2) + vfenezbs %v17,%v16,%v16 + je .Lcpy_found_v16_32 + vl %v18,48(%r5,%r3) + vst %v16,32(%r5,%r2) + vfenezbs %v17,%v18,%v18 + je .Lcpy_found_v18_48 + vst %v18,48(%r5,%r2) + + aghi %r5,64 + j .Lcpy_loop /* No zero -> loop. */ + +.Lcpy_found_v16_32: + aghi %r5,32 +.Lcpy_found_v16_0: + la %r4,0(%r5,%r2) + vlgvb %r1,%v17,7 /* Load byte index of zero. */ + vstl %v16,%r1,0(%r4) /* Copy characters including zero. */ + lgr %r2,%r0 /* Load saved dest-ptr. */ + br %r14 + +.Lcpy_found_v18_48: + aghi %r5,32 +.Lcpy_found_v18_16: + la %r4,16(%r5,%r2) + vlgvb %r1,%v17,7 /* Load byte index of zero. */ + vstl %v18,%r1,0(%r4) /* Copy characters including zero. */ + lgr %r2,%r0 /* Load saved dest-ptr. */ + br %r14 + +.Lcpy_found_align: + vstl %v16,%r5,0(%r2) /* Copy characters including zero. */ + lgr %r2,%r0 /* Load saved dest-ptr. */ + br %r14 +END(__strcat_vx) +#endif /* HAVE_S390_VX_ASM_SUPPORT && IS_IN (libc) */ diff --git a/sysdeps/s390/multiarch/strcat.c b/sysdeps/s390/multiarch/strcat.c new file mode 100644 index 0000000000..e518eb3eb7 --- /dev/null +++ b/sysdeps/s390/multiarch/strcat.c @@ -0,0 +1,27 @@ +/* Multiple versions of strcat. + Copyright (C) 2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#if defined HAVE_S390_VX_ASM_SUPPORT && IS_IN (libc) +# include +# include + +s390_vx_libc_ifunc2 (__strcat, strcat) + +#else +# include +#endif /* !(defined HAVE_S390_VX_ASM_SUPPORT && IS_IN (libc)) */ diff --git a/sysdeps/s390/multiarch/wcscat-c.c b/sysdeps/s390/multiarch/wcscat-c.c new file mode 100644 index 0000000000..d86b8a3fb5 --- /dev/null +++ b/sysdeps/s390/multiarch/wcscat-c.c @@ -0,0 +1,25 @@ +/* Default wcscat implementation for S/390. + Copyright (C) 2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#if defined HAVE_S390_VX_ASM_SUPPORT && IS_IN (libc) +# define WCSCAT __wcscat_c + +# include +extern __typeof (__wcscat) __wcscat_c; +# include +#endif diff --git a/sysdeps/s390/multiarch/wcscat-vx.S b/sysdeps/s390/multiarch/wcscat-vx.S new file mode 100644 index 0000000000..71f7b1a9cf --- /dev/null +++ b/sysdeps/s390/multiarch/wcscat-vx.S @@ -0,0 +1,175 @@ +/* Vector optimized 32/64 bit S/390 version of wcscat. + Copyright (C) 2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#if defined HAVE_S390_VX_ASM_SUPPORT && IS_IN (libc) + +# include "sysdep.h" +# include "asm-syntax.h" + + .text + +/* wchar_t * wcscat (wchar_t *dest, const wchar_t *src) + Concatenate two strings. + + Register usage: + -r0=saved dest pointer for return + -r1=tmp + -r2=dest + -r3=src + -r4=tmp + -r5=current_len + -v16=part of src + -v17=index of zero + -v18=part of src +*/ +ENTRY(__wcscat_vx) + .machine "z13" + .machinemode "zarch_nohighgprs" + + vlbb %v16,0(%r2),6 /* Load s until next 4k-byte boundary. */ + lcbb %r1,0(%r2),6 /* Get bytes to 4k-byte boundary or 16. */ + + /* __wcslen_c can handle non 4byte aligned pointers, + but __wcscpy_c not. Thus if either src or dest is + not 4byte aligned, use __wcscat_c. */ + tmll %r2,3 /* Test if s is 4-byte aligned? */ + jne .Lfallback /* And use common-code variant if not. */ + tmll %r3,3 /* Test if src is 4-byte aligned? */ + jne .Lfallback /* And use common-code variant if not. */ + + lgr %r0,%r2 /* Save destination pointer for return. */ + + /* WCSLEN + r1 = loaded bytes (tmp) + r4 = zero byte index (tmp) + r2 = dst + */ + + vfenezf %v16,%v16,%v16 /* Find element not equal with zero search. */ + vlgvb %r5,%v16,7 /* Load zero index or 16 if not found. */ + clrjl %r5,%r1,.Llen_end /* Found zero within loaded bytes, end. */ + + /* Align s to 16 byte. */ + risbgn %r1,%r2,60,128+63,0 /* %r3 = bits 60-63 of %r2 'and' 15. */ + lghi %r5,16 /* current_len = 16. */ + slr %r5,%r1 /* Compute bytes to 16bytes boundary. */ + + /* Find zero in 16byte aligned loop. */ +.Llen_loop: + vl %v16,0(%r5,%r2) /* Load s. */ + vfenezfs %v16,%v16,%v16 /* Find element not equal with zero search. */ + je .Llen_found /* Jump away if zero was found. */ + vl %v16,16(%r5,%r2) + vfenezfs %v16,%v16,%v16 + je .Llen_found16 + vl %v16,32(%r5,%r2) + vfenezfs %v16,%v16,%v16 + je .Llen_found32 + vl %v16,48(%r5,%r2) + vfenezfs %v16,%v16,%v16 + je .Llen_found48 + + aghi %r5,64 + j .Llen_loop /* No zero -> loop. */ + +.Llen_found48: + aghi %r5,16 +.Llen_found32: + aghi %r5,16 +.Llen_found16: + aghi %r5,16 +.Llen_found: + vlgvb %r4,%v16,7 /* Load byte index of zero. */ + algr %r5,%r4 + +.Llen_end: + /* WCSCPY + %r1 = loaded bytes (tmp) + %r4 = zero byte index (tmp) + %r3 = curr src pointer + %r2 = curr dst pointer + */ + la %r2,0(%r5,%r2) /* strcpy at end of dst-string. */ + + vlbb %v16,0(%r3),6 /* Load s until next 4k-byte boundary. */ + lcbb %r1,0(%r3),6 /* Get bytes to 4k-byte boundary or 16. */ + + vfenezf %v17,%v16,%v16 /* Find element not equal with zero search. */ + vlgvb %r5,%v17,7 /* Load zero index or 16 if not found. */ + clrjl %r5,%r1,.Lcpy_found_align /* If found zero within loaded bytes, + copy bytes before and return. */ + + /* Align s to 16 byte. */ + risbgn %r4,%r3,60,128+63,0 /* %r3 = bits 60-63 of %r2 'and' 15. */ + lghi %r5,15 /* current_len = 15. */ + slr %r5,%r4 /* Compute highest index to 16byte boundary. */ + + vstl %v16,%r5,0(%r2) /* Copy loaded characters - no zero. */ + ahi %r5,1 /* Start loop at next character. */ + + /* Find zero in 16byte aligned loop. */ +.Lcpy_loop: + vl %v16,0(%r5,%r3) /* Load s. */ + vfenezfs %v17,%v16,%v16 /* Find element not equal with zero search. */ + je .Lcpy_found_v16_0 /* Jump away if zero was found. */ + vl %v18,16(%r5,%r3) /* Load next part of s. */ + vst %v16,0(%r5,%r2) /* Save previous part without zero to dst. */ + vfenezfs %v17,%v18,%v18 + je .Lcpy_found_v18_16 + vl %v16,32(%r5,%r3) + vst %v18,16(%r5,%r2) + vfenezfs %v17,%v16,%v16 + je .Lcpy_found_v16_32 + vl %v18,48(%r5,%r3) + vst %v16,32(%r5,%r2) + vfenezfs %v17,%v18,%v18 + je .Lcpy_found_v18_48 + vst %v18,48(%r5,%r2) + + aghi %r5,64 + j .Lcpy_loop /* No zero -> loop. */ + +.Lcpy_found_v16_32: + aghi %r5,32 +.Lcpy_found_v16_0: + la %r4,0(%r5,%r2) + vlgvb %r1,%v17,7 /* Load byte index of zero. */ + aghi %r1,3 /* Also copy remaining bytes of zero. */ + vstl %v16,%r1,0(%r4) /* Copy characters including zero. */ + lgr %r2,%r0 /* Load saved dest-ptr. */ + br %r14 + +.Lcpy_found_v18_48: + aghi %r5,32 +.Lcpy_found_v18_16: + la %r4,16(%r5,%r2) + vlgvb %r1,%v17,7 /* Load byte index of zero. */ + aghi %r1,3 /* Also copy remaining bytes of zero. */ + vstl %v18,%r1,0(%r4) /* Copy characters including zero. */ + lgr %r2,%r0 /* Load saved dest-ptr. */ + br %r14 + +.Lcpy_found_align: + aghi %r5,3 /* Also copy remaining bytes of found zero. */ + vstl %v16,%r5,0(%r2) /* Copy characters including zero. */ + lgr %r2,%r0 /* Load saved dest-ptr. */ + br %r14 +.Lfallback: + jg __wcscat_c +END(__wcscat_vx) +#endif /* HAVE_S390_VX_ASM_SUPPORT && IS_IN (libc) */ diff --git a/sysdeps/s390/multiarch/wcscat.c b/sysdeps/s390/multiarch/wcscat.c new file mode 100644 index 0000000000..a2a000b056 --- /dev/null +++ b/sysdeps/s390/multiarch/wcscat.c @@ -0,0 +1,28 @@ +/* Multiple versions of wcscat. + Copyright (C) 2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#if defined HAVE_S390_VX_ASM_SUPPORT && IS_IN (libc) +# include +# include + +s390_vx_libc_ifunc (__wcscat) +weak_alias (__wcscat, wcscat) + +#else +# include +#endif /* !(defined HAVE_S390_VX_ASM_SUPPORT && IS_IN (libc)) */ -- cgit 1.4.1