From 3380772a3c7f5c5dfc23604c7699b1e02624f049 Mon Sep 17 00:00:00 2001 From: Adhemerval Zanella Date: Thu, 29 Dec 2011 19:15:10 -0500 Subject: PowerPC - Optimization for str[n]casecmp functions This patch provides throughput boost for the strcasecmp function (25% on ppc32 and 40% on ppc64) and strncasecmp (15% on both ppc32 and ppc64) for POWER7. The optimization is done by manually (strcasecmp) or automatically (strncasecmp) unrolling the test loop to avoid CPU stalls caused by a test followed by a load. --- ChangeLog | 18 ++++ sysdeps/powerpc/Makefile | 2 + sysdeps/powerpc/locale-defines.sym | 5 + sysdeps/powerpc/powerpc32/power7/Makefile | 4 + sysdeps/powerpc/powerpc32/power7/strcasecmp.S | 132 ++++++++++++++++++++++++ sysdeps/powerpc/powerpc32/power7/strcasecmp_l.S | 5 + sysdeps/powerpc/powerpc64/power7/Makefile | 5 + sysdeps/powerpc/powerpc64/power7/strcasecmp.S | 125 ++++++++++++++++++++++ sysdeps/powerpc/powerpc64/power7/strcasecmp_l.S | 5 + 9 files changed, 301 insertions(+) create mode 100644 sysdeps/powerpc/locale-defines.sym create mode 100644 sysdeps/powerpc/powerpc32/power7/Makefile create mode 100644 sysdeps/powerpc/powerpc32/power7/strcasecmp.S create mode 100644 sysdeps/powerpc/powerpc32/power7/strcasecmp_l.S create mode 100644 sysdeps/powerpc/powerpc64/power7/strcasecmp.S create mode 100644 sysdeps/powerpc/powerpc64/power7/strcasecmp_l.S diff --git a/ChangeLog b/ChangeLog index f81b0c59ad..f3a26324cc 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,21 @@ +2011-11-16 Adhemerval Zanella + + * sysdeps/powerpc/Makefile: Added locale-defines.sym generation. + * sysdeps/powerpc/locale-defines.sym: Locale definitions for strcasecmp + optimized code. + * sysdeps/powerpc/powerpc32/power7/Makefile: New file: added unroll-loop + option for strncasecmp/strncasecmp_l compilation. + * sysdeps/powerpc/powerpc32/power7/strcasecmp.S: New file: strcasecmp + optimization for PPC32. 
+ * sysdeps/powerpc/powerpc32/power7/strcasecmp_l.S: New file: strcasecmp_l + optimization for PPC32. + * sysdeps/powerpc/powerpc64/power7/Makefile: Added unroll-loop option for + strncasecmp/strncasecmp_l compilation. + * sysdeps/powerpc/powerpc64/power7/strcasecmp.S: New file: strcasecmp + optimization for PPC64. + * sysdeps/powerpc/powerpc64/power7/strcasecmp_l.S: New file: strcasecmp_l + optimization for PPC64. + 2011-11-18 Adhemerval Zanella * math/libm-test.inc: Added more nerabyint tests. diff --git a/sysdeps/powerpc/Makefile b/sysdeps/powerpc/Makefile index e43ca704f0..23a9a16730 100644 --- a/sysdeps/powerpc/Makefile +++ b/sysdeps/powerpc/Makefile @@ -23,4 +23,6 @@ endif ifeq ($(subdir),csu) # get offset to rtld_global._dl_hwcap gen-as-const-headers += rtld-global-offsets.sym +# get offset to __locale_struct.__ctype_tolower +gen-as-const-headers += locale-defines.sym endif diff --git a/sysdeps/powerpc/locale-defines.sym b/sysdeps/powerpc/locale-defines.sym new file mode 100644 index 0000000000..af64b920a4 --- /dev/null +++ b/sysdeps/powerpc/locale-defines.sym @@ -0,0 +1,5 @@ +#include + +-- + +LOCALE_CTYPE_TOLOWER offsetof (struct __locale_struct, __ctype_tolower) diff --git a/sysdeps/powerpc/powerpc32/power7/Makefile b/sysdeps/powerpc/powerpc32/power7/Makefile new file mode 100644 index 0000000000..5e8f4a28ba --- /dev/null +++ b/sysdeps/powerpc/powerpc32/power7/Makefile @@ -0,0 +1,4 @@ +ifeq ($(subdir),string) +CFLAGS-strncase.c += -funroll-loops +CFLAGS-strncase_l.c += -funroll-loops +endif diff --git a/sysdeps/powerpc/powerpc32/power7/strcasecmp.S b/sysdeps/powerpc/powerpc32/power7/strcasecmp.S new file mode 100644 index 0000000000..5d84fce470 --- /dev/null +++ b/sysdeps/powerpc/powerpc32/power7/strcasecmp.S @@ -0,0 +1,132 @@ +/* Optimized strcasecmp implementation for PowerPC32. + Copyright (C) 2011 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <bp-sym.h>
+#include <bp-asm.h>
+#include <locale-defines.h>
+
+/* int [r3] strcasecmp (const char *s1 [r3], const char *s2 [r4] )
+
+   or if defined USE_IN_EXTENDED_LOCALE_MODEL:
+
+   int [r3] strcasecmp_l (const char *s1 [r3], const char *s2 [r4],
+                          __locale_t loc [r5])  Returns tolower(*s1) - tolower(*s2) at the first difference, else 0.  */
+
+#ifndef STRCMP
+# define __STRCMP __strcasecmp
+# define STRCMP   strcasecmp
+#endif
+
+ENTRY (BP_SYM (__STRCMP))
+
+#define rRTN	r3	/* Return value */
+#define rSTR1	r5	/* 1st string (copied out of r3 below) */
+#define rSTR2	r4	/* 2nd string */
+#define rLOCARG	r5	/* 3rd argument: locale_t */
+#define rCHAR1	r6	/* Byte read from 1st string */
+#define rCHAR2	r7	/* Byte read from 2nd string */
+#define rADDR1	r8	/* Byte offset of tolower(rCHAR1) in the table */
+#define rADDR2	r12	/* Byte offset of tolower(rCHAR2) in the table */
+#define rLWR1	r8	/* Word tolower(rCHAR1) */
+#define rLWR2	r12	/* Word tolower(rCHAR2) */
+#define rTMP	r0
+#define rGOT	r9	/* Address of the Global Offset Table */
+#define rLOC	r11	/* Default locale address, then its tolower table */
+
+	cmpw    cr7, r3, r4	/* cr7 = (s1 == s2), tested after the setup below */
+#ifndef USE_IN_EXTENDED_LOCALE_MODEL
+# ifdef SHARED
+	mflr	rTMP		/* Save LR; bcl below overwrites it */
+	bcl	20,31,.L1	/* Branch-and-link to the next instruction */
+.L1:	mflr	rGOT		/* rGOT = address of .L1 */
+	addis	rGOT, rGOT, _GLOBAL_OFFSET_TABLE_-.L1@ha
+	addi 	rGOT, rGOT, _GLOBAL_OFFSET_TABLE_-.L1@l
+	lwz	rLOC, __libc_tsd_LOCALE@got@tprel(rGOT)
+	add 	rLOC, rLOC, __libc_tsd_LOCALE@tls
+	lwz	rLOC, 0(rLOC)	/* rLOC = current value of __libc_tsd_LOCALE */
+	mtlr	rTMP		/* Restore LR */
+# else
+	lis	rGOT,_GLOBAL_OFFSET_TABLE_@ha	/* Build the GOT address in rGOT: r0 as a base reads as 0 */
+	la	rGOT,_GLOBAL_OFFSET_TABLE_@l(rGOT)
+	lwz	rLOC, __libc_tsd_LOCALE@got@tprel(rGOT)
+	add	rLOC, rLOC, __libc_tsd_LOCALE@tls
+	lwz	rLOC, 0(rLOC)	/* rLOC = current value of __libc_tsd_LOCALE */
+# endif /* SHARED */
+#else
+	mr	rLOC, rLOCARG	/* Must precede the write to rSTR1 (same register) */
+#endif
+	mr	rSTR1, rRTN	/* Free r3 for the return value */
+	lwz	rLOC, LOCALE_CTYPE_TOLOWER(rLOC)	/* rLOC = locale's __ctype_tolower */
+	li	rRTN, 0
+	beqlr	cr7		/* s1 == s2: return 0 */
+
+	/* Unrolling loop for POWER: loads are done with 'lbz' plus
+	offset and string descriptors are only updated in the end
+	of loop unrolling. */
+
+L(loop):
+	lbz	rCHAR1, 0(rSTR1)	/* Load char from s1 */
+	lbz	rCHAR2, 0(rSTR2)	/* Load char from s2 */
+	slwi	rADDR1, rCHAR1, 2	/* Calculate address for tolower(*s1) */
+	slwi	rADDR2, rCHAR2, 2	/* Calculate address for tolower(*s2) */
+	lwzx	rLWR1, rLOC, rADDR1	/* Load tolower(*s1) */
+	lwzx	rLWR2, rLOC, rADDR2	/* Load tolower(*s2) */
+	cmpwi	cr7, rCHAR1, 0		/* *s1 == '\0' ? */
+	subf.	r3, rLWR2, rLWR1	/* r3 = tolower(*s1) - tolower(*s2) */
+	bnelr				/* Difference found: return it */
+	beqlr	cr7			/* End of string: return 0 */
+	lbz	rCHAR1, 1(rSTR1)
+	lbz	rCHAR2, 1(rSTR2)
+	slwi	rADDR1, rCHAR1, 2
+	slwi	rADDR2, rCHAR2, 2
+	lwzx	rLWR1, rLOC, rADDR1
+	lwzx	rLWR2, rLOC, rADDR2
+	cmpwi	cr7, rCHAR1, 0
+	subf.	r3, rLWR2, rLWR1
+	bnelr
+	beqlr	cr7
+	lbz	rCHAR1, 2(rSTR1)
+	lbz	rCHAR2, 2(rSTR2)
+	slwi	rADDR1, rCHAR1, 2
+	slwi	rADDR2, rCHAR2, 2
+	lwzx	rLWR1, rLOC, rADDR1
+	lwzx	rLWR2, rLOC, rADDR2
+	cmpwi	cr7, rCHAR1, 0
+	subf.	r3, rLWR2, rLWR1
+	bnelr
+	beqlr	cr7
+	lbz	rCHAR1, 3(rSTR1)
+	lbz	rCHAR2, 3(rSTR2)
+	/* Increment both string descriptors */
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+	slwi	rADDR1, rCHAR1, 2
+	slwi	rADDR2, rCHAR2, 2
+	lwzx	rLWR1, rLOC, rADDR1
+	lwzx	rLWR2, rLOC, rADDR2
+	cmpwi	cr7, rCHAR1, 0
+	subf.	r3, rLWR2, rLWR1
+	bnelr
+	bne	cr7,L(loop)		/* Not at end of string: next 4 bytes */
+	blr				/* End of string: r3 is 0 here */
+END (BP_SYM (__STRCMP))
+
+weak_alias (BP_SYM (__STRCMP), BP_SYM (STRCMP))
+libc_hidden_builtin_def (__STRCMP)
diff --git a/sysdeps/powerpc/powerpc32/power7/strcasecmp_l.S b/sysdeps/powerpc/powerpc32/power7/strcasecmp_l.S
new file mode 100644
index 0000000000..c13c4ebcb8
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32/power7/strcasecmp_l.S
@@ -0,0 +1,5 @@
+#define USE_IN_EXTENDED_LOCALE_MODEL
+#define STRCMP   strcasecmp_l
+#define __STRCMP __strcasecmp_l
+
+#include "strcasecmp.S"
diff --git a/sysdeps/powerpc/powerpc64/power7/Makefile b/sysdeps/powerpc/powerpc64/power7/Makefile
index b0f45205b9..40aacfa15a 100644
--- a/sysdeps/powerpc/powerpc64/power7/Makefile
+++ b/sysdeps/powerpc/powerpc64/power7/Makefile
@@ -3,3 +3,8 @@ ifeq ($(subdir),elf)
 # optimization may require a TOC reference before relocations are resolved.
 CFLAGS-rtld.c += -mno-vsx
 endif
+
+ifeq ($(subdir),string)
+CFLAGS-strncase.c += -funroll-loops
+CFLAGS-strncase_l.c += -funroll-loops
+endif
diff --git a/sysdeps/powerpc/powerpc64/power7/strcasecmp.S b/sysdeps/powerpc/powerpc64/power7/strcasecmp.S
new file mode 100644
index 0000000000..1477b2e17e
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/strcasecmp.S
@@ -0,0 +1,125 @@
+/* Optimized strcasecmp implementation for PowerPC64.
+   Copyright (C) 2011 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <bp-sym.h>
+#include <bp-asm.h>
+#include <locale-defines.h>
+
+/* int [r3] strcasecmp (const char *s1 [r3], const char *s2 [r4] )
+
+   or if defined USE_IN_EXTENDED_LOCALE_MODEL:
+
+   int [r3] strcasecmp_l (const char *s1 [r3], const char *s2 [r4],
+                          __locale_t loc [r5])  Returns tolower(*s1) - tolower(*s2) at the first difference, else 0.  */
+
+#ifndef STRCMP
+# define __STRCMP __strcasecmp
+# define STRCMP   strcasecmp
+#endif
+
+ENTRY (BP_SYM (__STRCMP))
+	CALL_MCOUNT 2
+
+#define rRTN	r3	/* Return value */
+#define rSTR1	r5	/* 1st string (copied out of r3 below) */
+#define rSTR2	r4	/* 2nd string */
+#define rLOCARG	r5	/* 3rd argument: locale_t */
+#define rCHAR1	r6	/* Byte read from 1st string */
+#define rCHAR2	r7	/* Byte read from 2nd string */
+#define rADDR1	r8	/* Byte offset of tolower(rCHAR1) in the table */
+#define rADDR2	r12	/* Byte offset of tolower(rCHAR2) in the table */
+#define rLWR1	r8	/* Word tolower(rCHAR1) */
+#define rLWR2	r12	/* Word tolower(rCHAR2) */
+#define rTMP	r9
+#define rLOC	r11	/* Default locale address, then its tolower table */
+
+	cmpd	cr7, r3, r4	/* cr7 = (s1 == s2), tested after the setup below */
+#ifndef USE_IN_EXTENDED_LOCALE_MODEL
+	ld 	rTMP, __libc_tsd_LOCALE@got@tprel(r2)
+	add 	rLOC, rTMP, __libc_tsd_LOCALE@tls
+	ld	rLOC, 0(rLOC)	/* rLOC = current value of __libc_tsd_LOCALE */
+#else
+	mr	rLOC, rLOCARG	/* Must precede the write to rSTR1 (same register) */
+#endif
+	ld	rLOC, LOCALE_CTYPE_TOLOWER(rLOC)	/* rLOC = locale's __ctype_tolower */
+	mr	rSTR1, rRTN	/* Free r3 for the return value */
+	li	rRTN, 0
+	beqlr	cr7		/* s1 == s2: return 0 */
+
+
+	/* Unrolling loop for POWER: loads are done with 'lbz' plus
+	offset and string descriptors are only updated in the end
+	of loop unrolling. */
+
+	lbz	rCHAR1, 0(rSTR1)	/* Load char from s1 */
+	lbz	rCHAR2, 0(rSTR2)	/* Load char from s2 */
+L(loop):
+	cmpdi	rCHAR1, 0		/* cr0: *s1 == '\0' ? */
+	sldi	rADDR1, rCHAR1, 2	/* Calculate address for tolower(*s1) */
+	sldi	rADDR2, rCHAR2, 2	/* Calculate address for tolower(*s2) */
+	lwzx	rLWR1, rLOC, rADDR1	/* Load tolower(*s1) */
+	lwzx	rLWR2, rLOC, rADDR2	/* Load tolower(*s2) */
+	cmpw	cr1, rLWR1, rLWR2	/* r = tolower(*s1) == tolower(*s2) ? */
+	crorc	4*cr1+eq,eq,4*cr1+eq	/* cr1.eq = (*s1 == '\0') || (r == 0) */
+	beq	cr1, L(done)		/* End of string or mismatch: finish */
+	lbz	rCHAR1, 1(rSTR1)
+	lbz	rCHAR2, 1(rSTR2)
+	cmpdi	rCHAR1, 0
+	sldi	rADDR1, rCHAR1, 2
+	sldi	rADDR2, rCHAR2, 2
+	lwzx	rLWR1, rLOC, rADDR1
+	lwzx	rLWR2, rLOC, rADDR2
+	cmpw	cr1, rLWR1, rLWR2
+	crorc	4*cr1+eq,eq,4*cr1+eq
+	beq	cr1, L(done)
+	lbz	rCHAR1, 2(rSTR1)
+	lbz	rCHAR2, 2(rSTR2)
+	cmpdi	rCHAR1, 0
+	sldi	rADDR1, rCHAR1, 2
+	sldi	rADDR2, rCHAR2, 2
+	lwzx	rLWR1, rLOC, rADDR1
+	lwzx	rLWR2, rLOC, rADDR2
+	cmpw	cr1, rLWR1, rLWR2
+	crorc	4*cr1+eq,eq,4*cr1+eq
+	beq	cr1, L(done)
+	lbz	rCHAR1, 3(rSTR1)
+	lbz	rCHAR2, 3(rSTR2)
+	cmpdi	rCHAR1, 0
+	/* Increment both string descriptors */
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+	sldi	rADDR1, rCHAR1, 2
+	sldi	rADDR2, rCHAR2, 2
+	lwzx	rLWR1, rLOC, rADDR1
+	lwzx	rLWR2, rLOC, rADDR2
+	cmpw	cr1, rLWR1, rLWR2
+	crorc	4*cr1+eq,eq,4*cr1+eq
+	beq	cr1,L(done)
+	lbz	rCHAR1, 0(rSTR1)	/* Load char from s1 */
+	lbz	rCHAR2, 0(rSTR2)	/* Load char from s2 */
+	b	L(loop)
+L(done):
+	subf	r0, rLWR2, rLWR1	/* r0 = tolower(*s1) - tolower(*s2) */
+	extsw	rRTN, r0		/* Sign-extend the 32-bit difference */
+	blr
+END (BP_SYM (__STRCMP))
+
+weak_alias (BP_SYM (__STRCMP), BP_SYM (STRCMP))
+libc_hidden_builtin_def (__STRCMP)
diff --git a/sysdeps/powerpc/powerpc64/power7/strcasecmp_l.S b/sysdeps/powerpc/powerpc64/power7/strcasecmp_l.S
new file mode 100644
index 0000000000..c13c4ebcb8
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/strcasecmp_l.S
@@ -0,0 +1,5 @@
+#define USE_IN_EXTENDED_LOCALE_MODEL
+#define STRCMP   strcasecmp_l
+#define __STRCMP __strcasecmp_l
+
+#include "strcasecmp.S"
-- 
cgit 1.4.1