about summary refs log tree commit diff
diff options
context:
space:
mode:
authorAdhemerval Zanella <azanella@linux.vnet.ibm.com>2011-12-29 19:15:10 -0500
committerAdhemerval Zanella <azanella@linux.vnet.ibm.com>2011-12-29 19:15:10 -0500
commit3380772a3c7f5c5dfc23604c7699b1e02624f049 (patch)
tree9a6b7cf9bef0656dd0c42b25c7540d26cd496f47
parent47a1ab537d97add9091078a9659e0999532ac2dd (diff)
downloadglibc-3380772a3c7f5c5dfc23604c7699b1e02624f049.tar.gz
glibc-3380772a3c7f5c5dfc23604c7699b1e02624f049.tar.xz
glibc-3380772a3c7f5c5dfc23604c7699b1e02624f049.zip
PowerPC - Optimization for str[n]casecmp functions
This patch provides throughput boost for the strcasecmp function
(25% on ppc32 and 40% on ppc64) and strncasecmp (15% on both ppc32
and ppc64) for POWER7. The optimization is done by manually
(strcasecmp) or automatically (strncasecmp) unrolling the test loop
to avoid CPU stalls caused by a test followed by a load.
-rw-r--r--ChangeLog18
-rw-r--r--sysdeps/powerpc/Makefile2
-rw-r--r--sysdeps/powerpc/locale-defines.sym5
-rw-r--r--sysdeps/powerpc/powerpc32/power7/Makefile4
-rw-r--r--sysdeps/powerpc/powerpc32/power7/strcasecmp.S132
-rw-r--r--sysdeps/powerpc/powerpc32/power7/strcasecmp_l.S5
-rw-r--r--sysdeps/powerpc/powerpc64/power7/Makefile5
-rw-r--r--sysdeps/powerpc/powerpc64/power7/strcasecmp.S125
-rw-r--r--sysdeps/powerpc/powerpc64/power7/strcasecmp_l.S5
9 files changed, 301 insertions, 0 deletions
diff --git a/ChangeLog b/ChangeLog
index f81b0c59ad..f3a26324cc 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,21 @@
+2011-11-16  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
+
+	* sysdeps/powerpc/Makefile: Added locale-defines.sym generation.
+	* sysdeps/powerpc/locale-defines.sym: Locale definitions for strcasecmp
+	optimized code.
+	* sysdeps/powerpc/powerpc32/power7/Makefile: New file: added unroll-loop
+	option for strncasecmp/strncasecmp_l compilation.
+	* sysdeps/powerpc/powerpc32/power7/strcasecmp.S: New file: strcasecmp
+	optimization for PPC32.
+	* sysdeps/powerpc/powerpc32/power7/strcasecmp_l.S: New file: strcasecmp_l
+	optimization for PPC32.
+	* sysdeps/powerpc/powerpc64/power7/Makefile: Added unroll-loop option for
+	strncasecmp/strncasecmp_l compilation.
+	* sysdeps/powerpc/powerpc64/power7/strcasecmp.S: New file: strcasecmp
+	optimization for PPC64.
+	* sysdeps/powerpc/powerpc64/power7/strcasecmp_l.S: New file: strcasecmp_l
+	optimization for PPC64.
+
 2011-11-18  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
 
 	* math/libm-test.inc: Added more nerabyint tests.
diff --git a/sysdeps/powerpc/Makefile b/sysdeps/powerpc/Makefile
index e43ca704f0..23a9a16730 100644
--- a/sysdeps/powerpc/Makefile
+++ b/sysdeps/powerpc/Makefile
@@ -23,4 +23,6 @@ endif
 ifeq ($(subdir),csu)
 # get offset to rtld_global._dl_hwcap
 gen-as-const-headers += rtld-global-offsets.sym
+# get offset to __locale_struct.__ctype_tolower
+gen-as-const-headers += locale-defines.sym
 endif
diff --git a/sysdeps/powerpc/locale-defines.sym b/sysdeps/powerpc/locale-defines.sym
new file mode 100644
index 0000000000..af64b920a4
--- /dev/null
+++ b/sysdeps/powerpc/locale-defines.sym
@@ -0,0 +1,5 @@
+#include <locale/localeinfo.h>
+
+--
+
+LOCALE_CTYPE_TOLOWER	offsetof (struct __locale_struct, __ctype_tolower)
diff --git a/sysdeps/powerpc/powerpc32/power7/Makefile b/sysdeps/powerpc/powerpc32/power7/Makefile
new file mode 100644
index 0000000000..5e8f4a28ba
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32/power7/Makefile
@@ -0,0 +1,4 @@
+ifeq ($(subdir),string)
+CFLAGS-strncase.c += -funroll-loops
+CFLAGS-strncase_l.c += -funroll-loops
+endif
diff --git a/sysdeps/powerpc/powerpc32/power7/strcasecmp.S b/sysdeps/powerpc/powerpc32/power7/strcasecmp.S
new file mode 100644
index 0000000000..5d84fce470
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32/power7/strcasecmp.S
@@ -0,0 +1,132 @@
+/* Optimized strcasecmp implementation for PowerPC32.
+   Copyright (C) 2011 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <bp-sym.h>
+#include <bp-asm.h>
+#include <locale-defines.h>
+
+/* int [r3] strcasecmp (const char *s1 [r3], const char *s2 [r4] )
+
+   or if defined USE_IN_EXTENDED_LOCALE_MODEL:
+
+   int [r3] strcasecmp_l (const char *s1 [r3], const char *s2 [r4],
+                          __locale_t loc [r5]) */
+
+#ifndef STRCMP
+# define __STRCMP __strcasecmp
+# define STRCMP   strcasecmp
+#endif
+
+ENTRY (BP_SYM (__STRCMP))
+
+#define rRTN	r3	/* Return value */
+#define rSTR1	r5	/* 1st string */
+#define rSTR2	r4	/* 2nd string */
+#define rLOCARG	r5	/* 3rd argument: locale_t */
+#define rCHAR1	r6	/* Byte readed from 1st string */
+#define rCHAR2	r7	/* Byte readed from 2nd string */
+#define rADDR1	r8	/* Address of tolower(rCHAR1) */
+#define rADDR2	r12	/* Address of tolower(rCHAR2) */
+#define rLWR1	r8	/* Byte tolower(rCHAR1) */
+#define rLWR2	r12	/* Byte tolower(rCHAR2) */
+#define rTMP	r0
+#define rGOT	r9	/* Address of the Global Offset Table */
+#define rLOC	r11	/* Default locale address */
+
+	cmpw    cr7, r3, r4
+#ifndef USE_IN_EXTENDED_LOCALE_MODEL
+# ifdef SHARED
+	mflr	rTMP
+	bcl	20,31,.L1
+.L1:	mflr	rGOT
+	addis	rGOT, rGOT, _GLOBAL_OFFSET_TABLE_-.L1@ha
+	addi 	rGOT, rGOT, _GLOBAL_OFFSET_TABLE_-.L1@l
+	lwz	rLOC, __libc_tsd_LOCALE@got@tprel(rGOT)
+	add 	rLOC, rLOC, __libc_tsd_LOCALE@tls
+	lwz	rLOC, 0(rLOC)
+	mtlr	rTMP
+# else
+	lis	rTMP,_GLOBAL_OFFSET_TABLE_@ha
+	la	rLOC,_GLOBAL_OFFSET_TABLE_@l(rTMP)
+	lwz	rLOC, __libc_tsd_LOCALE@got@tprel(rGOT)
+	add	rLOC, rLOC, __libc_tsd_LOCALE@tls
+	lwz	rLOC, 0(rLOC)
+# endif /* SHARED */
+#else
+	mr	rLOC, rLOCARG
+#endif
+	mr	rSTR1, rRTN
+	lwz	rLOC, LOCALE_CTYPE_TOLOWER(rLOC)
+	li	rRTN, 0
+	beqlr	cr7
+
+	/* Unrolling loop for POWER: loads are done with 'lbz' plus
+	offset and string descriptors are only updated in the end
+	of loop unrolling. */
+
+L(loop):
+	lbz	rCHAR1, 0(rSTR1)	/* Load char from s1 */
+	lbz	rCHAR2, 0(rSTR2)	/* Load char from s2 */
+	sldi	rADDR1, rCHAR1, 2	/* Calculate address for tolower(*s1) */
+	sldi	rADDR2, rCHAR2, 2	/* Calculate address for tolower(*s2) */
+	lwzx	rLWR1, rLOC, rADDR1	/* Load tolower(*s1) */
+	lwzx	rLWR2, rLOC, rADDR2	/* Load tolower(*s2) */
+	cmpwi	cr7, rCHAR1, 0		/* *s1 == '\0' ? */
+	subf.	r3, rLWR2, rLWR1
+	bnelr
+	beqlr	cr7
+	lbz	rCHAR1, 1(rSTR1)
+	lbz	rCHAR2, 1(rSTR2)
+	sldi	rADDR1, rCHAR1, 2
+	sldi	rADDR2, rCHAR2, 2
+	lwzx	rLWR1, rLOC, rADDR1
+	lwzx	rLWR2, rLOC, rADDR2
+	cmpwi	cr7, rCHAR1, 0
+	subf.	r3, rLWR2, rLWR1
+	bnelr
+	beqlr	cr7
+	lbz	rCHAR1, 2(rSTR1)
+	lbz	rCHAR2, 2(rSTR2)
+	sldi	rADDR1, rCHAR1, 2
+	sldi	rADDR2, rCHAR2, 2
+	lwzx	rLWR1, rLOC, rADDR1
+	lwzx	rLWR2, rLOC, rADDR2
+	cmpwi	cr7, rCHAR1, 0
+	subf.	r3, rLWR2, rLWR1
+	bnelr
+	beqlr	cr7
+	lbz	rCHAR1, 3(rSTR1)
+	lbz	rCHAR2, 3(rSTR2)
+	/* Increment both string descriptors */
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+	sldi	rADDR1, rCHAR1, 2
+	sldi	rADDR2, rCHAR2, 2
+	lwzx	rLWR1, rLOC, rADDR1
+	lwzx	rLWR2, rLOC, rADDR2
+	cmpwi	cr7, rCHAR1, 0
+	subf.	r3, rLWR2, rLWR1
+	bnelr
+	bne	cr7,L(loop)
+	blr
+END (BP_SYM (__STRCMP))
+
+weak_alias (BP_SYM (__STRCMP), BP_SYM (STRCMP))
+libc_hidden_builtin_def (__STRCMP)
diff --git a/sysdeps/powerpc/powerpc32/power7/strcasecmp_l.S b/sysdeps/powerpc/powerpc32/power7/strcasecmp_l.S
new file mode 100644
index 0000000000..c13c4ebcb8
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32/power7/strcasecmp_l.S
@@ -0,0 +1,5 @@
+#define USE_IN_EXTENDED_LOCALE_MODEL
+#define STRCMP   strcasecmp_l
+#define __STRCMP __strcasecmp_l
+
+#include "strcasecmp.S"
diff --git a/sysdeps/powerpc/powerpc64/power7/Makefile b/sysdeps/powerpc/powerpc64/power7/Makefile
index b0f45205b9..40aacfa15a 100644
--- a/sysdeps/powerpc/powerpc64/power7/Makefile
+++ b/sysdeps/powerpc/powerpc64/power7/Makefile
@@ -3,3 +3,8 @@ ifeq ($(subdir),elf)
 # optimization may require a TOC reference before relocations are resolved.
 CFLAGS-rtld.c += -mno-vsx
 endif
+
+ifeq ($(subdir),string)
+CFLAGS-strncase.c += -funroll-loops
+CFLAGS-strncase_l.c += -funroll-loops
+endif
diff --git a/sysdeps/powerpc/powerpc64/power7/strcasecmp.S b/sysdeps/powerpc/powerpc64/power7/strcasecmp.S
new file mode 100644
index 0000000000..1477b2e17e
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/strcasecmp.S
@@ -0,0 +1,125 @@
+/* Optimized strcasecmp implementation for PowerPC64.
+   Copyright (C) 2011 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <bp-sym.h>
+#include <bp-asm.h>
+#include <locale-defines.h>
+
+/* int [r3] strcasecmp (const char *s1 [r3], const char *s2 [r4] )
+
+   or if defined USE_IN_EXTENDED_LOCALE_MODEL:
+
+   int [r3] strcasecmp_l (const char *s1 [r3], const char *s2 [r4],
+                          __locale_t loc [r5]) */
+
+#ifndef STRCMP
+# define __STRCMP __strcasecmp
+# define STRCMP   strcasecmp
+#endif
+
+ENTRY (BP_SYM (__STRCMP))
+	CALL_MCOUNT 2
+
+#define rRTN	r3	/* Return value */
+#define rSTR1	r5	/* 1st string */
+#define rSTR2	r4	/* 2nd string */
+#define rLOCARG	r5	/* 3rd argument: locale_t */
+#define rCHAR1	r6	/* Byte readed from 1st string */
+#define rCHAR2	r7	/* Byte readed from 2nd string */
+#define rADDR1	r8	/* Address of tolower(rCHAR1) */
+#define rADDR2	r12	/* Address of tolower(rCHAR2) */
+#define rLWR1	r8	/* Word tolower(rCHAR1) */
+#define rLWR2	r12	/* Word tolower(rCHAR2) */
+#define rTMP	r9
+#define rLOC	r11	/* Default locale address */
+
+	cmpd	cr7, r3, r4
+#ifndef USE_IN_EXTENDED_LOCALE_MODEL
+	ld 	rTMP, __libc_tsd_LOCALE@got@tprel(r2)
+	add 	rLOC, rTMP, __libc_tsd_LOCALE@tls
+	ld	rLOC, 0(rLOC)
+#else
+	mr	rLOC, rLOCARG
+#endif
+	ld	rLOC, LOCALE_CTYPE_TOLOWER(rLOC)
+	mr	rSTR1, rRTN
+	li	rRTN, 0
+	beqlr	cr7
+
+
+	/* Unrolling loop for POWER: loads are done with 'lbz' plus
+	offset and string descriptors are only updated in the end
+	of loop unrolling. */
+
+	lbz	rCHAR1, 0(rSTR1)	/* Load char from s1 */
+	lbz	rCHAR2, 0(rSTR2)	/* Load char from s2 */
+L(loop):
+	cmpdi	rCHAR1, 0		/* *s1 == '\0' ? */
+	sldi	rADDR1, rCHAR1, 2	/* Calculate address for tolower(*s1) */
+	sldi	rADDR2, rCHAR2, 2	/* Calculate address for tolower(*s2) */
+	lwzx	rLWR1, rLOC, rADDR1	/* Load tolower(*s1) */
+	lwzx	rLWR2, rLOC, rADDR2	/* Load tolower(*s2) */
+	cmpw	cr1, rLWR1, rLWR2	/* r = tolower(*s1) == tolower(*s2) ? */
+	crorc	4*cr1+eq,eq,4*cr1+eq	/* (*s1 != '\0') || (r == 1) */
+	beq	cr1, L(done)
+	lbz	rCHAR1, 1(rSTR1)
+	lbz	rCHAR2, 1(rSTR2)
+	cmpdi	rCHAR1, 0
+	sldi	rADDR1, rCHAR1, 2
+	sldi	rADDR2, rCHAR2, 2
+	lwzx	rLWR1, rLOC, rADDR1
+	lwzx	rLWR2, rLOC, rADDR2
+	cmpw	cr1, rLWR1, rLWR2
+	crorc	4*cr1+eq,eq,4*cr1+eq
+	beq	cr1, L(done)
+	lbz	rCHAR1, 2(rSTR1)
+	lbz	rCHAR2, 2(rSTR2)
+	cmpdi	rCHAR1, 0
+	sldi	rADDR1, rCHAR1, 2
+	sldi	rADDR2, rCHAR2, 2
+	lwzx	rLWR1, rLOC, rADDR1
+	lwzx	rLWR2, rLOC, rADDR2
+	cmpw	cr1, rLWR1, rLWR2
+	crorc	4*cr1+eq,eq,4*cr1+eq
+	beq	cr1, L(done)
+	lbz	rCHAR1, 3(rSTR1)
+	lbz	rCHAR2, 3(rSTR2)
+	cmpdi	rCHAR1, 0
+	/* Increment both string descriptors */
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+	sldi	rADDR1, rCHAR1, 2
+	sldi	rADDR2, rCHAR2, 2
+	lwzx	rLWR1, rLOC, rADDR1
+	lwzx	rLWR2, rLOC, rADDR2
+	cmpw	cr1, rLWR1, rLWR2
+	crorc	4*cr1+eq,eq,4*cr1+eq
+	beq     cr1,L(done)
+	lbz	rCHAR1, 0(rSTR1)	/* Load char from s1 */
+	lbz	rCHAR2, 0(rSTR2)	/* Load char from s2 */
+	b	L(loop)
+L(done):
+	subf	r0, rLWR2, rLWR1
+	extsw	rRTN, r0
+	blr
+END (BP_SYM (__STRCMP))
+
+weak_alias (BP_SYM (__STRCMP), BP_SYM (STRCMP))
+libc_hidden_builtin_def (__STRCMP)
diff --git a/sysdeps/powerpc/powerpc64/power7/strcasecmp_l.S b/sysdeps/powerpc/powerpc64/power7/strcasecmp_l.S
new file mode 100644
index 0000000000..c13c4ebcb8
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/strcasecmp_l.S
@@ -0,0 +1,5 @@
+#define USE_IN_EXTENDED_LOCALE_MODEL
+#define STRCMP   strcasecmp_l
+#define __STRCMP __strcasecmp_l
+
+#include "strcasecmp.S"