about summary refs log tree commit diff
path: root/sysdeps/powerpc
diff options
context:
space:
mode:
author: Geoff Keating <geoffk@cygnus.com> 2002-08-22 19:07:46 +0000
committer: Geoff Keating <geoffk@cygnus.com> 2002-08-22 19:07:46 +0000
commit: b8a5737a496ae3893f5f924852ad2cec565457b4 (patch)
tree: 4526094d02d07a38f24f106d3e34a36f52cb8932 /sysdeps/powerpc
parent: 7a14a672b9a9af25c0119f89af620532099fad95 (diff)
download: glibc-b8a5737a496ae3893f5f924852ad2cec565457b4.tar.gz
glibc-b8a5737a496ae3893f5f924852ad2cec565457b4.tar.xz
glibc-b8a5737a496ae3893f5f924852ad2cec565457b4.zip
2002-08-22 Steven Munroe <sjmunroe@us.ibm.com>
	* sysdeps/powerpc/elf/libc-start.c
	(__cache_line_size): Declare.
	(__aux_init_cache): New.
	(__libc_start_main): Change type of `auxvec' parameter to
	`ElfW(auxv_t) *'.  Correct walking of aux vector.  Call
	__aux_init_cache.
	* sysdeps/unix/sysv/linux/powerpc/dl-sysdep.c
	(__cache_line_size): Declare.
	(__aux_init_cache): New.
	(DL_PLATFORM_INIT): Define.
	* sysdeps/powerpc/memset.S: Define __cache_line_size and use its
	value to select the correct stride for dcbz.
2002-08-22  Steven Munroe  <sjmunroe@us.ibm.com>

	* sysdeps/powerpc/elf/libc-start.c 
	(__cache_line_size): Declare.
	(__aux_init_cache): New.
	(__libc_start_main): Change type of `auxvec' parameter to
	`ElfW(auxv_t) *'.  Correct walking of aux vector.  Call
	__aux_init_cache.
	* sysdeps/unix/sysv/linux/powerpc/dl-sysdep.c 
	(__cache_line_size): Declare.
	(__aux_init_cache): New.
	(DL_PLATFORM_INIT): Define.
	* sysdeps/powerpc/memset.S: Define __cache_line_size and use its
	value to select the correct stride for dcbz.
Diffstat (limited to 'sysdeps/powerpc')
-rw-r--r-- sysdeps/powerpc/elf/libc-start.c 41
-rw-r--r-- sysdeps/powerpc/memset.S 121
2 files changed, 151 insertions, 11 deletions
diff --git a/sysdeps/powerpc/elf/libc-start.c b/sysdeps/powerpc/elf/libc-start.c
index e20fd04fb7..c96e96609d 100644
--- a/sysdeps/powerpc/elf/libc-start.c
+++ b/sysdeps/powerpc/elf/libc-start.c
@@ -26,6 +26,10 @@ extern void __libc_init_first (int argc, char **argv, char **envp);
 
 extern int _dl_starting_up;
 weak_extern (_dl_starting_up)
+
+extern int __cache_line_size;
+weak_extern (__cache_line_size)
+
 extern int __libc_multiple_libcs;
 extern void *__libc_stack_end;
 
@@ -37,12 +41,33 @@ struct startup_info
   void (*fini) (void);
 };
 
+/* Scan the Aux Vector for the "Data Cache Block Size" entry.  If found,
+   verify that the weak extern __cache_line_size is defined by checking
+   that its address is not NULL.  If it is defined then assign the cache
+   block size value to __cache_line_size.  */
+static inline void
+__aux_init_cache (ElfW(auxv_t) *av)
+{
+  for (; av->a_type != AT_NULL; ++av)
+    switch (av->a_type)
+      {
+      case AT_DCACHEBSIZE:
+        {
+          int *cls = & __cache_line_size;
+          if (cls != NULL)
+            *cls = av->a_un.a_val;
+        }
+        break;
+      }
+}
+
+
 int
 /* GKM FIXME: GCC: this should get __BP_ prefix by virtue of the
    BPs in the arglist of startup_info.main and startup_info.init. */
 BP_SYM (__libc_start_main) (int argc, char *__unbounded *__unbounded ubp_av,
 		   char *__unbounded *__unbounded ubp_ev,
-		   void *__unbounded auxvec, void (*rtld_fini) (void),
+		   ElfW(auxv_t) *__unbounded auxvec, void (*rtld_fini) (void),
 		   struct startup_info *__unbounded stinfo,
 		   char *__unbounded *__unbounded stack_on_entry)
 {
@@ -66,6 +91,7 @@ BP_SYM (__libc_start_main) (int argc, char *__unbounded *__unbounded ubp_av,
      as a statically-linked program by Linux...	 */
   if (*stack_on_entry != NULL)
     {
+      char *__unbounded *__unbounded temp;
       /* ...in which case, we have argc as the top thing on the
 	 stack, followed by argv (NULL-terminated), envp (likewise),
 	 and the auxilary vector.  */
@@ -73,10 +99,12 @@ BP_SYM (__libc_start_main) (int argc, char *__unbounded *__unbounded ubp_av,
       ubp_av = stack_on_entry + 1;
       ubp_ev = ubp_av + argc + 1;
 #ifdef HAVE_AUX_VECTOR
-      auxvec = ubp_ev;
-      while (*(char *__unbounded *__unbounded) auxvec != NULL)
-	++auxvec;
-      ++auxvec;
+      temp = ubp_ev;
+      while (*temp != NULL)
+        ++temp;
+      auxvec = (ElfW(auxv_t) *)++temp;
+      
+
 # ifndef SHARED
       _dl_aux_init ((ElfW(auxv_t) *) auxvec);
 # endif
@@ -85,6 +113,9 @@ BP_SYM (__libc_start_main) (int argc, char *__unbounded *__unbounded ubp_av,
     }
 
   INIT_ARGV_and_ENVIRON;
+    
+  /* Initialize the __cache_line_size variable from the aux vector.  */
+  __aux_init_cache((ElfW(auxv_t) *) auxvec);
 
   /* Store something that has some relationship to the end of the
      stack, for backtraces.  This variable should be thread-specific.  */
diff --git a/sysdeps/powerpc/memset.S b/sysdeps/powerpc/memset.S
index eb11bdef19..bee87af0ce 100644
--- a/sysdeps/powerpc/memset.S
+++ b/sysdeps/powerpc/memset.S
@@ -21,12 +21,26 @@
 #include <bp-sym.h>
 #include <bp-asm.h>
 
+/* Define a global static that can hold the cache line size.  The
+   assumption is that startup code will access the "aux vector" to
+   obtain the value set by the kernel and store it into this
+   variable.  */
+   
+	.globl __cache_line_size
+	.section	".data","aw"
+	.align 2
+	.type	 __cache_line_size,@object
+	.size	 __cache_line_size,4
+__cache_line_size:
+	.long 0
+	.section	".text"
 /* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]));
    Returns 's'.
 
-   The memset is done in three sizes: byte (8 bits), word (32 bits),
-   cache line (256 bits). There is a special case for setting cache lines
-   to 0, to take advantage of the dcbz instruction.  */
+   The memset is done in four sizes: byte (8 bits), word (32 bits),
+   32-byte blocks (256 bits) and __cache_line_size (128, 256, 1024 bits).
+   There is a special case for setting whole cache lines to 0, which 
+   takes advantage of the dcbz instruction.  */
 
 EALIGN (BP_SYM (memset), 5, 1)
 
@@ -50,6 +64,10 @@ EALIGN (BP_SYM (memset), 5, 1)
 #define rNEG64	r8	/* constant -64 for clearing with dcbz */
 #define rNEG32	r9	/* constant -32 for clearing with dcbz */
 
+#define rGOT	r9	/* Address of the Global Offset Table.  */
+#define rCLS	r8	/* Cache line size obtained from static.  */
+#define rCLM	r9	/* Cache line size mask to check for cache alignment.  */
+
 #if __BOUNDED_POINTERS__
 	cmplwi	cr1, rRTN, 0
 	CHECK_BOUNDS_BOTH_WIDE (rMEMP0, rTMP, rTMP2, rLEN)
@@ -105,7 +123,17 @@ L(caligned):
 	cmplwi	cr1, rCHR, 0
 	clrrwi.	rALIGN, rLEN, 5
 	mtcrf	0x01, rLEN	/* 40th instruction from .align */
-	beq	cr1, L(zloopstart) /* special case for clearing memory using dcbz */
+	
+/* Check if we can use the special case for clearing memory using dcbz.
+   This requires that we know the correct cache line size for this    
+   processor.  Getting the __cache_line_size may require establishing GOT
+   addressability, so branch out of line to set this up.  */
+	beq	cr1, L(checklinesize) 
+	
+/* Store blocks of 32-bytes (256-bits) starting on a 32-byte boundary. 
+   Can't assume that rCHR is zero or that the cache line size is either
+   32-bytes or even known.  */
+L(nondcbz):
 	srwi	rTMP, rALIGN, 5
 	mtctr	rTMP
 	beq	L(medium)	/* we may not actually get to do a full line */
@@ -114,7 +142,9 @@ L(caligned):
 	li	rNEG64, -0x40
 	bdz	L(cloopdone)	/* 48th instruction from .align */
 
-L(c3):	dcbz	rNEG64, rMEMP
+/* We can't use dcbz here as we don't know the cache line size.  We can
+   use "data cache block touch for store", which is safe.  */
+L(c3):	dcbtst rNEG64, rMEMP
 	stw	rCHR, -4(rMEMP)
 	stw	rCHR, -8(rMEMP)
 	stw	rCHR, -12(rMEMP)
@@ -142,7 +172,10 @@ L(cloopdone):
 
 	.align 5
 	nop
-/* Clear lines of memory in 128-byte chunks.  */
+/* Clear cache lines of memory in 128-byte chunks.  
+   This code is optimized for processors with 32-byte cache lines.
+   It is further optimized for the 601 processor, which requires
+   some care in how the code is aligned in the i-cache.  */
 L(zloopstart):
 	clrlwi	rLEN, rLEN, 27
 	mtcrf	0x02, rALIGN
@@ -226,4 +259,80 @@ L(medium_28t):
 	stw	rCHR, -4(rMEMP)
 	stw	rCHR, -8(rMEMP)
 	blr
+	
+L(checklinesize):
+#ifdef SHARED
+	mflr rTMP
+/* If the remaining length is less than 32 bytes then don't bother getting
+	 the cache line size.  */
+	beq	L(medium)	
+/* Establishes GOT addressability so we can load __cache_line_size 
+   from static. This value was set from the aux vector during startup.  */
+	bl   _GLOBAL_OFFSET_TABLE_@local-4
+	mflr rGOT
+	lwz	 rGOT,__cache_line_size@got(rGOT)
+	lwz	 rCLS,0(rGOT)
+	mtlr rTMP
+#else 
+/* Load __cache_line_size from static. This value was set from the 
+   aux vector during startup.  */
+	lis	 rCLS,__cache_line_size@ha
+/* If the remaining length is less than 32 bytes then don't bother getting
+	 the cache line size.  */
+	beq	L(medium)
+	lwz  rCLS,__cache_line_size@l(rCLS)
+#endif
+	
+/* If the cache line size was not set then go to L(nondcbz), which is
+	safe for any cache line size.  */
+	cmplwi cr1,rCLS,0
+	beq	cr1,L(nondcbz)
+	
+/* If the cache line size is 32 bytes then go to L(zloopstart),
+	 which is coded specifically for 32-byte lines (and 601).  */
+	cmplwi cr1,rCLS,32
+	beq	cr1,L(zloopstart)
+	
+/* Now we know the cache line size and it is not 32-bytes.  However 
+	 we may not yet be aligned to the cache line and may have a partial 
+	 line to fill.  Touch it 1st to fetch the cache line.  */	
+	dcbtst 0,rMEMP	
+	
+	addi rCLM,rCLS,-1
+L(getCacheAligned):
+	cmplwi cr1,rLEN,32
+	and. rTMP,rCLM,rMEMP
+	blt	 cr1,L(handletail32)
+	beq	 L(cacheAligned)
+/* We are not aligned to the start of a cache line yet.  Store 32 bytes
+   of data and test again.  */
+	addi rMEMP,rMEMP,32
+	addi rLEN,rLEN,-32
+	stw	 rCHR,-32(rMEMP)
+	stw	 rCHR,-28(rMEMP)
+	stw	 rCHR,-24(rMEMP)
+	stw	 rCHR,-20(rMEMP)
+	stw	 rCHR,-16(rMEMP)
+	stw	 rCHR,-12(rMEMP)
+	stw	 rCHR,-8(rMEMP)
+	stw	 rCHR,-4(rMEMP)
+	b	 L(getCacheAligned)
+	
+/* Now we are aligned to the cache line and can use dcbz.  */	
+L(cacheAligned):
+	cmplw cr1,rLEN,rCLS
+	blt	 cr1,L(handletail32)
+	dcbz 0,rMEMP
+	subf rLEN,rCLS,rLEN
+	add	 rMEMP,rMEMP,rCLS
+	b	 L(cacheAligned)
+
+/* We are here because the cache line size was set, it was not
+   32 bytes, and the remainder (rLEN) is now less than the actual cache
+   line size.  Set up the preconditions for L(nondcbz) and go there to
+   store the remaining bytes.  */
+L(handletail32):
+	clrrwi.	rALIGN, rLEN, 5
+	b		L(nondcbz)
+		
 END (BP_SYM (memset))