about summary refs log tree commit diff
path: root/sysdeps/sparc
diff options
context:
space:
mode:
Diffstat (limited to 'sysdeps/sparc')
-rw-r--r--  sysdeps/sparc/sparc32/dl-machine.h  77
-rw-r--r--  sysdeps/sparc/sparc64/addmul_1.S  2
-rw-r--r--  sysdeps/sparc/sparc64/dl-machine.h  330
-rw-r--r--  sysdeps/sparc/sparc64/elf/crtbegin.S  3
-rw-r--r--  sysdeps/sparc/sparc64/elf/crtend.S  3
-rw-r--r--  sysdeps/sparc/sparc64/elf/start.S  24
-rw-r--r--  sysdeps/sparc/sparc64/lshift.S  2
-rw-r--r--  sysdeps/sparc/sparc64/mul_1.S  2
-rw-r--r--  sysdeps/sparc/sparc64/rshift.S  2
-rw-r--r--  sysdeps/sparc/sparc64/submul_1.S  2
10 files changed, 339 insertions, 108 deletions
diff --git a/sysdeps/sparc/sparc32/dl-machine.h b/sysdeps/sparc/sparc32/dl-machine.h
index da15e94a69..e1fb2b68ce 100644
--- a/sysdeps/sparc/sparc32/dl-machine.h
+++ b/sysdeps/sparc/sparc32/dl-machine.h
@@ -101,14 +101,26 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
 {
   Elf32_Addr *plt;
   extern void _dl_runtime_resolve (Elf32_Word);
+  extern void _dl_runtime_profile (Elf32_Word);
 
   if (l->l_info[DT_JMPREL] && lazy)
     {
+      Elf32_Addr rfunc;
+
       /* The entries for functions in the PLT have not yet been filled in.
 	 Their initial contents will arrange when called to set the high 22
 	 bits of %g1 with an offset into the .rela.plt section and jump to
 	 the beginning of the PLT.  */
       plt = (Elf32_Addr *) (l->l_addr + l->l_info[DT_PLTGOT]->d_un.d_ptr);
+      if (! profile)
+	rfunc = (Elf32_Addr) &_dl_runtime_resolve;
+      else
+	{
+	  rfunc = (Elf32_Addr) &_dl_runtime_profile;
+
+	  if (_dl_name_match_p (_dl_profile, l))
+	    _dl_profile_map = l;
+	}
 
       /* The beginning of the PLT does:
 
@@ -123,8 +135,7 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
 
       plt[0] = OPCODE_SAVE_SP;
       /* Construct PC-relative word address.  */
-      plt[1] = OPCODE_CALL | (((Elf32_Addr) &_dl_runtime_resolve -
-			       (Elf32_Addr) &plt[1]) >> 2);
+      plt[1] = OPCODE_CALL | ((rfunc - (Elf32_Addr) &plt[1]) >> 2);
       plt[2] = OPCODE_NOP;	/* Fill call delay slot.  */
       plt[3] = (Elf32_Addr) l;
     }
@@ -134,20 +145,36 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
 
 /* This code is used in dl-runtime.c to call the `fixup' function
    and then redirect to the address it returns.  */
-#define ELF_MACHINE_RUNTIME_TRAMPOLINE asm ("\
-	.globl _dl_runtime_resolve
-	.type _dl_runtime_resolve, @function
-_dl_runtime_resolve:
+#define TRAMPOLINE_TEMPLATE(tramp_name, fixup_name)	\
+  asm ( "\
+	.text
+	.globl	" #tramp_name "
+	.type	" #tramp_name ", @function
+	.align	32
+" #tramp_name ":
 	/* Set up the arguments to fixup --
 	   %o0 = link_map out of plt0
-	   %o1 = offset of reloc entry  */
+	   %o1 = offset of reloc entry
+	   %o2 = return address  */
 	ld	[%o7 + 8], %o0
 	srl	%g1, 10, %o1
-	call	fixup
+	mov	%i7, %o2
+	call	" #fixup_name "
 	 sub	%o1, 4*12, %o1
 	jmp	%o0
 	 restore
-	.size _dl_runtime_resolve, . - _dl_runtime_resolve");
+	.size	" #tramp_name ", . - " #tramp_name "
+	.previous")
+
+#ifndef PROF
+#define ELF_MACHINE_RUNTIME_TRAMPOLINE 			\
+  TRAMPOLINE_TEMPLATE (_dl_runtime_resolve, fixup);	\
+  TRAMPOLINE_TEMPLATE (_dl_runtime_profile, profile_fixup);
+#else
+#define ELF_MACHINE_RUNTIME_TRAMPOLINE			\
+  TRAMPOLINE_TEMPLATE (_dl_runtime_resolve, fixup);	\
+  TRAMPOLINE_TEMPLATE (_dl_runtime_profile, fixup);
+#endif
 
 /* Nonzero iff TYPE should not be allowed to resolve to one of
    the main executable's symbols, as for a COPY reloc.  */
@@ -171,9 +198,10 @@ _dl_runtime_resolve:
    its return value is the user program's entry point.  */
 
 #define RTLD_START __asm__ ("\
-.text
-	.globl _start
-	.type _start,@function
+	.text
+	.globl	_start
+	.type	_start, @function
+	.align	32
 _start:
   /* Allocate space for functions to drop their arguments.  */
 	sub	%sp, 6*4, %sp
@@ -181,8 +209,8 @@ _start:
 	call	_dl_start
 	 add	%sp, 22*4, %o0
 	/* FALTHRU */
-	.globl _dl_start_user
-	.type _dl_start_user,@function
+	.globl	_dl_start_user
+	.type	_dl_start_user, @function
 _dl_start_user:
   /* Load the PIC register.  */
 1:	call	2f
@@ -191,6 +219,12 @@ _dl_start_user:
 	add	%l7, %o7, %l7
   /* Save the user entry point address in %l0 */
 	mov	%o0, %l0
+  /* Store the highest stack address.  */
+	sethi	%hi(__libc_stack_end), %g2
+	or	%g2, %lo(__libc_stack_end), %g2
+	ld	[%l7 + %g2], %l1
+	add	%sp, 6*4, %l2
+	st	%l2, [%l1]
   /* See if we were run as a command with the executable file name as an
      extra leading argument.  If so, adjust the contents of the stack.  */
 	sethi	%hi(_dl_skip_args), %g2
@@ -230,7 +264,7 @@ _dl_start_user:
 	st	%i4, [%i1+4]
 	bne	23b
 	 add	%i1, 8, %i1
-  /* Load _dl_main_searchlist to pass to _dl_init_next.  */
+  /* Load searchlist of the main object to pass to _dl_init_next.  */
 3:	sethi	%hi(_dl_main_searchlist), %g1
 	or	%g1, %lo(_dl_main_searchlist), %g1
 	ld	[%l7+%g1], %l1
@@ -242,8 +276,7 @@ _dl_start_user:
 	beq	5f
 	 nop
 	jmpl	%o0, %o7
-	 nop
-	ba,a	4b
+	 sub	%o7, 28, %o7
   /* Clear the startup flag.  */
 5:	sethi	%hi(_dl_starting_up), %g1
 	or	%g1, %lo(_dl_starting_up), %g1
@@ -256,8 +289,8 @@ _dl_start_user:
   /* Jump to the user's entry point and deallocate the extra stack we got.  */
 	jmp	%l0
 	 add	%sp, 6*4, %sp
-	.size   _dl_start_user,.-_dl_start_user
-.previous");
+	.size   _dl_start_user, . - _dl_start_user
+	.previous");
 
 static inline void
 elf_machine_fixup_plt (struct link_map *map, const Elf32_Rela *reloc,
@@ -343,7 +376,10 @@ elf_machine_rela (struct link_map *map, const Elf32_Rela *reloc,
       switch (ELF32_R_TYPE (reloc->r_info))
 	{
 	case R_SPARC_COPY:
-#ifndef RTLD_BOOTSTRAP
+	  if (sym == NULL)
+	    /* This can happen in trace mode if an object could not be
+	       found.  */
+	    break;
 	  if (sym->st_size > refsym->st_size
 	      || (_dl_verbose && sym->st_size < refsym->st_size))
 	    {
@@ -359,7 +395,6 @@ elf_machine_rela (struct link_map *map, const Elf32_Rela *reloc,
 	    }
 	  memcpy (reloc_addr, (void *) value, MIN (sym->st_size,
 						   refsym->st_size));
-#endif
 	  break;
 	case R_SPARC_GLOB_DAT:
 	case R_SPARC_32:
diff --git a/sysdeps/sparc/sparc64/addmul_1.S b/sysdeps/sparc/sparc64/addmul_1.S
index 260481230f..6782db77df 100644
--- a/sysdeps/sparc/sparc64/addmul_1.S
+++ b/sysdeps/sparc/sparc64/addmul_1.S
@@ -32,7 +32,7 @@
 
 ENTRY(__mpn_addmul_1)
 	!#PROLOGUE#	0
-	save	%sp,-128,%sp
+	save	%sp,-192,%sp
 	!#PROLOGUE#	1
 
 	sub	%g0,%i2,%o7
diff --git a/sysdeps/sparc/sparc64/dl-machine.h b/sysdeps/sparc/sparc64/dl-machine.h
index 788e239779..33def6b455 100644
--- a/sysdeps/sparc/sparc64/dl-machine.h
+++ b/sysdeps/sparc/sparc64/dl-machine.h
@@ -21,19 +21,15 @@
 
 #include <assert.h>
 #include <string.h>
-#include <link.h>
 #include <sys/param.h>
+#include <elf/ldsodefs.h>
 #include <sysdep.h>
 
-
-/* Translate a processor-specific dynamic tag to the index into l_info.  */
-#define DT_SPARC(x)	(DT_SPARC_##x - DT_LOPROC + DT_NUM)
-
 /* Return nonzero iff E_MACHINE is compatible with the running host.  */
 static inline int
 elf_machine_matches_host (Elf64_Half e_machine)
 {
-  return e_machine == EM_SPARC64;
+  return e_machine == EM_SPARCV9;
 }
 
 /* Return the link-time address of _DYNAMIC.  Conveniently, this is the
@@ -42,12 +38,11 @@ elf_machine_matches_host (Elf64_Half e_machine)
 static inline Elf64_Addr
 elf_machine_dynamic (void)
 {
-  register Elf64_Addr elf_pic_register __asm__("%l7");
+  register Elf64_Addr *elf_pic_register __asm__("%l7");
 
-  return *(Elf64_Addr *)elf_pic_register;
+  return *elf_pic_register;
 }
 
-
 /* Return the run-time load address of the shared object.  */
 static inline Elf64_Addr
 elf_machine_load_address (void)
@@ -67,21 +62,92 @@ elf_machine_load_address (void)
   return pc - *(Elf64_Addr *)(elf_pic_register + la);
 }
 
+/* Patch the PLT entry at RELOC_ADDR to transfer control to VALUE.  There are
+   3 cases, each with its own code sequence.  I love V9 code models...  */
 static inline void
 elf_machine_fixup_plt(struct link_map *map, const Elf64_Rela *reloc,
                       Elf64_Addr *reloc_addr, Elf64_Addr value)
 {
-  Elf64_Dyn *pltfmt = map->l_info[DT_SPARC(PLTFMT)];
-  switch (pltfmt ? pltfmt->d_un.d_val : 0)
+  unsigned int *insns = (unsigned int *) reloc_addr;
+  Elf64_Addr plt_vaddr = (Elf64_Addr) reloc_addr;
+
+  /* Now move plt_vaddr up to the call instruction (insns[2] in case 2).  */
+  plt_vaddr += (2 * 4);
+
+  /* 32-bit Sparc style, the target is in the lower 32-bits of
+     address space.  */
+  if ((value >> 32) == 0)
     {
-    case 1: /* .got.plt with absolute addresses */
-      *reloc_addr = value;
-      break;
-    case 2: /* .got.plt with got-relative addresses */
-      *reloc_addr = value - (map->l_info[DT_PLTGOT]->d_un.d_ptr + map->l_addr);
-      break;
-    default:
-      assert (! "unexpected .plt format type");
+      /* sethi	%hi(target), %g1
+	 jmpl	%g1 + %lo(target), %g0  */
+
+      insns[2] = 0x81c06000 | (value & 0x3ff);
+      __asm __volatile ("flush %0 + 8" : : "r" (insns));
+
+      insns[1] = 0x03000000 | ((unsigned int)(value >> 10));
+      __asm __volatile ("flush %0 + 4" : : "r" (insns));
+    }
+  /* A single call reaches the target if it lies within +/- 2GB of the call
+     (30-bit signed word displacement), i.e. the distance fits in 31 bits.  */
+  else if ((plt_vaddr > value
+	    && ((plt_vaddr - value) >> 31) == 0)
+	   || (value > plt_vaddr
+	       && ((value - plt_vaddr) >> 31) == 0))
+    {
+      unsigned int displacement;
+
+      if (plt_vaddr > value)
+	displacement = (0 - (plt_vaddr - value));
+      else
+	displacement = value - plt_vaddr;
+
+      /* mov	%o7, %g1
+	 call	displacement
+	  mov	%g1, %o7  */
+
+      insns[3] = 0x9e100001;
+      __asm __volatile ("flush %0 + 12" : : "r" (insns));
+
+      insns[2] = 0x40000000 | (displacement >> 2);
+      __asm __volatile ("flush %0 + 8" : : "r" (insns));
+
+      insns[1] = 0x8210000f;
+      __asm __volatile ("flush %0 + 4" : : "r" (insns));
+    }
+  /* Worst case, ho hum...  */
+  else
+    {
+      unsigned int high32 = (value >> 32);
+      unsigned int low32 = (unsigned int) value;
+
+      /* ??? Some tricks can be stolen from the sparc64 egcs backend
+	     constant formation code I wrote.  -DaveM  */
+
+      /* sethi	%hh(value), %g1
+	 sethi	%lm(value), %g2
+	 or	%g1, %hm(value), %g1
+	 or	%g2, %lo(value), %g2
+	 sllx	%g1, 32, %g1
+	 jmpl	%g1 + %g2, %g0
+	  nop  */
+
+      insns[6] = 0x81c04002;
+      __asm __volatile ("flush %0 + 24" : : "r" (insns));
+
+      insns[5] = 0x83287020;
+      __asm __volatile ("flush %0 + 20" : : "r" (insns));
+
+      insns[4] = 0x8410a000 | (low32 & 0x3ff);
+      __asm __volatile ("flush %0 + 16" : : "r" (insns));
+
+      insns[3] = 0x82106000 | (high32 & 0x3ff);
+      __asm __volatile ("flush %0 + 12" : : "r" (insns));
+
+      insns[2] = 0x05000000 | (low32 >> 10);
+      __asm __volatile ("flush %0 + 8" : : "r" (insns));
+
+      insns[1] = 0x03000000 | (high32 >> 10);
+      __asm __volatile ("flush %0 + 4" : : "r" (insns));
     }
 }
 
@@ -119,7 +185,7 @@ elf_machine_rela (struct link_map *map, const Elf64_Rela *reloc,
 #endif
 	*reloc_addr = map->l_addr + reloc->r_addend;
     }
-  else
+  else if (ELF64_R_TYPE (reloc->r_info) != R_SPARC_NONE) /* Who is Wilbur? */
     {
       const Elf64_Sym *const refsym = sym;
       Elf64_Addr value;
@@ -137,6 +203,10 @@ elf_machine_rela (struct link_map *map, const Elf64_Rela *reloc,
       switch (ELF64_R_TYPE (reloc->r_info))
 	{
 	case R_SPARC_COPY:
+	  if (sym == NULL)
+	    /* This can happen in trace mode if an object could not be
+	       found.  */
+	    break;
 	  if (sym->st_size > refsym->st_size
 	      || (_dl_verbose && sym->st_size < refsym->st_size))
 	    {
@@ -164,6 +234,9 @@ elf_machine_rela (struct link_map *map, const Elf64_Rela *reloc,
 	case R_SPARC_16:
 	  *(short *) reloc_addr = value;
 	  break;
+	case R_SPARC_32:
+	  *(unsigned int *) reloc_addr = value;
+	  break;
 	case R_SPARC_DISP8:
 	  *(char *) reloc_addr = (value - (Elf64_Addr) reloc_addr);
 	  break;
@@ -171,27 +244,64 @@ elf_machine_rela (struct link_map *map, const Elf64_Rela *reloc,
 	  *(short *) reloc_addr = (value - (Elf64_Addr) reloc_addr);
 	  break;
 	case R_SPARC_DISP32:
-	  *(unsigned int *)reloc_addr = (value - (Elf64_Addr) reloc_addr);
-	  break;
-	case R_SPARC_LO10:
-	  *(unsigned *)reloc_addr = (*(unsigned *)reloc_addr & ~0x3ff)
-				     | (value & 0x3ff);
+	  *(unsigned int *) reloc_addr = (value - (Elf64_Addr) reloc_addr);
 	  break;
 	case R_SPARC_WDISP30:
-	  *(unsigned *)reloc_addr = ((*(unsigned *)reloc_addr & 0xc0000000)
-			 | ((value - (Elf64_Addr) reloc_addr) >> 2));
+	  *(unsigned int *) reloc_addr =
+	    ((*(unsigned int *)reloc_addr & 0xc0000000) |
+	     ((value - (Elf64_Addr) reloc_addr) >> 2));
+	  break;
+
+	/* MEDLOW code model relocs */
+	case R_SPARC_LO10:
+	  *(unsigned int *) reloc_addr =
+	    ((*(unsigned int *)reloc_addr & ~0x3ff) |
+	     (value & 0x3ff));
 	  break;
 	case R_SPARC_HI22:
-	  *(unsigned *)reloc_addr = (*(unsigned *)reloc_addr & 0xffc00000)
-				     | (value >> 10);
+	  *(unsigned int *) reloc_addr =
+	    ((*(unsigned int *)reloc_addr & 0xffc00000) |
+	     (value >> 10));
+	  break;
+
+	/* MEDMID code model relocs */
+	case R_SPARC_H44:
+	  *(unsigned int *) reloc_addr =
+	    ((*(unsigned int *)reloc_addr & 0xffc00000) |
+	     (value >> 22));
+	  break;
+	case R_SPARC_M44:
+	  *(unsigned int *) reloc_addr =
+	    ((*(unsigned int *)reloc_addr & ~0x3ff) |
+	     ((value >> 12) & 0x3ff));
+	  break;
+	case R_SPARC_L44:
+	  *(unsigned int *) reloc_addr =
+	    ((*(unsigned int *)reloc_addr & ~0xfff) |
+	     (value & 0xfff));
+	  break;
+
+	/* MEDANY code model relocs */
+	case R_SPARC_HH22:
+	  *(unsigned int *) reloc_addr =
+	    ((*(unsigned int *)reloc_addr & 0xffc00000) |
+	     (value >> 42));
+	  break;
+	case R_SPARC_HM10:
+	  *(unsigned int *) reloc_addr =
+	    ((*(unsigned int *)reloc_addr & ~0x3ff) |
+	     ((value >> 32) & 0x3ff));
+	  break;
+	case R_SPARC_LM22:
+	  *(unsigned int *) reloc_addr =
+	    ((*(unsigned int *)reloc_addr & 0xffc00000) |
+	     ((value >> 10) & 0x003fffff));
 	  break;
 
 	case R_SPARC_JMP_SLOT:
 	  elf_machine_fixup_plt(map, reloc, reloc_addr, value);
 	  break;
 
-	case R_SPARC_NONE:		/* Alright, Wilbur.  */
-	  break;
 	default:
 	  assert (! "unexpected dynamic reloc type");
 	  break;
@@ -239,16 +349,63 @@ elf_machine_lazy_rel (Elf64_Addr l_addr, const Elf64_Rela *reloc)
 static inline int
 elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
 {
-  Elf64_Addr *got;
-  extern void _dl_runtime_resolve (void);
-
   if (l->l_info[DT_JMPREL] && lazy)
     {
-      got = (Elf64_Addr *) (l->l_addr + l->l_info[DT_PLTGOT]->d_un.d_ptr);
-      /* This function will get called to fix up the GOT entry indicated by
-         the offset on the stack, and then jump to the resolved address.  */
-      got[1] = (Elf64_Addr) &_dl_runtime_resolve;
-      got[2] = (Elf64_Addr) l;  /* Identify this shared object.  */
+      extern void _dl_runtime_resolve_0 (void);
+      extern void _dl_runtime_resolve_1 (void);
+      extern void _dl_runtime_profile_0 (void);
+      extern void _dl_runtime_profile_1 (void);
+      Elf64_Addr res0_addr, res1_addr;
+      unsigned int *plt = (unsigned int *)
+	(l->l_addr + l->l_info[DT_PLTGOT]->d_un.d_ptr);
+
+      if (! profile)
+	{
+	  res0_addr = (Elf64_Addr) &_dl_runtime_resolve_0;
+	  res1_addr = (Elf64_Addr) &_dl_runtime_resolve_1;
+	}
+      else
+	{
+	  res0_addr = (Elf64_Addr) &_dl_runtime_profile_0;
+	  res1_addr = (Elf64_Addr) &_dl_runtime_profile_1;
+	  if (_dl_name_match_p (_dl_profile, l))
+	    _dl_profile_map = l;
+	}
+
+      /* PLT0 looks like:
+
+	 save	%sp, -192, %sp
+	 sethi	%hh(_dl_runtime_{resolve,profile}_0), %g3
+	 sethi	%lm(_dl_runtime_{resolve,profile}_0), %g4
+	 or	%g3, %hm(_dl_runtime_{resolve,profile}_0), %g3
+	 or	%g4, %lo(_dl_runtime_{resolve,profile}_0), %g4
+	 sllx	%g3, 32, %g3
+	 jmpl	%g3 + %g4, %o0
+	  nop
+
+	 PLT1 is similar except we jump to _dl_runtime_{resolve,profile}_1.  */
+
+      plt[0] = 0x9de3bf40;
+      plt[1] = 0x07000000 | (res0_addr >> (64 - 22));
+      plt[2] = 0x09000000 | ((res0_addr >> 10) & 0x003fffff);
+      plt[3] = 0x8610e000 | ((res0_addr >> 32) & 0x3ff);
+      plt[4] = 0x88112000 | (res0_addr & 0x3ff);
+      plt[5] = 0x8728f020;
+      plt[6] = 0x91c0c004;
+      plt[7] = 0x01000000;
+
+      plt[8 + 0] = 0x9de3bf40;
+      plt[8 + 1] = 0x07000000 | (res1_addr >> (64 - 22));
+      plt[8 + 2] = 0x09000000 | ((res1_addr >> 10) & 0x003fffff);
+      plt[8 + 3] = 0x8610e000 | ((res1_addr >> 32) & 0x3ff);
+      plt[8 + 4] = 0x88112000 | (res1_addr & 0x3ff);
+      plt[8 + 5] = 0x8728f020;
+      plt[8 + 6] = 0x91c0c004;
+      plt[8 + 7] = 0x01000000;
+
+      /* Now put the magic cookie at the beginning of .PLT2.
+	 Entry .PLT3 is unused by this implementation.  */
+      *((struct link_map **)(&plt[16 + 0])) = l;
     }
 
   return lazy;
@@ -256,23 +413,68 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
 
 /* This code is used in dl-runtime.c to call the `fixup' function
    and then redirect to the address it returns.  */
-#define ELF_MACHINE_RUNTIME_TRAMPOLINE asm ("\
-	.globl _dl_runtime_resolve
-	.type _dl_runtime_resolve, @function
-_dl_runtime_resolve:
-	save %sp, -160, %sp
-	mov %g1, %o0
-	call fixup
-	 mov %g2, %o1
-	jmp %o0
+#define TRAMPOLINE_TEMPLATE(tramp_name, fixup_name)	\
+  asm ("\
+	.text
+	.globl	" #tramp_name "_0
+	.type	" #tramp_name "_0, @function
+	.align	32
+" #tramp_name "_0:
+	ldx	[%o0 + 32 + 8], %l0
+	sethi	%hi(1048576), %g2
+	sub	%g1, %o0, %o0
+	xor	%g2, -20, %g2
+	sethi	%hi(5120), %g3
+	add	%o0, %g2, %o0
+	sethi	%hi(32768), %o2
+	udivx	%o0, %g3, %g3
+	sllx	%g3, 2, %g1
+	add	%g1, %g3, %g1
+	sllx	%g1, 10, %g2
+	sllx	%g1, 5, %g1
+	sub	%o0, %g2, %o0
+	udivx	%o0, 24, %o0
+	add	%o0, %o2, %o0
+	add	%g1, %o0, %g1
+	sllx	%g1, 1, %o1
+	mov	%l0, %o0
+	add	%o1, %g1, %o1
+	mov	%i7, %o2
+	call	" #fixup_name "
+	 sllx	%o1, 3, %o1
+	jmp	%o0
+	 restore
+	.size	" #tramp_name "_0, . - " #tramp_name "_0
+
+	.globl	" #tramp_name "_1
+	.type	" #tramp_name "_1, @function
+	.align	32
+" #tramp_name "_1:
+	srlx	%g1, 15, %o1
+	ldx	[%o0 + 8], %o0
+	sllx	%o1, 1, %o3
+	add	%o1, %o3, %o1
+	mov	%i7, %o2
+	call	" #fixup_name "
+	 sllx	%o1, 3, %o1
+	jmp	%o0
 	 restore
-	.size _dl_runtime_resolve, .-_dl_runtime_resolve
-");
+	.size	" #tramp_name "_1, . - " #tramp_name "_1
+	.previous");
+
+#ifndef PROF
+#define ELF_MACHINE_RUNTIME_TRAMPOLINE 			\
+  TRAMPOLINE_TEMPLATE (_dl_runtime_resolve, fixup);	\
+  TRAMPOLINE_TEMPLATE (_dl_runtime_profile, profile_fixup);
+#else
+#define ELF_MACHINE_RUNTIME_TRAMPOLINE			\
+  TRAMPOLINE_TEMPLATE (_dl_runtime_resolve, fixup);	\
+  TRAMPOLINE_TEMPLATE (_dl_runtime_profile, fixup);
+#endif
 
 /* The PLT uses Elf64_Rela relocs.  */
 #define elf_machine_relplt elf_machine_rela
 
-
 /* Initial entry point code for the dynamic linker.
    The C function `_dl_start' is the real entry point;
    its return value is the user program's entry point.  */
@@ -281,8 +483,10 @@ _dl_runtime_resolve:
 #define __S(x)	__S1(x)
 
 #define RTLD_START __asm__ ( "\
-	.global _start
-	.type _start, @function
+	.text
+	.global	_start
+	.type	_start, @function
+	.align	32
 _start:
    /* Make room for functions to drop their arguments on the stack.  */
 	sub	%sp, 6*8, %sp
@@ -292,8 +496,8 @@ _start:
 	/* FALLTHRU */
 	.size _start, .-_start
 
-	.global _dl_start_user
-	.type _dl_start_user, @function
+	.global	_dl_start_user
+	.type	_dl_start_user, @function
 _dl_start_user:
    /* Load the GOT register.  */
 1:	call	11f
@@ -302,6 +506,12 @@ _dl_start_user:
 	add	%l7,%o7,%l7
    /* Save the user entry point address in %l0.  */
 	mov	%o0,%l0
+  /* Store the highest stack address.  */
+	sethi	%hi(__libc_stack_end), %g2
+	or	%g2, %lo(__libc_stack_end), %g2
+	ldx	[%l7 + %g2], %l1
+	add	%sp, 6*8, %l2
+	stx	%l2, [%l1]
    /* See if we were run as a command with the executable file name as an
       extra leading argument.  If so, we must shift things around since we
       must keep the stack doubleword aligned.  */
@@ -338,7 +548,7 @@ _dl_start_user:
 	stx	%i4, [%i1+8]
 	brnz,pt	%i3, 13b
 	 add	%i1, 16, %i1
-   /* Load _dl_main_searchlist to pass to _dl_init_next.  */
+  /* Load searchlist of the main object to pass to _dl_init_next.  */
 2:	sethi	%hi(_dl_main_searchlist), %g2
 	or	%g2, %lo(_dl_main_searchlist), %g2
 	ldx	[%l7+%g2], %g2
@@ -349,8 +559,7 @@ _dl_start_user:
 	brz,pn	%o0, 4f
 	 nop
 	jmpl	%o0, %o7
-	 nop
-	ba,a	3b
+	 sub	%o7, 24, %o7
    /* Clear the startup flag.  */
 4:	sethi	%hi(_dl_starting_up), %g2
 	or	%g2, %lo(_dl_starting_up), %g2
@@ -360,7 +569,8 @@ _dl_start_user:
 	sethi	%hi(_dl_fini), %g1
 	or	%g1, %lo(_dl_fini), %g1
 	ldx	[%l7+%g1], %g1
-   /* Jump to the user's entry point & undo the allocation of the xtra regs.  */
+  /* Jump to the user's entry point and deallocate the extra stack we got.  */
 	jmp	%l0
 	 add	%sp, 6*8, %sp
-	.size _dl_start_user, .-_dl_start_user");
+	.size	_dl_start_user, . - _dl_start_user
+	.previous");
diff --git a/sysdeps/sparc/sparc64/elf/crtbegin.S b/sysdeps/sparc/sparc64/elf/crtbegin.S
index 318e7a6a3f..c2fe01f6b0 100644
--- a/sysdeps/sparc/sparc64/elf/crtbegin.S
+++ b/sysdeps/sparc/sparc64/elf/crtbegin.S
@@ -40,7 +40,7 @@ __DTOR_LIST__:
 	.align 4
 	.type __do_global_dtors_aux,#function
 __do_global_dtors_aux:
-	save	%sp,-160,%sp
+	save	%sp,-192,%sp
 
 #ifdef PIC
 1:	call	11f
@@ -53,7 +53,6 @@ __do_global_dtors_aux:
 #else
 	sethi	%hi(__DTOR_LIST__), %l0
 	or	%l0, %lo(__DTOR_LIST__), %l0
-	add	%l0, %g4, %l0
 #endif
 
 	ba	3f
diff --git a/sysdeps/sparc/sparc64/elf/crtend.S b/sysdeps/sparc/sparc64/elf/crtend.S
index 7a5c067b07..db0ac87ecd 100644
--- a/sysdeps/sparc/sparc64/elf/crtend.S
+++ b/sysdeps/sparc/sparc64/elf/crtend.S
@@ -40,7 +40,7 @@ __DTOR_END__:
 	.align 4
 	.type __do_global_ctors_aux,#function
 __do_global_ctors_aux:
-	save	%sp,-160,%sp
+	save	%sp,-192,%sp
 
 #ifdef PIC
 1:	call	11f
@@ -53,7 +53,6 @@ __do_global_ctors_aux:
 #else
 	sethi	%hi(__CTOR_END__), %l0
 	or	%l0, %lo(__CTOR_END__), %l0
-	add	%l0, %g4, %l0
 #endif
 
 	ba	3f
diff --git a/sysdeps/sparc/sparc64/elf/start.S b/sysdeps/sparc/sparc64/elf/start.S
index 140cd366e5..1fc24619cf 100644
--- a/sysdeps/sparc/sparc64/elf/start.S
+++ b/sysdeps/sparc/sparc64/elf/start.S
@@ -38,24 +38,12 @@ _start:
 	add	%sp, STACK_BIAS+23*8, %o2
 
   /* Load the addresses of the user entry points.  */
-        sethi   %uhi(main), %o0
-        sethi   %uhi(_init), %o3
-        sethi   %uhi(_fini), %o4
-        or      %o0, %ulo(main), %o0
-        or      %o3, %ulo(_init), %o3
-        or      %o4, %ulo(_fini), %o4
-        sethi   %hi(main), %i0
-        sethi   %hi(_init), %i3
-        sethi   %hi(_fini), %i4
-	sllx	%o0, 32, %o0
-        or      %i0, %lo(main), %i0
-	sllx	%o3, 32, %o3
-        or      %i3, %lo(_init), %i3
-	sllx	%o4, 32, %o4
-        or      %i4, %lo(_fini), %i4
-	or	%o0, %i0, %o0
-	or	%o3, %i3, %o3
-	or	%o4, %i4, %o4
+        sethi   %hi(main), %o0
+        sethi   %hi(_init), %o3
+        sethi   %hi(_fini), %o4
+        or      %o0, %lo(main), %o0
+        or      %o3, %lo(_init), %o3
+        or      %o4, %lo(_fini), %o4
 
   /* When starting a binary via the dynamic linker, %g1 contains the
      address of the shared library termination function, which will be
diff --git a/sysdeps/sparc/sparc64/lshift.S b/sysdeps/sparc/sparc64/lshift.S
index 1678991529..f211924ddb 100644
--- a/sysdeps/sparc/sparc64/lshift.S
+++ b/sysdeps/sparc/sparc64/lshift.S
@@ -28,7 +28,7 @@
    cnt		%i3  */
 
 ENTRY(__mpn_lshift)
-	save	%sp, -128, %sp
+	save	%sp, -192, %sp
 
 	sllx	%i2,3,%g1
 	add	%i1,%g1,%i1	! make %i1 point at end of src
diff --git a/sysdeps/sparc/sparc64/mul_1.S b/sysdeps/sparc/sparc64/mul_1.S
index 87e441d7eb..757856b4cd 100644
--- a/sysdeps/sparc/sparc64/mul_1.S
+++ b/sysdeps/sparc/sparc64/mul_1.S
@@ -31,7 +31,7 @@
 
 ENTRY(__mpn_mul_1)
 	!#PROLOGUE#	0
-	save	%sp,-128,%sp
+	save	%sp,-192,%sp
 	!#PROLOGUE#	1
 
 	sub	%g0,%i2,%o7
diff --git a/sysdeps/sparc/sparc64/rshift.S b/sysdeps/sparc/sparc64/rshift.S
index e1b3aca112..51eb4af3ab 100644
--- a/sysdeps/sparc/sparc64/rshift.S
+++ b/sysdeps/sparc/sparc64/rshift.S
@@ -28,7 +28,7 @@
    cnt		%i3  */
 
 ENTRY(__mpn_rshift)
-	save	%sp, -128, %sp
+	save	%sp, -192, %sp
 
 	ldx	[%i1],%g2	! load first limb
 	sub	%g0,%i3,%i5	! negate shift count
diff --git a/sysdeps/sparc/sparc64/submul_1.S b/sysdeps/sparc/sparc64/submul_1.S
index 3cc0e3bb08..ce9a80464f 100644
--- a/sysdeps/sparc/sparc64/submul_1.S
+++ b/sysdeps/sparc/sparc64/submul_1.S
@@ -31,7 +31,7 @@
 
 ENTRY(__mpn_submul_1)
 	!#PROLOGUE#	0
-	save	%sp,-128,%sp
+	save	%sp,-192,%sp
 	!#PROLOGUE#	1
 
 	sub	%g0,%i2,%o7