diff options
Diffstat (limited to 'linuxthreads')
-rw-r--r-- | linuxthreads/ChangeLog | 70
-rw-r--r-- | linuxthreads/internals.h | 5
-rw-r--r-- | linuxthreads/manager.c | 165
-rw-r--r-- | linuxthreads/pthread.c | 29
-rw-r--r-- | linuxthreads/sysdeps/ia64/pt-machine.h | 106
5 files changed, 352 insertions, 23 deletions
diff --git a/linuxthreads/ChangeLog b/linuxthreads/ChangeLog index e205a2e3ec..10abc40329 100644 --- a/linuxthreads/ChangeLog +++ b/linuxthreads/ChangeLog @@ -1,3 +1,73 @@ +2000-05-05 H.J. Lu <hjl@gnu.org> + + * sysdeps/ia64/pt-machine.h (__compare_and_swap): Change it to + have acquire semantics. + (__compare_and_swap_with_release_semantics): New inline + function. + (HAS_COMPARE_AND_SWAP_WITH_RELEASE_SEMANTICS): New macro. + +2000-01-28 Hans Boehm <hboehm@exch.hpl.hp.com> + + * manager.c: Fix the problem with signals at startup. + Change the way that thread stacks are allocated on IA64. + Clean up some of the guard page allocation stuff. + +1999-12-19 H.J. Lu <hjl@gnu.org> + + * internals.h (page_roundup): New. + * attr.c (__pthread_attr_setguardsize); Use page_roundup + instead of roundup. + * manager.c (pthread_allocate_stack): Make sure guardaddr is + page aligned with page_roundup if NEED_SEPARATE_REGISTER_STACK + is define. + +1999-12-17 Hans Boehm <hboehm@exch.hpl.hp.com> + + * manager.c (pthread_allocate_stack): Unmap the stack top + if failed to map the stack bottom. + Fix the guard page. + (pthread_free): Fix the guard page. + + * pthread.c (pthread_initialize): Set rlimit correctly for + NEED_SEPARATE_REGISTER_STACK. + +1999-12-16 H.J. Lu <hjl@gnu.org> + + * pthread.c (__pthread_initialize_manager): Pass + __pthread_manager_thread_bos instead of + __pthread_manager_thread_tos to __clone2. + +1999-12-16 H.J. Lu <hjl@gnu.org> + + * manager.c (pthread_allocate_stack): Correct the calculation + of "new_thread_bottom". Remove MAP_GROWSDOWN from mmap for + stack bottom. + +1999-12-13 H.J. Lu <hjl@gnu.org> + + * sysdeps/ia64/pt-machine.h (__compare_and_swap): Added a stop + bit after setting ar.ccv. + +1999-12-12 H.J. Lu <hjl@gnu.org> + + * manager.c (pthread_allocate_stack): Make the starting + address of the stack bottom page aligned. FIXME: it may + need changes in other places. + (pthread_handle_create): Likewise. 
+ +1999-12-11 Hans Boehm <hboehm@exch.hpl.hp.com> + + * manager.c (pthread_allocate_stack): Handle + NEED_SEPARATE_REGISTER_STACK. + (pthread_handle_create): Likewise. + * pthread.c (__pthread_initialize_manager): Likewise. + + * sysdeps/ia64/pt-machine.h: Use r13 for thread pointer. + +1999-12-02 H.J. Lu <hjl@gnu.org> + + * sysdeps/ia64/pt-machine.h: New. + 2000-07-13 Ulrich Drepper <drepper@redhat.com> * wrapsyscall.c: Mark non-__ protected names as weak. diff --git a/linuxthreads/internals.h b/linuxthreads/internals.h index e3fbf8c521..118eecfff0 100644 --- a/linuxthreads/internals.h +++ b/linuxthreads/internals.h @@ -311,6 +311,11 @@ static inline int nonexisting_handle(pthread_handle h, pthread_t id) /* Fill in defaults left unspecified by pt-machine.h. */ +/* We round up a value with page size. */ +#ifndef page_roundup +#define page_roundup(v,p) ((((size_t) (v)) + (p) - 1) & ~((p) - 1)) +#endif + /* The page size we can get from the system. This should likely not be changed by the machine file but, you never know. */ #ifndef PAGE_SIZE diff --git a/linuxthreads/manager.c b/linuxthreads/manager.c index 0ca172c8dd..76ef6cf9fb 100644 --- a/linuxthreads/manager.c +++ b/linuxthreads/manager.c @@ -82,6 +82,13 @@ static int main_thread_exiting = 0; static pthread_t pthread_threads_counter = 0; +#ifdef NEED_SEPARATE_REGISTER_STACK +/* Signal masks for the manager. These have to be global only when clone2 + is used since it's currently borken wrt signals in the child. */ +static sigset_t manager_mask; /* Manager normal signal mask */ +static sigset_t manager_mask_all; /* All bits set. 
*/ +#endif + /* Forward declarations */ static int pthread_handle_create(pthread_t *thread, const pthread_attr_t *attr, @@ -100,7 +107,9 @@ int __pthread_manager(void *arg) { int reqfd = (int) (long int) arg; struct pollfd ufd; - sigset_t mask; +#ifndef NEED_SEPARATE_REGISTER_STACK + sigset_t manager_mask; +#endif int n; struct pthread_request request; @@ -112,12 +121,15 @@ int __pthread_manager(void *arg) __pthread_manager_thread.p_errnop = &__pthread_manager_thread.p_errno; __pthread_manager_thread.p_h_errnop = &__pthread_manager_thread.p_h_errno; /* Block all signals except __pthread_sig_cancel and SIGTRAP */ - sigfillset(&mask); - sigdelset(&mask, __pthread_sig_cancel); /* for thread termination */ - sigdelset(&mask, SIGTRAP); /* for debugging purposes */ + sigfillset(&manager_mask); + sigdelset(&manager_mask, __pthread_sig_cancel); /* for thread termination */ + sigdelset(&manager_mask, SIGTRAP); /* for debugging purposes */ if (__pthread_threads_debug && __pthread_sig_debug > 0) - sigdelset(&mask, __pthread_sig_debug); - sigprocmask(SIG_SETMASK, &mask, NULL); + sigdelset(&manager_mask, __pthread_sig_debug); + sigprocmask(SIG_SETMASK, &manager_mask, NULL); +#ifdef NEED_SEPARATE_REGISTER_STACK + sigfillset(&manager_mask_all); +#endif /* Raise our priority to match that of main thread */ __pthread_manager_adjust_prio(__pthread_main_thread->p_priority); /* Synchronize debugging of the thread manager */ @@ -294,7 +306,16 @@ static int pthread_allocate_stack(const pthread_attr_t *attr, if (attr != NULL && attr->__stackaddr_set) { - /* The user provided a stack. */ + /* The user provided a stack. For now we interpret the supplied + address as 1 + the highest addr. in the stack segment. If a + separate register stack is needed, we place it at the low end + of the segment, relying on the associated stacksize to + determine the low end of the segment. This differs from many + (but not all) other pthreads implementations. 
The intent is + that on machines with a single stack growing toward higher + addresses, stackaddr would be the lowest address in the stack + segment, so that it is consistently close to the initial sp + value. */ new_thread = (pthread_descr) ((long)(attr->__stackaddr) & -sizeof(void *)) - 1; new_thread_bottom = (char *) attr->__stackaddr - attr->__stacksize; @@ -304,11 +325,57 @@ static int pthread_allocate_stack(const pthread_attr_t *attr, } else { - stacksize = STACK_SIZE - pagesize; - if (attr != NULL) - stacksize = MIN (stacksize, roundup(attr->__stacksize, pagesize)); +#ifdef NEED_SEPARATE_REGISTER_STACK + size_t granularity = 2 * pagesize; + /* Try to make stacksize/2 a multiple of pagesize */ +#else + size_t granularity = pagesize; +#endif /* Allocate space for stack and thread descriptor at default address */ + if (attr != NULL) + { + guardsize = page_roundup (attr->__guardsize, granularity); + stacksize = STACK_SIZE - guardsize; + stacksize = MIN (stacksize, + page_roundup (attr->__stacksize, granularity)); + } + else + { + guardsize = granularity; + stacksize = STACK_SIZE - granularity; + } new_thread = default_new_thread; +#ifdef NEED_SEPARATE_REGISTER_STACK + new_thread_bottom = (char *) (new_thread + 1) - stacksize - guardsize; + /* Includes guard area, unlike the normal case. Use the bottom + end of the segment as backing store for the register stack. + Needed on IA64. In this case, we also map the entire stack at + once. According to David Mosberger, that's cheaper. It also + avoids the risk of intermittent failures due to other mappings + in the same region. The cost is that we might be able to map + slightly fewer stacks. */ + + /* First the main stack: */ + if (mmap((caddr_t)((char *)(new_thread + 1) - stacksize / 2), + stacksize / 2, PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0) + == MAP_FAILED) + /* Bad luck, this segment is already mapped. 
*/ + return -1; + /* Then the register stack: */ + if (mmap((caddr_t)new_thread_bottom, stacksize/2, + PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0) + == MAP_FAILED) + { + munmap((caddr_t)((char *)(new_thread + 1) - stacksize/2), + stacksize/2); + return -1; + } + + guardaddr = new_thread_bottom + stacksize/2; + /* We leave the guard area in the middle unmapped. */ +#else /* !NEED_SEPARATE_REGISTER_STACK */ new_thread_bottom = (char *) (new_thread + 1) - stacksize; if (mmap((caddr_t)((char *)(new_thread + 1) - INITIAL_STACK_SIZE), INITIAL_STACK_SIZE, PROT_READ | PROT_WRITE | PROT_EXEC, @@ -317,10 +384,10 @@ static int pthread_allocate_stack(const pthread_attr_t *attr, /* Bad luck, this segment is already mapped. */ return -1; /* We manage to get a stack. Now see whether we need a guard - and allocate it if necessary. Notice that the default - attributes (stack_size = STACK_SIZE - pagesize) do not need - a guard page, since the RLIMIT_STACK soft limit prevents stacks - from running into one another. */ + and allocate it if necessary. Notice that the default + attributes (stack_size = STACK_SIZE - pagesize and guardsize + = pagesize) do not need a guard page, since the RLIMIT_STACK + soft limit prevents stacks from running into one another. */ if (stacksize == STACK_SIZE - pagesize) { /* We don't need a guard page. */ @@ -330,7 +397,6 @@ static int pthread_allocate_stack(const pthread_attr_t *attr, else { /* Put a bad page at the bottom of the stack */ - guardsize = attr->__guardsize; guardaddr = (void *)new_thread_bottom - guardsize; if (mmap ((caddr_t) guardaddr, guardsize, 0, MAP_FIXED, -1, 0) == MAP_FAILED) @@ -340,6 +406,7 @@ static int pthread_allocate_stack(const pthread_attr_t *attr, guardsize = 0; } } +#endif /* !NEED_SEPARATE_REGISTER_STACK */ } /* Clear the thread data structure. 
*/ memset (new_thread, '\0', sizeof (*new_thread)); @@ -452,9 +519,30 @@ static int pthread_handle_create(pthread_t *thread, const pthread_attr_t *attr, __pthread_lock(new_thread->p_lock, NULL); /* We have to report this event. */ +#ifdef NEED_SEPARATE_REGISTER_STACK + /* Perhaps this version should be used on all platforms. But + this requires that __clone2 be uniformly supported + everywhere. + + And there is some argument for changing the __clone2 + interface to pass sp and bsp instead, making it more IA64 + specific, but allowing stacks to grow outward from each + other, to get less paging and fewer mmaps. Clone2 + currently can't take signals in the child right after + process creation. Mask them in the child. It resets the + mask once it starts up. */ + sigprocmask(SIG_SETMASK, &manager_mask_all, NULL); + pid = __clone2(pthread_start_thread_event, + (void **)new_thread_bottom, + (char *)new_thread - new_thread_bottom, + CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND | + __pthread_sig_cancel, new_thread); + sigprocmask(SIG_SETMASK, &manager_mask, NULL); +#else pid = __clone(pthread_start_thread_event, (void **) new_thread, CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND | __pthread_sig_cancel, new_thread); +#endif if (pid != -1) { /* Now fill in the information about the new thread in @@ -479,18 +567,38 @@ static int pthread_handle_create(pthread_t *thread, const pthread_attr_t *attr, } } if (pid == 0) - pid = __clone(pthread_start_thread, (void **) new_thread, - CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND | - __pthread_sig_cancel, new_thread); + { +#ifdef NEED_SEPARATE_REGISTER_STACK + sigprocmask(SIG_SETMASK, &manager_mask_all, NULL); + pid = __clone2(pthread_start_thread, + (void **)new_thread_bottom, + (char *)new_thread - new_thread_bottom, + CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND | + __pthread_sig_cancel, new_thread); + sigprocmask(SIG_SETMASK, &manager_mask, NULL); +#else + pid = __clone(pthread_start_thread, (void **) new_thread, 
+ CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND | + __pthread_sig_cancel, new_thread); +#endif /* !NEED_SEPARATE_REGISTER_STACK */ + } /* Check if cloning succeeded */ if (pid == -1) { /* Free the stack if we allocated it */ if (attr == NULL || !attr->__stackaddr_set) { +#ifdef NEED_SEPARATE_REGISTER_STACK + size_t stacksize = ((char *)(new_thread->p_guardaddr) + - new_thread_bottom); + munmap((caddr_t)new_thread_bottom, stacksize); + munmap((caddr_t)new_thread_bottom + stacksize + + new_thread->p_guardsize, stacksize); +#else if (new_thread->p_guardsize != 0) munmap(new_thread->p_guardaddr, new_thread->p_guardsize); munmap((caddr_t)((char *)(new_thread+1) - INITIAL_STACK_SIZE), INITIAL_STACK_SIZE); +#endif } __pthread_handles[sseg].h_descr = NULL; __pthread_handles[sseg].h_bottom = NULL; @@ -550,10 +658,27 @@ static void pthread_free(pthread_descr th) if (th == &__pthread_initial_thread) return; if (!th->p_userstack) { + size_t guardsize = th->p_guardsize; /* Free the stack and thread descriptor area */ - if (th->p_guardsize != 0) - munmap(th->p_guardaddr, th->p_guardsize); +#ifdef NEED_SEPARATE_REGISTER_STACK + char *guardaddr = th->p_guardaddr; + /* We unmap exactly what we mapped, in case there was something + else in the same region. Guardaddr is always set, eve if + guardsize is 0. This allows us to compute everything else. */ + size_t stacksize = (char *)(th+1) - guardaddr - guardsize; + /* Unmap the register stack, which is below guardaddr. */ + munmap((caddr_t)(guardaddr-stacksize), stacksize); + /* Unmap the main stack. */ + munmap((caddr_t)(guardaddr+guardsize), stacksize); +#else + /* The following assumes that we only allocate stacks of one + size. That's currently true but probably shouldn't be. This + looks like it fails for growing stacks if there was something + else mapped just below the stack? 
*/ + if (guardsize != 0) + munmap(th->p_guardaddr, guardsize); munmap((caddr_t) ((char *)(th+1) - STACK_SIZE), STACK_SIZE); +#endif } } diff --git a/linuxthreads/pthread.c b/linuxthreads/pthread.c index 2700a29fb1..d70e3f4b1f 100644 --- a/linuxthreads/pthread.c +++ b/linuxthreads/pthread.c @@ -362,7 +362,13 @@ static void pthread_initialize(void) /* Play with the stack size limit to make sure that no stack ever grows beyond STACK_SIZE minus one page (to act as a guard page). */ getrlimit(RLIMIT_STACK, &limit); +#ifdef NEED_SEPARATE_REGISTER_STACK + /* STACK_SIZE bytes hold both the main stack and register backing + store. The rlimit value applies to each individually. */ + max_stack = STACK_SIZE/2 - __getpagesize(); +#else max_stack = STACK_SIZE - __getpagesize(); +#endif if (limit.rlim_cur > max_stack) { limit.rlim_cur = max_stack; setrlimit(RLIMIT_STACK, &limit); @@ -444,10 +450,18 @@ int __pthread_initialize_manager(void) | __pthread_initial_thread.p_eventbuf.eventmask.event_bits[idx])) != 0) { +#ifdef NEED_SEPARATE_REGISTER_STACK + pid = __clone2(__pthread_manager_event, + (void **) __pthread_manager_thread_bos, + THREAD_MANAGER_STACK_SIZE, + CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND, + (void *)(long)manager_pipe[0]); +#else pid = __clone(__pthread_manager_event, (void **) __pthread_manager_thread_tos, CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND, (void *)(long)manager_pipe[0]); +#endif if (pid != -1) { @@ -472,9 +486,18 @@ int __pthread_initialize_manager(void) } if (pid == 0) - pid = __clone(__pthread_manager, (void **) __pthread_manager_thread_tos, - CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND, - (void *)(long)manager_pipe[0]); + { +#ifdef NEED_SEPARATE_REGISTER_STACK + pid = __clone2(__pthread_manager, (void **) __pthread_manager_thread_bos, + THREAD_MANAGER_STACK_SIZE, + CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND, + (void *)(long)manager_pipe[0]); +#else + pid = __clone(__pthread_manager, (void **) __pthread_manager_thread_tos, + 
CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND, + (void *)(long)manager_pipe[0]); +#endif + } if (pid == -1) { free(__pthread_manager_thread_bos); __libc_close(manager_pipe[0]); diff --git a/linuxthreads/sysdeps/ia64/pt-machine.h b/linuxthreads/sysdeps/ia64/pt-machine.h new file mode 100644 index 0000000000..58cccc2962 --- /dev/null +++ b/linuxthreads/sysdeps/ia64/pt-machine.h @@ -0,0 +1,106 @@ +/* Machine-dependent pthreads configuration and inline functions. + IA-64 version. + Copyright (C) 1999, 2000 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. + + You should have received a copy of the GNU Library General Public + License along with the GNU C Library; see the file COPYING.LIB. If not, + write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. */ + +#ifndef PT_EI +# define PT_EI extern inline +#endif + +/* Make sure gcc doesn't try to be clever and move things around on + us. We need to use _exactly_ the address the user gave us, not some + alias that contains the same information. */ +#define __atomic_fool_gcc(x) (*(volatile struct { int a[100]; } *)x) + +#ifndef ELF_MACHINE_NAME + +#define NEED_SEPARATE_REGISTER_STACK + +/* Get some notion of the current stack. Need not be exactly the top + of the stack, just something somewhere in the current frame. + r12 (sp) is the stack pointer. 
*/ +#define CURRENT_STACK_FRAME stack_pointer +register char *stack_pointer __asm__ ("sp"); + + +/* Register r13 (tp) is reserved by the ABI as "thread pointer". */ +struct _pthread_descr_struct; +register struct _pthread_descr_struct *__thread_self __asm__("r13"); + +/* Return the thread descriptor for the current thread. */ +#define THREAD_SELF __thread_self + +/* Initialize the thread-unique value. */ +#define INIT_THREAD_SELF(descr, nr) (__thread_self = (descr)) + + +/* Access to data in the thread descriptor is easy. */ +#define THREAD_GETMEM(descr, member) __thread_self->member +#define THREAD_GETMEM_NC(descr, member) __thread_self->member +#define THREAD_SETMEM(descr, member, value) __thread_self->member = (value) +#define THREAD_SETMEM_NC(descr, member, value) __thread_self->member = (value) + + +#define HAS_COMPARE_AND_SWAP_WITH_RELEASE_SEMANTICS + +PT_EI long int +__compare_and_swap (long int *p, long int oldval, long int newval) +{ + long int readval; + + __asm__ __volatile__ + ("mov ar.ccv=%4;;\n\t" + "cmpxchg8.acq %0=%1,%2,ar.ccv" + : "=r" (readval), "=m" (__atomic_fool_gcc (p)) + : "r"(newval), "1" (__atomic_fool_gcc (p)), "r" (oldval) + : "memory"); + return readval == oldval; +} + +PT_EI long int +__compare_and_swap_with_release_semantics (long int *p, + long int oldval, + long int newval) +{ + long int readval; + + __asm__ __volatile__ + ("mov ar.ccv=%4;;\n\t" + "cmpxchg8.rel %0=%1,%2,ar.ccv" + : "=r" (readval), "=m" (__atomic_fool_gcc (p)) + : "r"(newval), "1" (__atomic_fool_gcc (p)), "r" (oldval) + : "memory"); + return readval == oldval; +} + +#endif /* ELF_MACHINE_NAME */ + +/* Spinlock implementation; required. */ +PT_EI long int +testandset (int *spinlock) +{ + long int ret; + + __asm__ __volatile__( + "xchg4 %0=%1,%2" + : "=r"(ret), "=m"(__atomic_fool_gcc (spinlock)) + : "r"(1), "1"(__atomic_fool_gcc (spinlock)) + : "memory"); + + return ret; +} |