Diffstat (limited to 'nptl/pthread_cond_wait.c')
-rw-r--r-- | nptl/pthread_cond_wait.c | 754
1 files changed, 595 insertions, 159 deletions
diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c index 3f62acc6bd..2b434026c6 100644 --- a/nptl/pthread_cond_wait.c +++ b/nptl/pthread_cond_wait.c @@ -19,219 +19,655 @@ #include <endian.h> #include <errno.h> #include <sysdep.h> -#include <lowlevellock.h> +#include <futex-internal.h> #include <pthread.h> #include <pthreadP.h> -#include <kernel-features.h> +#include <sys/time.h> +#include <atomic.h> +#include <stdint.h> +#include <stdbool.h> #include <shlib-compat.h> #include <stap-probe.h> +#include <time.h> + +#include "pthread_cond_common.c" + struct _condvar_cleanup_buffer { - int oldtype; + uint64_t wseq; pthread_cond_t *cond; pthread_mutex_t *mutex; - unsigned int bc_seq; + int private; }; -void -__attribute__ ((visibility ("hidden"))) -__condvar_cleanup (void *arg) +/* Decrease the waiter reference count. */ +static void +__condvar_confirm_wakeup (pthread_cond_t *cond, int private) { - struct _condvar_cleanup_buffer *cbuffer = - (struct _condvar_cleanup_buffer *) arg; - unsigned int destroying; - int pshared = (cbuffer->cond->__data.__mutex == (void *) ~0l) - ? LLL_SHARED : LLL_PRIVATE; + /* If destruction is pending (i.e., the wake-request flag is nonzero) and we + are the last waiter (prior value of __wrefs was 1 << 3), then wake any + threads waiting in pthread_cond_destroy. Release MO to synchronize with + these threads. Don't bother clearing the wake-up request flag. */ + if ((atomic_fetch_add_release (&cond->__data.__wrefs, -8) >> 2) == 3) + futex_wake (&cond->__data.__wrefs, INT_MAX, private); +} + - /* We are going to modify shared data. */ - lll_lock (cbuffer->cond->__data.__lock, pshared); +/* Cancel waiting after having registered as a waiter previously. SEQ is our + position and G is our group index. + The goal of cancellation is to make our group smaller if that is still + possible. If we are in a closed group, this is not possible anymore; in + this case, we need to send a replacement signal for the one we effectively + consumed because the signal should have gotten consumed by another waiter + instead; we must not both cancel waiting and consume a signal. + + Must not be called while still holding a reference on the group. + + Returns true iff we consumed a signal. + + On some kind of timeouts, we may be able to pretend that a signal we + effectively consumed happened before the timeout (i.e., similarly to first + spinning on signals before actually checking whether the timeout has + passed already). Doing this would allow us to skip sending a replacement + signal, but this case might happen rarely because the end of the timeout + must race with someone else sending a signal. Therefore, we don't bother + trying to optimize this. */ +static void +__condvar_cancel_waiting (pthread_cond_t *cond, uint64_t seq, unsigned int g, + int private) +{ + bool consumed_signal = false; - if (cbuffer->bc_seq == cbuffer->cond->__data.__broadcast_seq) + /* No deadlock with group switching is possible here because we have do + not hold a reference on the group. */ + __condvar_acquire_lock (cond, private); + + uint64_t g1_start = __condvar_load_g1_start_relaxed (cond) >> 1; + if (g1_start > seq) + { + /* Our group is closed, so someone provided enough signals for it. + Thus, we effectively consumed a signal. */ + consumed_signal = true; + } + else { - /* This thread is not waiting anymore. Adjust the sequence counters - appropriately. We do not increment WAKEUP_SEQ if this would - bump it over the value of TOTAL_SEQ. 
This can happen if a thread - was woken and then canceled. */ - if (cbuffer->cond->__data.__wakeup_seq - < cbuffer->cond->__data.__total_seq) + if (g1_start + __condvar_get_orig_size (cond) <= seq) + { + /* We are in the current G2 and thus cannot have consumed a signal. + Reduce its effective size or handle overflow. Remember that in + G2, unsigned int size is zero or a negative value. */ + if (cond->__data.__g_size[g] + __PTHREAD_COND_MAX_GROUP_SIZE > 0) + { + cond->__data.__g_size[g]--; + } + else + { + /* Cancellations would overflow the maximum group size. Just + wake up everyone spuriously to create a clean state. This + also means we do not consume a signal someone else sent. */ + __condvar_release_lock (cond, private); + __pthread_cond_broadcast (cond); + return; + } + } + else { - ++cbuffer->cond->__data.__wakeup_seq; - ++cbuffer->cond->__data.__futex; + /* We are in current G1. If the group's size is zero, someone put + a signal in the group that nobody else but us can consume. */ + if (cond->__data.__g_size[g] == 0) + consumed_signal = true; + else + { + /* Otherwise, we decrease the size of the group. This is + equivalent to atomically putting in a signal just for us and + consuming it right away. We do not consume a signal sent + by someone else. We also cannot have consumed a futex + wake-up because if we were cancelled or timed out in a futex + call, the futex will wake another waiter. */ + cond->__data.__g_size[g]--; + } } - ++cbuffer->cond->__data.__woken_seq; } - cbuffer->cond->__data.__nwaiters -= 1 << COND_NWAITERS_SHIFT; + __condvar_release_lock (cond, private); - /* If pthread_cond_destroy was called on this variable already, - notify the pthread_cond_destroy caller all waiters have left - and it can be successfully destroyed. */ - destroying = 0; - if (cbuffer->cond->__data.__total_seq == -1ULL - && cbuffer->cond->__data.__nwaiters < (1 << COND_NWAITERS_SHIFT)) + if (consumed_signal) { - lll_futex_wake (&cbuffer->cond->__data.__nwaiters, 1, pshared); - destroying = 1; + /* We effectively consumed a signal even though we didn't want to. + Therefore, we need to send a replacement signal. + If we would want to optimize this, we could do what + pthread_cond_signal does right in the critical section above. */ + __pthread_cond_signal (cond); } +} - /* We are done. */ - lll_unlock (cbuffer->cond->__data.__lock, pshared); - - /* Wake everybody to make sure no condvar signal gets lost. */ - if (! destroying) - lll_futex_wake (&cbuffer->cond->__data.__futex, INT_MAX, pshared); - - /* Get the mutex before returning unless asynchronous cancellation - is in effect. We don't try to get the mutex if we already own it. */ - if (!(USE_REQUEUE_PI (cbuffer->mutex)) - || ((cbuffer->mutex->__data.__lock & FUTEX_TID_MASK) - != THREAD_GETMEM (THREAD_SELF, tid))) - { - __pthread_mutex_cond_lock (cbuffer->mutex); - } - else - __pthread_mutex_cond_lock_adjust (cbuffer->mutex); +/* Wake up any signalers that might be waiting. */ +static void +__condvar_dec_grefs (pthread_cond_t *cond, unsigned int g, int private) +{ + /* Release MO to synchronize-with the acquire load in + __condvar_quiesce_and_switch_g1. */ + if (atomic_fetch_add_release (cond->__data.__g_refs + g, -2) == 3) + { + /* Clear the wake-up request flag before waking up. We do not need more + than relaxed MO and it doesn't matter if we apply this for an aliased + group because we wake all futex waiters right after clearing the + flag. 
*/ + atomic_fetch_and_relaxed (cond->__data.__g_refs + g, ~(unsigned int) 1); + futex_wake (cond->__data.__g_refs + g, INT_MAX, private); + } } +/* Clean-up for cancellation of waiters waiting for normal signals. We cancel + our registration as a waiter, confirm we have woken up, and re-acquire the + mutex. */ +static void +__condvar_cleanup_waiting (void *arg) +{ + struct _condvar_cleanup_buffer *cbuffer = + (struct _condvar_cleanup_buffer *) arg; + pthread_cond_t *cond = cbuffer->cond; + unsigned g = cbuffer->wseq & 1; -int -__pthread_cond_wait (pthread_cond_t *cond, pthread_mutex_t *mutex) + __condvar_dec_grefs (cond, g, cbuffer->private); + + __condvar_cancel_waiting (cond, cbuffer->wseq >> 1, g, cbuffer->private); + /* FIXME With the current cancellation implementation, it is possible that + a thread is cancelled after it has returned from a syscall. This could + result in a cancelled waiter consuming a futex wake-up that is then + causing another waiter in the same group to not wake up. To work around + this issue until we have fixed cancellation, just add a futex wake-up + conservatively. */ + futex_wake (cond->__data.__g_signals + g, 1, cbuffer->private); + + __condvar_confirm_wakeup (cond, cbuffer->private); + + /* XXX If locking the mutex fails, should we just stop execution? This + might be better than silently ignoring the error. */ + __pthread_mutex_cond_lock (cbuffer->mutex); +} + +/* This condvar implementation guarantees that all calls to signal and + broadcast and all of the three virtually atomic parts of each call to wait + (i.e., (1) releasing the mutex and blocking, (2) unblocking, and (3) re- + acquiring the mutex) happen in some total order that is consistent with the + happens-before relations in the calling program. However, this order does + not necessarily result in additional happens-before relations being + established (which aligns well with spurious wake-ups being allowed). + + All waiters acquire a certain position in a 64b waiter sequence (__wseq). + This sequence determines which waiters are allowed to consume signals. + A broadcast is equal to sending as many signals as are unblocked waiters. + When a signal arrives, it samples the current value of __wseq with a + relaxed-MO load (i.e., the position the next waiter would get). (This is + sufficient because it is consistent with happens-before; the caller can + enforce stronger ordering constraints by calling signal while holding the + mutex.) Only waiters with a position less than the __wseq value observed + by the signal are eligible to consume this signal. + + This would be straight-forward to implement if waiters would just spin but + we need to let them block using futexes. Futexes give no guarantee of + waking in FIFO order, so we cannot reliably wake eligible waiters if we + just use a single futex. Also, futex words are 32b in size, but we need + to distinguish more than 1<<32 states because we need to represent the + order of wake-up (and thus which waiters are eligible to consume signals); + blocking in a futex is not atomic with a waiter determining its position in + the waiter sequence, so we need the futex word to reliably notify waiters + that they should not attempt to block anymore because they have been + already signaled in the meantime. While an ABA issue on a 32b value will + be rare, ignoring it when we are aware of it is not the right thing to do + either. 
+ + Therefore, we use a 64b counter to represent the waiter sequence (on + architectures which only support 32b atomics, we use a few bits less). + To deal with the blocking using futexes, we maintain two groups of waiters: + * Group G1 consists of waiters that are all eligible to consume signals; + incoming signals will always signal waiters in this group until all + waiters in G1 have been signaled. + * Group G2 consists of waiters that arrive when a G1 is present and still + contains waiters that have not been signaled. When all waiters in G1 + are signaled and a new signal arrives, the new signal will convert G2 + into the new G1 and create a new G2 for future waiters. + + We cannot allocate new memory because of process-shared condvars, so we + have just two slots of groups that change their role between G1 and G2. + Each has a separate futex word, a number of signals available for + consumption, a size (number of waiters in the group that have not been + signaled), and a reference count. + + The group reference count is used to maintain the number of waiters that + are using the group's futex. Before a group can change its role, the + reference count must show that no waiters are using the futex anymore; this + prevents ABA issues on the futex word. + + To represent which intervals in the waiter sequence the groups cover (and + thus also which group slot contains G1 or G2), we use a 64b counter to + designate the start position of G1 (inclusive), and a single bit in the + waiter sequence counter to represent which group slot currently contains + G2. This allows us to switch group roles atomically wrt. waiters obtaining + a position in the waiter sequence. The G1 start position allows waiters to + figure out whether they are in a group that has already been completely + signaled (i.e., if the current G1 starts at a later position than the + waiter's position). Waiters cannot determine whether they are currently + in G2 or G1 -- but they do not have to because all they are interested in + is whether there are available signals, and they always start in G2 (whose + group slot they know because of the bit in the waiter sequence). Signalers + will simply fill the right group until it is completely signaled and can + be closed (they do not switch group roles until they really have to, to + decrease the likelihood of having to wait for waiters still holding a + reference on the now-closed G1). + + Signalers maintain the initial size of G1 to be able to determine where + G2 starts (G2 is always open-ended until it becomes G1). They track the + remaining size of a group; when waiters cancel waiting (due to PThreads + cancellation or timeouts), they will decrease this remaining size as well. + + To implement condvar destruction requirements (i.e., that + pthread_cond_destroy can be called as soon as all waiters have been + signaled), waiters increment a reference count before starting to wait and + decrement it after they stopped waiting but right before they acquire the + mutex associated with the condvar. + + pthread_cond_t thus consists of the following (bits that are used for + flags and are not part of the primary value of each field but necessary + to make some things atomic or because there was no space for them + elsewhere in the data structure): + + __wseq: Waiter sequence counter + * LSB is index of current G2. + * Waiters fetch-add while having acquired the mutex associated with the + condvar. Signalers load it and fetch-xor it concurrently.
+ __g1_start: Starting position of G1 (inclusive) + * LSB is index of current G2. + * Modified by signalers while having acquired the condvar-internal lock + and observed concurrently by waiters. + __g1_orig_size: Initial size of G1 + * The two least-significant bits represent the condvar-internal lock. + * Only accessed while having acquired the condvar-internal lock. + __wrefs: Waiter reference counter. + * Bit 2 is true if waiters should run futex_wake when they remove the + last reference. pthread_cond_destroy uses this as futex word. + * Bit 1 is the clock ID (0 == CLOCK_REALTIME, 1 == CLOCK_MONOTONIC). + * Bit 0 is true iff this is a process-shared condvar. + * Simple reference count used by both waiters and pthread_cond_destroy. + (If the format of __wrefs is changed, update nptl_lock_constants.pysym + and the pretty printers.) + For each of the two groups, we have: + __g_refs: Futex waiter reference count. + * LSB is true if waiters should run futex_wake when they remove the + last reference. + * Reference count used by waiters concurrently with signalers that have + acquired the condvar-internal lock. + __g_signals: The number of signals that can still be consumed. + * Used as a futex word by waiters. Used concurrently by waiters and + signalers. + * LSB is true iff this group has been completely signaled (i.e., it is + closed). + __g_size: Waiters remaining in this group (i.e., which have not been + signaled yet). + * Accessed by signalers and waiters that cancel waiting (both do so only + when having acquired the condvar-internal lock). + * The size of G2 is always zero because it cannot be determined until + the group becomes G1. + * Although this is of unsigned type, we rely on using unsigned overflow + rules to make this hold effectively negative values too (in + particular, when waiters in G2 cancel waiting). + + A PTHREAD_COND_INITIALIZER condvar has all fields set to zero, which yields + a condvar that has G2 starting at position 0 and a G1 that is closed. + + Because waiters do not claim ownership of a group right when obtaining a + position in __wseq but only reference count the group when using futexes + to block, it can happen that a group gets closed before a waiter can + increment the reference count. Therefore, waiters have to check whether + their group is already closed using __g1_start. They also have to perform + this check when spinning while trying to grab a signal from __g_signals. + Note that for these checks, using relaxed MO to load __g1_start is + sufficient because if a waiter can see a sufficiently large value, it could + have also consumed a signal in the waiter's group. + + Waiters try to grab a signal from __g_signals without holding a reference + count, which can lead to stealing a signal from a more recent group after + their own group was already closed. They cannot always detect whether they + in fact did because they do not know when they stole, but they can + conservatively add a signal back to the group they stole from; if they + did so unnecessarily, all that happens is a spurious wake-up. To make this + even less likely, __g1_start contains the index of the current G2 too, + which allows waiters to check if there is aliasing on the group slots; if + there wasn't, they didn't steal from the current G1, which means that the + G1 they stole from must have been already closed and they do not need to + fix anything.
+ + It is essential that the last field in pthread_cond_t is __g_signals[1]: + The previous condvar used a pointer-sized field in pthread_cond_t, so a + PTHREAD_COND_INITIALIZER from that condvar implementation might only + initialize 4 bytes to zero instead of the 8 bytes we need (i.e., 44 bytes + in total instead of the 48 we need). __g_signals[1] is not accessed before + the first group switch (G2 starts at index 0), which will set its value to + zero after a harmless fetch-or whose return value is ignored. This + effectively completes initialization. + + + Limitations: + * This condvar isn't designed to allow for more than + __PTHREAD_COND_MAX_GROUP_SIZE * (1 << 31) calls to __pthread_cond_wait. + * More than __PTHREAD_COND_MAX_GROUP_SIZE concurrent waiters are not + supported. + * Beyond what is allowed as errors by POSIX or documented, we can also + return the following errors: + * EPERM if MUTEX is a recursive mutex and the caller doesn't own it. + * EOWNERDEAD or ENOTRECOVERABLE when using robust mutexes. Unlike + for other errors, this can happen when we re-acquire the mutex; this + isn't allowed by POSIX (which requires all errors to virtually happen + before we release the mutex or change the condvar state), but there's + nothing we can do really. + * When using PTHREAD_MUTEX_PP_* mutexes, we can also return all errors + returned by __pthread_tpp_change_priority. We will already have + released the mutex in such cases, so the caller cannot expect to own + MUTEX. + + Other notes: + * Instead of the normal mutex unlock / lock functions, we use + __pthread_mutex_unlock_usercnt(m, 0) / __pthread_mutex_cond_lock(m) + because those will not change the mutex-internal users count, so that it + can be detected when a condvar is still associated with a particular + mutex because there is a waiter blocked on this condvar using this mutex. +*/ +static __always_inline int +__pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, + const struct timespec *abstime) { - struct _pthread_cleanup_buffer buffer; - struct _condvar_cleanup_buffer cbuffer; + const int maxspin = 0; int err; - int pshared = (cond->__data.__mutex == (void *) ~0l) - ? LLL_SHARED : LLL_PRIVATE; - -#if (defined lll_futex_wait_requeue_pi \ - && defined __ASSUME_REQUEUE_PI) - int pi_flag = 0; -#endif + int result = 0; LIBC_PROBE (cond_wait, 2, cond, mutex); - /* Make sure we are alone. */ - lll_lock (cond->__data.__lock, pshared); - - /* Now we can release the mutex. */ + /* Acquire a position (SEQ) in the waiter sequence (WSEQ). We use an + atomic operation because signals and broadcasts may update the group + switch without acquiring the mutex. We do not need release MO here + because we do not need to establish any happens-before relation with + signalers (see __pthread_cond_signal); modification order alone + establishes a total order of waiters/signals. We do need acquire MO + to synchronize with group reinitialization in + __condvar_quiesce_and_switch_g1. */ + uint64_t wseq = __condvar_fetch_add_wseq_acquire (cond, 2); + /* Find our group's index. We always go into what was G2 when we acquired + our position. */ + unsigned int g = wseq & 1; + uint64_t seq = wseq >> 1; + + /* Increase the waiter reference count. Relaxed MO is sufficient because + we only need to synchronize when decrementing the reference count. 
*/ + unsigned int flags = atomic_fetch_add_relaxed (&cond->__data.__wrefs, 8); + int private = __condvar_get_private (flags); + + /* Now that we are registered as a waiter, we can release the mutex. + Waiting on the condvar must be atomic with releasing the mutex, so if + the mutex is used to establish a happens-before relation with any + signaler, the waiter must be visible to the latter; thus, we release the + mutex after registering as waiter. + If releasing the mutex fails, we just cancel our registration as a + waiter and confirm that we have woken up. */ err = __pthread_mutex_unlock_usercnt (mutex, 0); - if (__glibc_unlikely (err)) + if (__glibc_unlikely (err != 0)) { - lll_unlock (cond->__data.__lock, pshared); + __condvar_cancel_waiting (cond, seq, g, private); + __condvar_confirm_wakeup (cond, private); return err; } - /* We have one new user of the condvar. */ - ++cond->__data.__total_seq; - ++cond->__data.__futex; - cond->__data.__nwaiters += 1 << COND_NWAITERS_SHIFT; - - /* Remember the mutex we are using here. If there is already a - different address store this is a bad user bug. Do not store - anything for pshared condvars. */ - if (cond->__data.__mutex != (void *) ~0l) - cond->__data.__mutex = mutex; - - /* Prepare structure passed to cancellation handler. */ - cbuffer.cond = cond; - cbuffer.mutex = mutex; - - /* Before we block we enable cancellation. Therefore we have to - install a cancellation handler. */ - __pthread_cleanup_push (&buffer, __condvar_cleanup, &cbuffer); - - /* The current values of the wakeup counter. The "woken" counter - must exceed this value. */ - unsigned long long int val; - unsigned long long int seq; - val = seq = cond->__data.__wakeup_seq; - /* Remember the broadcast counter. */ - cbuffer.bc_seq = cond->__data.__broadcast_seq; + /* Now wait until a signal is available in our group or it is closed. + Acquire MO so that if we observe a value of zero written after group + switching in __condvar_quiesce_and_switch_g1, we synchronize with that + store and will see the prior update of __g1_start done while switching + groups too. */ + unsigned int signals = atomic_load_acquire (cond->__data.__g_signals + g); do { - unsigned int futex_val = cond->__data.__futex; - /* Prepare to wait. Release the condvar futex. */ - lll_unlock (cond->__data.__lock, pshared); - - /* Enable asynchronous cancellation. Required by the standard. */ - cbuffer.oldtype = __pthread_enable_asynccancel (); - -#if (defined lll_futex_wait_requeue_pi \ - && defined __ASSUME_REQUEUE_PI) - /* If pi_flag remained 1 then it means that we had the lock and the mutex - but a spurious waker raced ahead of us. Give back the mutex before - going into wait again. */ - if (pi_flag) + while (1) { - __pthread_mutex_cond_lock_adjust (mutex); - __pthread_mutex_unlock_usercnt (mutex, 0); + /* Spin-wait first. + Note that spinning first without checking whether a timeout + passed might lead to what looks like a spurious wake-up even + though we should return ETIMEDOUT (e.g., if the caller provides + an absolute timeout that is clearly in the past). However, + (1) spurious wake-ups are allowed, (2) it seems unlikely that a + user will (ab)use pthread_cond_wait as a check for whether a + point in time is in the past, and (3) spinning first without + having to compare against the current time seems to be the right + choice from a performance perspective for most use cases. 
*/ + unsigned int spin = maxspin; + while (signals == 0 && spin > 0) + { + /* Check that we are not spinning on a group that's already + closed. */ + if (seq < (__condvar_load_g1_start_relaxed (cond) >> 1)) + goto done; + + /* TODO Back off. */ + + /* Reload signals. See above for MO. */ + signals = atomic_load_acquire (cond->__data.__g_signals + g); + spin--; + } + + /* If our group will be closed as indicated by the flag on signals, + don't bother grabbing a signal. */ + if (signals & 1) + goto done; + + /* If there is an available signal, don't block. */ + if (signals != 0) + break; + + /* No signals available after spinning, so prepare to block. + We first acquire a group reference and use acquire MO for that so + that we synchronize with the dummy read-modify-write in + __condvar_quiesce_and_switch_g1 if we read from that. In turn, + in this case this will make us see the closed flag on __g_signals + that designates a concurrent attempt to reuse the group's slot. + We use acquire MO for the __g_signals check to make the + __g1_start check work (see spinning above). + Note that the group reference acquisition will not mask the + release MO when decrementing the reference count because we use + an atomic read-modify-write operation and thus extend the release + sequence. */ + atomic_fetch_add_acquire (cond->__data.__g_refs + g, 2); + if (((atomic_load_acquire (cond->__data.__g_signals + g) & 1) != 0) + || (seq < (__condvar_load_g1_start_relaxed (cond) >> 1))) + { + /* Our group is closed. Wake up any signalers that might be + waiting. */ + __condvar_dec_grefs (cond, g, private); + goto done; + } + + // Now block. + struct _pthread_cleanup_buffer buffer; + struct _condvar_cleanup_buffer cbuffer; + cbuffer.wseq = wseq; + cbuffer.cond = cond; + cbuffer.mutex = mutex; + cbuffer.private = private; + __pthread_cleanup_push (&buffer, __condvar_cleanup_waiting, &cbuffer); + + if (abstime == NULL) + { + /* Block without a timeout. */ + err = futex_wait_cancelable ( + cond->__data.__g_signals + g, 0, private); + } + else + { + /* Block, but with a timeout. + Work around the fact that the kernel rejects negative timeout + values despite them being valid. */ + if (__glibc_unlikely (abstime->tv_sec < 0)) + err = ETIMEDOUT; + + else if ((flags & __PTHREAD_COND_CLOCK_MONOTONIC_MASK) != 0) + { + /* CLOCK_MONOTONIC is requested. */ + struct timespec rt; + if (__clock_gettime (CLOCK_MONOTONIC, &rt) != 0) + __libc_fatal ("clock_gettime does not support " + "CLOCK_MONOTONIC"); + /* Convert the absolute timeout value to a relative + timeout. */ + rt.tv_sec = abstime->tv_sec - rt.tv_sec; + rt.tv_nsec = abstime->tv_nsec - rt.tv_nsec; + if (rt.tv_nsec < 0) + { + rt.tv_nsec += 1000000000; + --rt.tv_sec; + } + /* Did we already time out? */ + if (__glibc_unlikely (rt.tv_sec < 0)) + err = ETIMEDOUT; + else + err = futex_reltimed_wait_cancelable + (cond->__data.__g_signals + g, 0, &rt, private); + } + else + { + /* Use CLOCK_REALTIME. */ + err = futex_abstimed_wait_cancelable + (cond->__data.__g_signals + g, 0, abstime, private); + } + } + + __pthread_cleanup_pop (&buffer, 0); + + if (__glibc_unlikely (err == ETIMEDOUT)) + { + __condvar_dec_grefs (cond, g, private); + /* If we timed out, we effectively cancel waiting. Note that + we have decremented __g_refs before cancellation, so that a + deadlock between waiting for quiescence of our group in + __condvar_quiesce_and_switch_g1 and us trying to acquire + the lock during cancellation is not possible. 
*/ + __condvar_cancel_waiting (cond, seq, g, private); + result = ETIMEDOUT; + goto done; + } + else + __condvar_dec_grefs (cond, g, private); + + /* Reload signals. See above for MO. */ + signals = atomic_load_acquire (cond->__data.__g_signals + g); } - pi_flag = USE_REQUEUE_PI (mutex); - if (pi_flag) + } + /* Try to grab a signal. Use acquire MO so that we see an up-to-date value + of __g1_start below (see spinning above for a similar case). In + particular, if we steal from a more recent group, we will also see a + more recent __g1_start below. */ + while (!atomic_compare_exchange_weak_acquire (cond->__data.__g_signals + g, + &signals, signals - 2)); + + /* We consumed a signal but we could have consumed from a more recent group + that aliased with ours due to being in the same group slot. If this + might be the case, our group must be closed as visible through + __g1_start. */ + uint64_t g1_start = __condvar_load_g1_start_relaxed (cond); + if (seq < (g1_start >> 1)) + { + /* We potentially stole a signal from a more recent group but we do not + know which group we really consumed from. + We do not care about groups older than current G1 because they are + closed; we could have stolen from these, but then we just add a + spurious wake-up for the current groups. + We will never steal a signal from current G2 that was really intended + for G2 because G2 never receives signals (until it becomes G1). We + could have stolen a signal from G2 that was conservatively added by a + previous waiter that also thought it stole a signal -- but given that + that signal was added unnecessarily, it's not a problem if we steal + it. + Thus, the remaining case is that we could have stolen from the current + G1, where "current" means the __g1_start value we observed. However, + if the current G1 does not have the same slot index as we do, we did + not steal from it and do not need to undo that. This is the reason + for putting a bit with G2's index into __g1_start as well. */ + if (((g1_start & 1) ^ 1) == g) { + /* We have to conservatively undo our potential mistake of stealing + a signal. We can stop trying to do that when the current G1 + changes because other spinning waiters will notice this too and + __condvar_quiesce_and_switch_g1 has checked that there are no + futex waiters anymore before switching G1. + Relaxed MO is fine for the __g1_start load because we need to + merely be able to observe this fact and not have to observe + something else as well. + ??? Would it help to spin for a little while to see whether the + current G1 gets closed? This might be worthwhile if the group is + small or close to being closed. */ + unsigned int s = atomic_load_relaxed (cond->__data.__g_signals + g); + while (__condvar_load_g1_start_relaxed (cond) == g1_start) + { + /* Try to add a signal. We don't need to acquire the lock + because at worst we can cause a spurious wake-up. If the + group is in the process of being closed (LSB is true), this + has an effect similar to us adding a signal. */ + if (((s & 1) != 0) + || atomic_compare_exchange_weak_relaxed + (cond->__data.__g_signals + g, &s, s + 2)) + { + /* If we added a signal, we also need to add a wake-up on + the futex.
We also need to do that if we skipped adding + a signal because the group is being closed: while + __condvar_quiesce_and_switch_g1 could have closed + the group, it might still be waiting for futex waiters to + leave (and one of those waiters might be the one we stole + the signal from, which would cause it to block using the + futex). */ + futex_wake (cond->__data.__g_signals + g, 1, private); + break; + } + /* TODO Back off. */ + } } - else -#endif - /* Wait until woken by signal or broadcast. */ - lll_futex_wait (&cond->__data.__futex, futex_val, pshared); - - /* Disable asynchronous cancellation. */ - __pthread_disable_asynccancel (cbuffer.oldtype); - - /* We are going to look at shared data again, so get the lock. */ - lll_lock (cond->__data.__lock, pshared); - - /* If a broadcast happened, we are done. */ - if (cbuffer.bc_seq != cond->__data.__broadcast_seq) - goto bc_out; - - /* Check whether we are eligible for wakeup. */ - val = cond->__data.__wakeup_seq; } - while (val == seq || cond->__data.__woken_seq == val); - /* Another thread woken up. */ - ++cond->__data.__woken_seq; + done: - bc_out: + /* Confirm that we have been woken. We do that before acquiring the mutex + to allow for execution of pthread_cond_destroy while having acquired the + mutex. */ + __condvar_confirm_wakeup (cond, private); - cond->__data.__nwaiters -= 1 << COND_NWAITERS_SHIFT; - - /* If pthread_cond_destroy was called on this varaible already, - notify the pthread_cond_destroy caller all waiters have left - and it can be successfully destroyed. */ - if (cond->__data.__total_seq == -1ULL - && cond->__data.__nwaiters < (1 << COND_NWAITERS_SHIFT)) - lll_futex_wake (&cond->__data.__nwaiters, 1, pshared); + /* Woken up; now re-acquire the mutex. If this doesn't fail, return RESULT, + which is set to ETIMEDOUT if a timeout occurred, or zero otherwise. */ + err = __pthread_mutex_cond_lock (mutex); + /* XXX Abort on errors that are disallowed by POSIX? */ + return (err != 0) ? err : result; +} - /* We are done with the condvar. */ - lll_unlock (cond->__data.__lock, pshared); - /* The cancellation handling is back to normal, remove the handler. */ - __pthread_cleanup_pop (&buffer, 0); +/* See __pthread_cond_wait_common. */ +int +__pthread_cond_wait (pthread_cond_t *cond, pthread_mutex_t *mutex) +{ + return __pthread_cond_wait_common (cond, mutex, NULL); +} - /* Get the mutex before returning. Not needed for PI. */ -#if (defined lll_futex_wait_requeue_pi \ - && defined __ASSUME_REQUEUE_PI) - if (pi_flag) - { - __pthread_mutex_cond_lock_adjust (mutex); - return 0; - } - else -#endif - return __pthread_mutex_cond_lock (mutex); +/* See __pthread_cond_wait_common. */ +int +__pthread_cond_timedwait (pthread_cond_t *cond, pthread_mutex_t *mutex, + const struct timespec *abstime) +{ + /* Check parameter validity. This should also tell the compiler that + it can assume that abstime is not NULL. */ + if (abstime->tv_nsec < 0 || abstime->tv_nsec >= 1000000000) + return EINVAL; + return __pthread_cond_wait_common (cond, mutex, abstime); } versioned_symbol (libpthread, __pthread_cond_wait, pthread_cond_wait, GLIBC_2_3_2); +versioned_symbol (libpthread, __pthread_cond_timedwait, pthread_cond_timedwait, + GLIBC_2_3_2);
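
The design comment in the new code explains how a waiter derives its group and sequence position from the __wseq ticket and how it can tell that its group has already been closed. The following standalone C sketch (not part of the patch; the helper names are invented for illustration) shows just that arithmetic: the LSB of the ticket selects the slot currently acting as G2, the remaining bits are the position, and a group is closed once the shifted __g1_start value has moved past that position.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Slot that currently acts as G2 for this ticket.  */
static unsigned int
waiter_group (uint64_t wseq)
{
  return (unsigned int) (wseq & 1);
}

/* Position in the waiter sequence.  */
static uint64_t
waiter_position (uint64_t wseq)
{
  return wseq >> 1;
}

/* True iff a group is already closed for a waiter at position SEQ.
   G1_START carries the G2 index in its LSB, so it is shifted as well.  */
static bool
group_closed_for (uint64_t seq, uint64_t g1_start)
{
  return seq < (g1_start >> 1);
}

int
main (void)
{
  uint64_t wseq = 7;       /* Example ticket: position 3, slot 1.  */
  uint64_t g1_start = 9;   /* Hypothetical value: G1 starts at position 4.  */
  printf ("group=%u position=%llu closed=%d\n",
          waiter_group (wseq),
          (unsigned long long) waiter_position (wseq),
          group_closed_for (waiter_position (wseq), g1_start));
  return 0;
}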
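
The __wrefs field documented above packs a waiter count and three flag bits into one 32-bit word: the count in bits 3 and up (hence the fetch-add of 8 per waiter), the destruction wake-up request in bit 2, the clock ID in bit 1, and the process-shared flag in bit 0. A minimal sketch of that layout, again with hypothetical names and not taken from the glibc sources:

#include <stdbool.h>
#include <stdio.h>

struct wrefs_view
{
  unsigned int refcount;    /* Waiter count, stored shifted left by 3.  */
  bool destroy_wake_flag;   /* Bit 2: wake pthread_cond_destroy waiters.  */
  bool clock_monotonic;     /* Bit 1: 0 == CLOCK_REALTIME, 1 == CLOCK_MONOTONIC.  */
  bool process_shared;      /* Bit 0: process-shared condvar.  */
};

static struct wrefs_view
decode_wrefs (unsigned int wrefs)
{
  struct wrefs_view v;
  v.refcount = wrefs >> 3;
  v.destroy_wake_flag = (wrefs & 4) != 0;
  v.clock_monotonic = (wrefs & 2) != 0;
  v.process_shared = (wrefs & 1) != 0;
  return v;
}

int
main (void)
{
  /* One waiter, CLOCK_MONOTONIC, process-private: (1 << 3) | 2.  */
  struct wrefs_view v = decode_wrefs ((1 << 3) | 2);
  printf ("refs=%u wake=%d monotonic=%d pshared=%d\n",
          v.refcount, v.destroy_wake_flag, v.clock_monotonic,
          v.process_shared);
  return 0;
}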
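
In the wait path, a signal is consumed by a weak compare-and-exchange on __g_signals that subtracts 2, because the LSB of that word is reserved as the group-closed flag. The loop below is a stand-in using C11 atomics (glibc itself uses its internal atomic_* wrappers and additionally re-checks __g1_start); it only illustrates the count-in-steps-of-2 convention and the closed-flag check.

#include <stdatomic.h>
#include <stdbool.h>

/* Try to consume one signal from a __g_signals-style futex word.  The LSB
   is the "group closed" flag, so the signal count lives in the remaining
   bits and one signal corresponds to a step of 2.  Returns true iff a
   signal was consumed.  */
static bool
try_consume_signal (_Atomic unsigned int *g_signals)
{
  unsigned int signals
    = atomic_load_explicit (g_signals, memory_order_acquire);
  for (;;)
    {
      if ((signals & 1) != 0)
        return false;   /* Group is closed; nothing left to grab.  */
      if (signals == 0)
        return false;   /* No signal available; the caller would block.  */
      if (atomic_compare_exchange_weak_explicit (g_signals, &signals,
                                                 signals - 2,
                                                 memory_order_acquire,
                                                 memory_order_relaxed))
        return true;    /* Successfully took one signal.  */
    }
}

int
main (void)
{
  _Atomic unsigned int g_signals = 4;   /* Two signals available, not closed.  */
  return try_consume_signal (&g_signals) ? 0 : 1;
}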
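
For CLOCK_MONOTONIC timeouts, __pthread_cond_wait_common converts the caller's absolute timeout into a relative one before calling futex_reltimed_wait_cancelable. The helper below mirrors that arithmetic as a self-contained sketch (it is not the glibc-internal code; the function name is invented), including the nanosecond borrow and the already-expired check that maps to ETIMEDOUT.

#include <stdbool.h>
#include <time.h>

/* Convert an absolute CLOCK_MONOTONIC timeout into a relative one, as the
   timed wait does before blocking on the futex.  Returns false if the
   timeout already lies in the past (the caller would report ETIMEDOUT).  */
static bool
abstime_to_reltime (const struct timespec *abstime, struct timespec *rel)
{
  struct timespec now;
  if (clock_gettime (CLOCK_MONOTONIC, &now) != 0)
    return false;

  rel->tv_sec = abstime->tv_sec - now.tv_sec;
  rel->tv_nsec = abstime->tv_nsec - now.tv_nsec;
  if (rel->tv_nsec < 0)
    {
      /* Borrow one second so tv_nsec is back in [0, 1000000000).  */
      rel->tv_nsec += 1000000000;
      --rel->tv_sec;
    }
  return rel->tv_sec >= 0;
}

int
main (void)
{
  struct timespec abstime, rel;
  clock_gettime (CLOCK_MONOTONIC, &abstime);
  abstime.tv_sec += 1;   /* One second from now.  */
  return abstime_to_reltime (&abstime, &rel) ? 0 : 1;
}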