Diffstat (limited to 'nptl')
-rw-r--r--  nptl/DESIGN-condvar.txt            | 134
-rw-r--r--  nptl/Makefile                      |   6
-rw-r--r--  nptl/lowlevelcond.sym              |  16
-rw-r--r--  nptl/nptl-printers.py              |  70
-rw-r--r--  nptl/nptl_lock_constants.pysym     |  27
-rw-r--r--  nptl/pthreadP.h                    |   7
-rw-r--r--  nptl/pthread_cond_broadcast.c      |  99
-rw-r--r--  nptl/pthread_cond_common.c         | 466
-rw-r--r--  nptl/pthread_cond_destroy.c        |  82
-rw-r--r--  nptl/pthread_cond_init.c           |  28
-rw-r--r--  nptl/pthread_cond_signal.c         |  99
-rw-r--r--  nptl/pthread_cond_timedwait.c      | 268
-rw-r--r--  nptl/pthread_cond_wait.c           | 754
-rw-r--r--  nptl/pthread_condattr_getclock.c   |   2
-rw-r--r--  nptl/pthread_condattr_getpshared.c |   3
-rw-r--r--  nptl/pthread_condattr_init.c       |   4
-rw-r--r--  nptl/pthread_condattr_setclock.c   |  11
-rw-r--r--  nptl/test-cond-printers.py         |   2
-rw-r--r--  nptl/tst-cond1.c                   |   3
-rw-r--r--  nptl/tst-cond20.c                  |   5
-rw-r--r--  nptl/tst-cond22.c                  |  18
21 files changed, 1274 insertions(+), 830 deletions(-)
diff --git a/nptl/DESIGN-condvar.txt b/nptl/DESIGN-condvar.txt
deleted file mode 100644
index 4845251c75..0000000000
--- a/nptl/DESIGN-condvar.txt
+++ /dev/null
@@ -1,134 +0,0 @@
-Condition Variable pseudocode.
-================================
-
-       int pthread_cond_timedwait (pthread_cond_t *cv, pthread_mutex_t *mutex, const struct timespec *abstime);
-       int pthread_cond_signal    (pthread_cond_t *cv);
-       int pthread_cond_broadcast (pthread_cond_t *cv);
-
-struct pthread_cond_t {
-
-   unsigned int cond_lock;
-
-         internal mutex
-
-   uint64_t total_seq;
-
-     Total number of threads using the condition variable.
-
-   uint64_t wakeup_seq;
-
-     Sequence number for the next wakeup.
-
-   uint64_t woken_seq;
-
-     Sequence number of the last woken thread.
-
-   uint32_t broadcast_seq;
-
-}
-
-
-struct cv_data {
-
-   pthread_cond_t *cv;
-
-   uint32_t bc_seq
-
-}
-
-
-
-cleanup_handler(cv_data)
-{
-  cv = cv_data->cv;
-  lll_lock(cv->lock);
-
-  if (cv_data->bc_seq == cv->broadcast_seq) {
-    ++cv->wakeup_seq;
-    ++cv->woken_seq;
-  }
-
-  /* make sure no signal gets lost.  */
-  FUTEX_WAKE(cv->wakeup_seq, ALL);
-
-  lll_unlock(cv->lock);
-}
-
-
-cond_timedwait(cv, mutex, timeout):
-{
-   lll_lock(cv->lock);
-   mutex_unlock(mutex);
-
-   cleanup_push
-
-   ++cv->total_seq;
-   val = seq =  cv->wakeup_seq;
-   cv_data.bc_seq = cv->broadcast_seq;
-   cv_data.cv = cv;
-
-   while (1) {
-
-     lll_unlock(cv->lock);
-
-     enable_async(&cv_data);
-
-     ret = FUTEX_WAIT(cv->wakeup_seq, val, timeout);
-
-     restore_async
-
-     lll_lock(cv->lock);
-
-     if (cv_data.bc_seq != cv->broadcast_seq)
-       goto bc_out;
-
-     val = cv->wakeup_seq;
-
-     if (val != seq && cv->woken_seq != val) {
-       ret = 0;
-       break;
-     }
-
-     if (ret == TIMEDOUT) {
-       ++cv->wakeup_seq;
-       break;
-     }
-   }
-
-   ++cv->woken_seq;
-
- bc_out:
-   lll_unlock(cv->lock);
-
-   cleanup_pop
-
-   mutex_lock(mutex);
-
-   return ret;
-}
-
-cond_signal(cv)
-{
-   lll_lock(cv->lock);
-
-   if (cv->total_seq > cv->wakeup_seq) {
-     ++cv->wakeup_seq;
-     FUTEX_WAKE(cv->wakeup_seq, 1);
-   }
-
-   lll_unlock(cv->lock);
-}
-
-cond_broadcast(cv)
-{
-   lll_lock(cv->lock);
-
-   if (cv->total_seq > cv->wakeup_seq) {
-     cv->wakeup_seq = cv->total_seq;
-     cv->woken_seq = cv->total_seq;
-     ++cv->broadcast_seq;
-     FUTEX_WAKE(cv->wakeup_seq, ALL);
-   }
-
-   lll_unlock(cv->lock);
-}
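
Whether under the old pseudocode above or the new group-based algorithm in the
files below, wake-ups may be spurious, so a correct caller always rechecks its
predicate in a loop.  A minimal usage sketch (the flag and function names are
illustrative, not glibc-internal code):

    #include <pthread.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
    static int ready;  /* The predicate, protected by LOCK.  */

    void
    consumer (void)
    {
      pthread_mutex_lock (&lock);
      while (!ready)              /* Recheck: wake-ups may be spurious.  */
        pthread_cond_wait (&cond, &lock);
      /* READY is true here, and LOCK is held again.  */
      pthread_mutex_unlock (&lock);
    }

    void
    producer (void)
    {
      pthread_mutex_lock (&lock);
      ready = 1;
      pthread_cond_signal (&cond);  /* Update predicate under the mutex.  */
      pthread_mutex_unlock (&lock);
    }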
diff --git a/nptl/Makefile b/nptl/Makefile
index bed5babfd9..62b0951ec0 100644
--- a/nptl/Makefile
+++ b/nptl/Makefile
@@ -83,7 +83,7 @@ libpthread-routines = nptl-init vars events version pt-interp \
 		      pthread_rwlockattr_getkind_np \
 		      pthread_rwlockattr_setkind_np \
 		      pthread_cond_init pthread_cond_destroy \
-		      pthread_cond_wait pthread_cond_timedwait \
+		      pthread_cond_wait \
 		      pthread_cond_signal pthread_cond_broadcast \
 		      old_pthread_cond_init old_pthread_cond_destroy \
 		      old_pthread_cond_wait old_pthread_cond_timedwait \
@@ -186,7 +186,6 @@ CFLAGS-pthread_timedjoin.c = -fexceptions -fasynchronous-unwind-tables
 CFLAGS-pthread_once.c = $(uses-callbacks) -fexceptions \
 			-fasynchronous-unwind-tables
 CFLAGS-pthread_cond_wait.c = -fexceptions -fasynchronous-unwind-tables
-CFLAGS-pthread_cond_timedwait.c = -fexceptions -fasynchronous-unwind-tables
 CFLAGS-sem_wait.c = -fexceptions -fasynchronous-unwind-tables
 CFLAGS-sem_timedwait.c = -fexceptions -fasynchronous-unwind-tables
 
@@ -307,8 +306,7 @@ test-xfail-tst-once5 = yes
 # Files which must not be linked with libpthread.
 tests-nolibpthread = tst-unload
 
-gen-as-const-headers = pthread-errnos.sym \
-		       lowlevelcond.sym lowlevelrwlock.sym \
+gen-as-const-headers = pthread-errnos.sym lowlevelrwlock.sym \
 		       unwindbuf.sym \
 		       lowlevelrobustlock.sym pthread-pi-defines.sym
 
diff --git a/nptl/lowlevelcond.sym b/nptl/lowlevelcond.sym
deleted file mode 100644
index 18e1adad43..0000000000
--- a/nptl/lowlevelcond.sym
+++ /dev/null
@@ -1,16 +0,0 @@
-#include <stddef.h>
-#include <sched.h>
-#include <bits/pthreadtypes.h>
-#include <internaltypes.h>
-
---
-
-cond_lock	offsetof (pthread_cond_t, __data.__lock)
-cond_futex	offsetof (pthread_cond_t, __data.__futex)
-cond_nwaiters	offsetof (pthread_cond_t, __data.__nwaiters)
-total_seq	offsetof (pthread_cond_t, __data.__total_seq)
-wakeup_seq	offsetof (pthread_cond_t, __data.__wakeup_seq)
-woken_seq	offsetof (pthread_cond_t, __data.__woken_seq)
-dep_mutex	offsetof (pthread_cond_t, __data.__mutex)
-broadcast_seq	offsetof (pthread_cond_t, __data.__broadcast_seq)
-nwaiters_shift	COND_NWAITERS_SHIFT
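
The .sym file above fed gen-as-const, which evaluates each offsetof expression
at build time and emits an assembler-visible constant for the (now removed)
assembly condvar implementations; with those gone, nothing consumes the
constants anymore.  A rough sketch of the idea in plain C, using a hypothetical
struct rather than the real pthread_cond_t layout:

    #include <stddef.h>

    struct foo { int a; long b; };

    /* gen-as-const effectively turns a .sym line such as
         foo_b  offsetof (struct foo, b)
       into an assembler-usable "#define foo_b <N>".  In C, the same
       layout fact can be pinned down with a static assertion:  */
    _Static_assert (offsetof (struct foo, b) >= sizeof (int),
                    "b is laid out after a");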
diff --git a/nptl/nptl-printers.py b/nptl/nptl-printers.py
index e402f232c7..76adaddd95 100644
--- a/nptl/nptl-printers.py
+++ b/nptl/nptl-printers.py
@@ -293,16 +293,6 @@ class MutexAttributesPrinter(object):
         elif protocol == PTHREAD_PRIO_PROTECT:
             self.values.append(('Protocol', 'Priority protect'))
 
-CLOCK_IDS = {
-    CLOCK_REALTIME: 'CLOCK_REALTIME',
-    CLOCK_MONOTONIC: 'CLOCK_MONOTONIC',
-    CLOCK_PROCESS_CPUTIME_ID: 'CLOCK_PROCESS_CPUTIME_ID',
-    CLOCK_THREAD_CPUTIME_ID: 'CLOCK_THREAD_CPUTIME_ID',
-    CLOCK_MONOTONIC_RAW: 'CLOCK_MONOTONIC_RAW',
-    CLOCK_REALTIME_COARSE: 'CLOCK_REALTIME_COARSE',
-    CLOCK_MONOTONIC_COARSE: 'CLOCK_MONOTONIC_COARSE'
-}
-
 class ConditionVariablePrinter(object):
     """Pretty printer for pthread_cond_t."""
 
@@ -313,24 +303,8 @@ class ConditionVariablePrinter(object):
             cond: A gdb.value representing a pthread_cond_t.
         """
 
-        # Since PTHREAD_COND_SHARED is an integer, we need to cast it to void *
-        # to be able to compare it to the condvar's __data.__mutex member.
-        #
-        # While it looks like self.shared_value should be a class variable,
-        # that would result in it having an incorrect size if we're loading
-        # these printers through .gdbinit for a 64-bit objfile in AMD64.
-        # This is because gdb initially assumes the pointer size to be 4 bytes,
-        # and only sets it to 8 after loading the 64-bit objfiles.  Since
-        # .gdbinit runs before any objfiles are loaded, this would effectively
-        # make self.shared_value have a size of 4, thus breaking later
-        # comparisons with pointers whose types are looked up at runtime.
-        void_ptr_type = gdb.lookup_type('void').pointer()
-        self.shared_value = gdb.Value(PTHREAD_COND_SHARED).cast(void_ptr_type)
-
         data = cond['__data']
-        self.total_seq = data['__total_seq']
-        self.mutex = data['__mutex']
-        self.nwaiters = data['__nwaiters']
+        self.wrefs = data['__wrefs']
         self.values = []
 
         self.read_values()
@@ -360,7 +334,6 @@ class ConditionVariablePrinter(object):
 
         self.read_status()
         self.read_attributes()
-        self.read_mutex_info()
 
     def read_status(self):
         """Read the status of the condvar.
@@ -369,41 +342,22 @@ class ConditionVariablePrinter(object):
         are waiting for it.
         """
 
-        if self.total_seq == PTHREAD_COND_DESTROYED:
-            self.values.append(('Status', 'Destroyed'))
-
-        self.values.append(('Threads waiting for this condvar',
-                            self.nwaiters >> COND_NWAITERS_SHIFT))
+        self.values.append(('Threads known to still execute a wait function',
+                            self.wrefs >> PTHREAD_COND_WREFS_SHIFT))
 
     def read_attributes(self):
         """Read the condvar's attributes."""
 
-        clock_id = self.nwaiters & ((1 << COND_NWAITERS_SHIFT) - 1)
-
-        # clock_id must be casted to int because it's a gdb.Value
-        self.values.append(('Clock ID', CLOCK_IDS[int(clock_id)]))
+        if (self.wrefs & PTHREAD_COND_CLOCK_MONOTONIC_MASK) != 0:
+            self.values.append(('Clock ID', 'CLOCK_MONOTONIC'))
+        else:
+            self.values.append(('Clock ID', 'CLOCK_REALTIME'))
 
-        shared = (self.mutex == self.shared_value)
-
-        if shared:
+        if (self.wrefs & PTHREAD_COND_SHARED_MASK) != 0:
             self.values.append(('Shared', 'Yes'))
         else:
             self.values.append(('Shared', 'No'))
 
-    def read_mutex_info(self):
-        """Read the data of the mutex this condvar is bound to.
-
-        A pthread_cond_t's __data.__mutex member is a void * which
-        must be casted to pthread_mutex_t *.  For shared condvars, this
-        member isn't recorded and has a special value instead.
-        """
-
-        if self.mutex and self.mutex != self.shared_value:
-            mutex_type = gdb.lookup_type('pthread_mutex_t')
-            mutex = self.mutex.cast(mutex_type.pointer()).dereference()
-
-            self.values.append(('Mutex', mutex))
-
 class ConditionVariableAttributesPrinter(object):
     """Pretty printer for pthread_condattr_t.
 
@@ -453,10 +407,12 @@ class ConditionVariableAttributesPrinter(object):
         created in self.children.
         """
 
-        clock_id = self.condattr & ((1 << COND_NWAITERS_SHIFT) - 1)
+        clock_id = (self.condattr >> 1) & ((1 << COND_CLOCK_BITS) - 1)
 
-        # clock_id must be casted to int because it's a gdb.Value
-        self.values.append(('Clock ID', CLOCK_IDS[int(clock_id)]))
+        if clock_id != 0:
+            self.values.append(('Clock ID', 'CLOCK_MONOTONIC'))
+        else:
+            self.values.append(('Clock ID', 'CLOCK_REALTIME'))
 
         if self.condattr & 1:
             self.values.append(('Shared', 'Yes'))
diff --git a/nptl/nptl_lock_constants.pysym b/nptl/nptl_lock_constants.pysym
index 303ec61213..2ab3179155 100644
--- a/nptl/nptl_lock_constants.pysym
+++ b/nptl/nptl_lock_constants.pysym
@@ -44,26 +44,13 @@ PTHREAD_PRIO_NONE
 PTHREAD_PRIO_INHERIT
 PTHREAD_PRIO_PROTECT
 
--- These values are hardcoded as well:
--- Value of __mutex for shared condvars.
-PTHREAD_COND_SHARED             (void *)~0l
-
--- Value of __total_seq for destroyed condvars.
-PTHREAD_COND_DESTROYED          -1ull
-
--- __nwaiters encodes the number of threads waiting on a condvar
--- and the clock ID.
--- __nwaiters >> COND_NWAITERS_SHIFT gives us the number of waiters.
-COND_NWAITERS_SHIFT
-
--- Condvar clock IDs
-CLOCK_REALTIME
-CLOCK_MONOTONIC
-CLOCK_PROCESS_CPUTIME_ID
-CLOCK_THREAD_CPUTIME_ID
-CLOCK_MONOTONIC_RAW
-CLOCK_REALTIME_COARSE
-CLOCK_MONOTONIC_COARSE
+-- Condition variable
+-- FIXME Why can macros prefixed with __ not be used directly?
+PTHREAD_COND_SHARED_MASK          __PTHREAD_COND_SHARED_MASK
+PTHREAD_COND_CLOCK_MONOTONIC_MASK __PTHREAD_COND_CLOCK_MONOTONIC_MASK
+COND_CLOCK_BITS
+-- These values are hardcoded:
+PTHREAD_COND_WREFS_SHIFT          3
 
 -- Rwlock attributes
 PTHREAD_RWLOCK_PREFER_READER_NP
diff --git a/nptl/pthreadP.h b/nptl/pthreadP.h
index 6e0dd09f4f..92a9992e1f 100644
--- a/nptl/pthreadP.h
+++ b/nptl/pthreadP.h
@@ -167,6 +167,13 @@ enum
 #define __PTHREAD_ONCE_FORK_GEN_INCR	4
 
 
+/* Condition variable definitions.  See __pthread_cond_wait_common.
+   Need to be defined here so there is one place from which
+   nptl_lock_constants can grab them.  */
+#define __PTHREAD_COND_CLOCK_MONOTONIC_MASK 2
+#define __PTHREAD_COND_SHARED_MASK 1
+
+
 /* Internal variables.  */
 
 
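
Taken together with the uses later in this patch (the wrefs >> 3 tests in
pthread_cond_signal.c and pthread_cond_broadcast.c, and the fetch-or of 4 in
pthread_cond_destroy.c), __wrefs packs a waiter reference count and three flag
bits.  A small sketch decoding that layout; the wake-request constant is
inferred from those uses and named here only for illustration:

    #include <stdio.h>

    #define SHARED_MASK 1            /* __PTHREAD_COND_SHARED_MASK  */
    #define CLOCK_MONOTONIC_MASK 2   /* __PTHREAD_COND_CLOCK_MONOTONIC_MASK  */
    #define WAKE_REQUEST_MASK 4      /* Set by pthread_cond_destroy (inferred).  */

    static void
    decode_wrefs (unsigned int wrefs)
    {
      printf ("waiters in a wait function: %u\n", wrefs >> 3);
      printf ("process-shared: %s\n", (wrefs & SHARED_MASK) ? "yes" : "no");
      printf ("clock: %s\n", (wrefs & CLOCK_MONOTONIC_MASK)
                             ? "CLOCK_MONOTONIC" : "CLOCK_REALTIME");
      printf ("destruction pending: %s\n",
              (wrefs & WAKE_REQUEST_MASK) ? "yes" : "no");
    }

    int
    main (void)
    {
      decode_wrefs ((2u << 3) | CLOCK_MONOTONIC_MASK);  /* Two waiters.  */
      return 0;
    }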
diff --git a/nptl/pthread_cond_broadcast.c b/nptl/pthread_cond_broadcast.c
index 552fd42f60..87c07552cf 100644
--- a/nptl/pthread_cond_broadcast.c
+++ b/nptl/pthread_cond_broadcast.c
@@ -19,72 +19,71 @@
 #include <endian.h>
 #include <errno.h>
 #include <sysdep.h>
-#include <lowlevellock.h>
+#include <futex-internal.h>
 #include <pthread.h>
 #include <pthreadP.h>
 #include <stap-probe.h>
+#include <atomic.h>
 
 #include <shlib-compat.h>
-#include <kernel-features.h>
 
+#include "pthread_cond_common.c"
 
+
+/* We do the following steps from __pthread_cond_signal in one critical
+   section: (1) signal all waiters in G1, (2) close G1 so that it can become
+   the new G2 and make G2 the new G1, and (3) signal all waiters in the new
+   G1.  We don't need to do all these steps if there are no waiters in G1
+   and/or G2.  See __pthread_cond_signal for further details.  */
 int
 __pthread_cond_broadcast (pthread_cond_t *cond)
 {
   LIBC_PROBE (cond_broadcast, 1, cond);
 
-  int pshared = (cond->__data.__mutex == (void *) ~0l)
-		? LLL_SHARED : LLL_PRIVATE;
-  /* Make sure we are alone.  */
-  lll_lock (cond->__data.__lock, pshared);
+  unsigned int wrefs = atomic_load_relaxed (&cond->__data.__wrefs);
+  if (wrefs >> 3 == 0)
+    return 0;
+  int private = __condvar_get_private (wrefs);
+
+  __condvar_acquire_lock (cond, private);
 
-  /* Are there any waiters to be woken?  */
-  if (cond->__data.__total_seq > cond->__data.__wakeup_seq)
+  unsigned long long int wseq = __condvar_load_wseq_relaxed (cond);
+  unsigned int g2 = wseq & 1;
+  unsigned int g1 = g2 ^ 1;
+  wseq >>= 1;
+  bool do_futex_wake = false;
+
+  /* Step (1): signal all waiters remaining in G1.  */
+  if (cond->__data.__g_size[g1] != 0)
     {
-      /* Yes.  Mark them all as woken.  */
-      cond->__data.__wakeup_seq = cond->__data.__total_seq;
-      cond->__data.__woken_seq = cond->__data.__total_seq;
-      cond->__data.__futex = (unsigned int) cond->__data.__total_seq * 2;
-      int futex_val = cond->__data.__futex;
-      /* Signal that a broadcast happened.  */
-      ++cond->__data.__broadcast_seq;
-
-      /* We are done.  */
-      lll_unlock (cond->__data.__lock, pshared);
-
-      /* Wake everybody.  */
-      pthread_mutex_t *mut = (pthread_mutex_t *) cond->__data.__mutex;
-
-      /* Do not use requeue for pshared condvars.  */
-      if (mut == (void *) ~0l
-	  || PTHREAD_MUTEX_PSHARED (mut) & PTHREAD_MUTEX_PSHARED_BIT)
-	goto wake_all;
-
-#if (defined lll_futex_cmp_requeue_pi \
-     && defined __ASSUME_REQUEUE_PI)
-      if (USE_REQUEUE_PI (mut))
-	{
-	  if (lll_futex_cmp_requeue_pi (&cond->__data.__futex, 1, INT_MAX,
-					&mut->__data.__lock, futex_val,
-					LLL_PRIVATE) == 0)
-	    return 0;
-	}
-      else
-#endif
-	/* lll_futex_requeue returns 0 for success and non-zero
-	   for errors.  */
-	if (!__builtin_expect (lll_futex_requeue (&cond->__data.__futex, 1,
-						  INT_MAX, &mut->__data.__lock,
-						  futex_val, LLL_PRIVATE), 0))
-	  return 0;
-
-wake_all:
-      lll_futex_wake (&cond->__data.__futex, INT_MAX, pshared);
-      return 0;
+      /* Add as many signals as the remaining size of the group.  */
+      atomic_fetch_add_relaxed (cond->__data.__g_signals + g1,
+				cond->__data.__g_size[g1] << 1);
+      cond->__data.__g_size[g1] = 0;
+
+      /* We need to wake G1 waiters before we quiesce G1 below.  */
+      /* TODO Only set it if there are indeed futex waiters.  We could
+	 also try to move this out of the critical section in cases when
+	 G2 is empty (and we don't need to quiesce).  */
+      futex_wake (cond->__data.__g_signals + g1, INT_MAX, private);
     }
 
-  /* We are done.  */
-  lll_unlock (cond->__data.__lock, pshared);
+  /* G1 is complete.  Step (2) is next unless there are no waiters in G2, in
+     which case we can stop.  */
+  if (__condvar_quiesce_and_switch_g1 (cond, wseq, &g1, private))
+    {
+      /* Step (3): Send signals to all waiters in the old G2 / new G1.  */
+      atomic_fetch_add_relaxed (cond->__data.__g_signals + g1,
+				cond->__data.__g_size[g1] << 1);
+      cond->__data.__g_size[g1] = 0;
+      /* TODO Only set it if there are indeed futex waiters.  */
+      do_futex_wake = true;
+    }
+
+  __condvar_release_lock (cond, private);
+
+  if (do_futex_wake)
+    futex_wake (cond->__data.__g_signals + g1, INT_MAX, private);
 
   return 0;
 }
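
The G1/G2 machinery is invisible to callers; pthread_cond_broadcast keeps its
usual semantics.  A short usage sketch (names are illustrative) of the
wake-everyone pattern that the code above serves:

    #include <pthread.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t all_done = PTHREAD_COND_INITIALIZER;
    static int remaining = 4;        /* Protected by LOCK.  */

    void
    worker_finished (void)
    {
      pthread_mutex_lock (&lock);
      if (--remaining == 0)
        /* Signals all of G1, then switches groups and signals the former
           G2, as in steps (1)-(3) above.  */
        pthread_cond_broadcast (&all_done);
      pthread_mutex_unlock (&lock);
    }

    void
    wait_for_all (void)
    {
      pthread_mutex_lock (&lock);
      while (remaining != 0)
        pthread_cond_wait (&all_done, &lock);
      pthread_mutex_unlock (&lock);
    }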
diff --git a/nptl/pthread_cond_common.c b/nptl/pthread_cond_common.c
new file mode 100644
index 0000000000..b374396d45
--- /dev/null
+++ b/nptl/pthread_cond_common.c
@@ -0,0 +1,466 @@
+/* pthread_cond_common -- shared code for condition variable.
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <atomic.h>
+#include <stdint.h>
+#include <pthread.h>
+#include <libc-internal.h>
+
+/* We need 3 least-significant bits on __wrefs for something else.  */
+#define __PTHREAD_COND_MAX_GROUP_SIZE ((unsigned) 1 << 29)
+
+#if __HAVE_64B_ATOMICS == 1
+
+static uint64_t __attribute__ ((unused))
+__condvar_load_wseq_relaxed (pthread_cond_t *cond)
+{
+  return atomic_load_relaxed (&cond->__data.__wseq);
+}
+
+static uint64_t __attribute__ ((unused))
+__condvar_fetch_add_wseq_acquire (pthread_cond_t *cond, unsigned int val)
+{
+  return atomic_fetch_add_acquire (&cond->__data.__wseq, val);
+}
+
+static uint64_t __attribute__ ((unused))
+__condvar_fetch_xor_wseq_release (pthread_cond_t *cond, unsigned int val)
+{
+  return atomic_fetch_xor_release (&cond->__data.__wseq, val);
+}
+
+static uint64_t __attribute__ ((unused))
+__condvar_load_g1_start_relaxed (pthread_cond_t *cond)
+{
+  return atomic_load_relaxed (&cond->__data.__g1_start);
+}
+
+static void __attribute__ ((unused))
+__condvar_add_g1_start_relaxed (pthread_cond_t *cond, unsigned int val)
+{
+  atomic_store_relaxed (&cond->__data.__g1_start,
+      atomic_load_relaxed (&cond->__data.__g1_start) + val);
+}
+
+#else
+
+/* We use two 64b counters: __wseq and __g1_start.  They are monotonically
+   increasing and single-writer-multiple-readers counters, so we can implement
+   load, fetch-and-add, and fetch-and-xor operations even when we just have
+   32b atomics.  Values we add or xor are less than or equal to 1<<31 (*),
+   so we only have to make overflow-and-addition atomic wrt. to concurrent
+   load operations and xor operations.  To do that, we split each counter into
+   two 32b values of which we reserve the MSB of each to represent an
+   overflow from the lower-order half to the higher-order half.
+
+   In the common case, the state is (writing higher-order half / lower-order
+   half, where . concatenates the bits):
+   0.h     / 0.l  = h.l
+
+   When we add a value of x that overflows (i.e., 0.l + x == 1.L), we run the
+   following steps S1-S4 (the values these represent are on the right-hand
+   side):
+   S1:  0.h     / 1.L == (h+1).L
+   S2:  1.(h+1) / 1.L == (h+1).L
+   S3:  1.(h+1) / 0.L == (h+1).L
+   S4:  0.(h+1) / 0.L == (h+1).L
+   If the LSB of the higher-order half is set, readers will ignore the
+   overflow bit in the lower-order half.
+
+   To get an atomic snapshot in load operations, we exploit that the
+   higher-order half is monotonically increasing; if we load a value V from
+   it, then read the lower-order half, and then read the higher-order half
+   again and see the same value V, we know that both halves have existed in
+   the sequence of values the full counter had.  This is similar to the
+   validated reads in the time-based STMs in GCC's libitm (e.g.,
+   method_ml_wt).
+
+   The xor operation needs to be an atomic read-modify-write.  The write
+   itself is not an issue as it affects just the lower-order half but not bits
+   used in the add operation.  To make the full fetch-and-xor atomic, we
+   exploit that concurrently, the value can increase by at most 1<<31 (*): The
+   xor operation is only called while having acquired the lock, so not more
+   than __PTHREAD_COND_MAX_GROUP_SIZE waiters can enter concurrently and thus
+   increment __wseq.  Therefore, if the xor operation observes a value of
+   __wseq, then the value it applies the modification to later on can be
+   derived (see below).
+
+   One benefit of this scheme is that this makes load operations
+   obstruction-free because unlike if we would just lock the counter, readers
+   can almost always interpret a snapshot of each half.  Readers can be
+   forced to read a new snapshot when the read is concurrent with an overflow.
+   However, overflows will happen infrequently, so load operations are
+   practically lock-free.
+
+   (*) The highest value we add is __PTHREAD_COND_MAX_GROUP_SIZE << 2 to
+   __g1_start (the two extra bits are for the lock in the two LSBs of
+   __g1_start).  */
+
+typedef struct
+{
+  unsigned int low;
+  unsigned int high;
+} _condvar_lohi;
+
+static uint64_t
+__condvar_fetch_add_64_relaxed (_condvar_lohi *lh, unsigned int op)
+{
+  /* S1. Note that this is an atomic read-modify-write so it extends the
+     release sequence of the release MO store at S3.  */
+  unsigned int l = atomic_fetch_add_relaxed (&lh->low, op);
+  unsigned int h = atomic_load_relaxed (&lh->high);
+  uint64_t result = ((uint64_t) h << 31) | l;
+  l += op;
+  if ((l >> 31) > 0)
+    {
+      /* Overflow.  Need to increment higher-order half.  Note that all
+	 add operations are ordered in happens-before.  */
+      h++;
+      /* S2. Release MO to synchronize with the loads of the higher-order half
+	 in the load operation.  See __condvar_load_64_relaxed.  */
+      atomic_store_release (&lh->high, h | ((unsigned int) 1 << 31));
+      l ^= (unsigned int) 1 << 31;
+      /* S3.  See __condvar_load_64_relaxed.  */
+      atomic_store_release (&lh->low, l);
+      /* S4.  Likewise.  */
+      atomic_store_release (&lh->high, h);
+    }
+  return result;
+}
+
+static uint64_t
+__condvar_load_64_relaxed (_condvar_lohi *lh)
+{
+  unsigned int h, l, h2;
+  do
+    {
+      /* This load and the second one below to the same location read from the
+	 stores in the overflow handling of the add operation or the
+	 initializing stores (which is a simple special case because
+	 initialization always completely happens before further use).
+	 Because no two stores to the higher-order half write the same value,
+	 the loop ensures that if we continue to use the snapshot, this load
+	 and the second one read from the same store operation.  All candidate
+	 store operations have release MO.
+	 If we read from S2 in the first load, then we will see the value of
+	 S1 on the next load (because we synchronize with S2), or a value
+	 later in modification order.  We correctly ignore the lower-half's
+	 overflow bit in this case.  If we read from S4, then we will see the
+	 value of S3 in the next load (or a later value), which does not have
+	 the overflow bit set anymore.
+	  */
+      h = atomic_load_acquire (&lh->high);
+      /* This will read from the release sequence of S3 (i.e., either the S3
+	 store or the read-modify-writes at S1 following S3 in modification
+	 order).  Thus, the read synchronizes with S3, and the following load
+	 of the higher-order half will read from the matching S2 (or a later
+	 value).
+	 Thus, if we read a lower-half value here that already overflowed and
+	 belongs to an increased higher-order half value, we will see the
+	 latter and h and h2 will not be equal.  */
+      l = atomic_load_acquire (&lh->low);
+      /* See above.  */
+      h2 = atomic_load_relaxed (&lh->high);
+    }
+  while (h != h2);
+  if (((l >> 31) > 0) && ((h >> 31) > 0))
+    l ^= (unsigned int) 1 << 31;
+  return ((uint64_t) (h & ~((unsigned int) 1 << 31)) << 31) + l;
+}
+
+static uint64_t __attribute__ ((unused))
+__condvar_load_wseq_relaxed (pthread_cond_t *cond)
+{
+  return __condvar_load_64_relaxed ((_condvar_lohi *) &cond->__data.__wseq32);
+}
+
+static uint64_t __attribute__ ((unused))
+__condvar_fetch_add_wseq_acquire (pthread_cond_t *cond, unsigned int val)
+{
+  uint64_t r = __condvar_fetch_add_64_relaxed
+      ((_condvar_lohi *) &cond->__data.__wseq32, val);
+  atomic_thread_fence_acquire ();
+  return r;
+}
+
+static uint64_t __attribute__ ((unused))
+__condvar_fetch_xor_wseq_release (pthread_cond_t *cond, unsigned int val)
+{
+  _condvar_lohi *lh = (_condvar_lohi *) &cond->__data.__wseq32;
+  /* First, get the current value.  See __condvar_load_64_relaxed.  */
+  unsigned int h, l, h2;
+  do
+    {
+      h = atomic_load_acquire (&lh->high);
+      l = atomic_load_acquire (&lh->low);
+      h2 = atomic_load_relaxed (&lh->high);
+    }
+  while (h != h2);
+  if (((l >> 31) > 0) && ((h >> 31) == 0))
+    h++;
+  h &= ~((unsigned int) 1 << 31);
+  l &= ~((unsigned int) 1 << 31);
+
+  /* Now modify.  Due to the coherence rules, the prior load will read a value
+     earlier in modification order than the following fetch-xor.
+     This uses release MO to make the full operation have release semantics
+     (all other operations access the lower-order half).  */
+  unsigned int l2 = atomic_fetch_xor_release (&lh->low, val)
+      & ~((unsigned int) 1 << 31);
+  if (l2 < l)
+    /* The lower-order half overflowed in the meantime.  This happened exactly
+       once due to the limit on concurrent waiters (see above).  */
+    h++;
+  return ((uint64_t) h << 31) + l2;
+}
+
+static uint64_t __attribute__ ((unused))
+__condvar_load_g1_start_relaxed (pthread_cond_t *cond)
+{
+  return __condvar_load_64_relaxed
+      ((_condvar_lohi *) &cond->__data.__g1_start32);
+}
+
+static void __attribute__ ((unused))
+__condvar_add_g1_start_relaxed (pthread_cond_t *cond, unsigned int val)
+{
+  ignore_value (__condvar_fetch_add_64_relaxed
+      ((_condvar_lohi *) &cond->__data.__g1_start32, val));
+}
+
+#endif  /* !__HAVE_64B_ATOMICS  */
+
+
+/* The lock that signalers use.  See pthread_cond_wait_common for uses.
+   The lock is our normal three-state lock: not acquired (0) / acquired (1) /
+   acquired-with-futex_wake-request (2).  However, we need to preserve the
+   other bits in the unsigned int used for the lock, and therefore it is a
+   little more complex.  */
+static void __attribute__ ((unused))
+__condvar_acquire_lock (pthread_cond_t *cond, int private)
+{
+  unsigned int s = atomic_load_relaxed (&cond->__data.__g1_orig_size);
+  while ((s & 3) == 0)
+    {
+      if (atomic_compare_exchange_weak_acquire (&cond->__data.__g1_orig_size,
+	  &s, s | 1))
+	return;
+      /* TODO Spinning and back-off.  */
+    }
+  /* The lock is currently acquired by another thread, so we cannot simply
+     set it from not-acquired to acquired.  Instead, try to change it to
+     acquired-with-futex-wake-request and do a futex wait until the owner
+     releases it.  */
+  while (1)
+    {
+      while ((s & 3) != 2)
+	{
+	  if (atomic_compare_exchange_weak_acquire
+	      (&cond->__data.__g1_orig_size, &s, (s & ~(unsigned int) 3) | 2))
+	    {
+	      if ((s & 3) == 0)
+		return;
+	      break;
+	    }
+	  /* TODO Back off.  */
+	}
+      futex_wait_simple (&cond->__data.__g1_orig_size,
+	  (s & ~(unsigned int) 3) | 2, private);
+      /* Reload so we see a recent value.  */
+      s = atomic_load_relaxed (&cond->__data.__g1_orig_size);
+    }
+}
+
+/* See __condvar_acquire_lock.  */
+static void __attribute__ ((unused))
+__condvar_release_lock (pthread_cond_t *cond, int private)
+{
+  if ((atomic_fetch_and_release (&cond->__data.__g1_orig_size,
+				 ~(unsigned int) 3) & 3)
+      == 2)
+    futex_wake (&cond->__data.__g1_orig_size, 1, private);
+}
+
+/* Only use this when having acquired the lock.  */
+static unsigned int __attribute__ ((unused))
+__condvar_get_orig_size (pthread_cond_t *cond)
+{
+  return atomic_load_relaxed (&cond->__data.__g1_orig_size) >> 2;
+}
+
+/* Only use this when having acquired the lock.  */
+static void __attribute__ ((unused))
+__condvar_set_orig_size (pthread_cond_t *cond, unsigned int size)
+{
+  /* We have acquired the lock, but might get one concurrent update due to a
+     lock state change from acquired to acquired-with-futex_wake-request.
+     The store with relaxed MO is fine because there will be no further
+     changes to the lock bits nor the size, and we will subsequently release
+     the lock with release MO.  */
+  unsigned int s;
+  s = (atomic_load_relaxed (&cond->__data.__g1_orig_size) & 3)
+      | (size << 2);
+  if ((atomic_exchange_relaxed (&cond->__data.__g1_orig_size, s) & 3)
+      != (s & 3))
+    atomic_store_relaxed (&cond->__data.__g1_orig_size, (size << 2) | 2);
+}
+
+/* Returns FUTEX_SHARED or FUTEX_PRIVATE based on the provided __wrefs
+   value.  */
+static int __attribute__ ((unused))
+__condvar_get_private (int flags)
+{
+  if ((flags & __PTHREAD_COND_SHARED_MASK) == 0)
+    return FUTEX_PRIVATE;
+  else
+    return FUTEX_SHARED;
+}
+
+/* This closes G1 (whose index is in G1INDEX), waits for all futex waiters to
+   leave G1, converts G1 into a fresh G2, and then switches group roles so that
+   the former G2 becomes the new G1 ending at the current __wseq value when we
+   eventually make the switch (WSEQ is just an observation of __wseq by the
+   signaler).
+   If G2 is empty, it will not switch groups because then it would create an
+   empty G1 which would require switching groups again on the next signal.
+   Returns false iff groups were not switched because G2 was empty.  */
+static bool __attribute__ ((unused))
+__condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq,
+    unsigned int *g1index, int private)
+{
+  const unsigned int maxspin = 0;
+  unsigned int g1 = *g1index;
+
+  /* If there is no waiter in G2, we don't do anything.  The expression may
+     look odd but remember that __g_size might hold a negative value, so
+     putting the expression this way avoids relying on implementation-defined
+     behavior.
+     Note that this works correctly for a zero-initialized condvar too.  */
+  unsigned int old_orig_size = __condvar_get_orig_size (cond);
+  uint64_t old_g1_start = __condvar_load_g1_start_relaxed (cond) >> 1;
+  if (((unsigned) (wseq - old_g1_start - old_orig_size)
+       + cond->__data.__g_size[g1 ^ 1]) == 0)
+    return false;
+
+  /* Now try to close and quiesce G1.  We have to consider the following kinds
+     of waiters:
+     * Waiters from less recent groups than G1 are not affected because
+       nothing will change for them apart from __g1_start getting larger.
+     * New waiters arriving concurrently with the group switching will all go
+       into G2 until we atomically make the switch.  Waiters existing in G2
+       are not affected.
+     * Waiters in G1 will be closed out immediately by setting a flag in
+       __g_signals, which will prevent waiters from blocking using a futex on
+       __g_signals and also notifies them that the group is closed.  As a
+       result, they will eventually remove their group reference, allowing us
+       to switch group roles.  */
+
+  /* First, set the closed flag on __g_signals.  This tells waiters that are
+     about to wait that they shouldn't do that anymore.  This basically
+     serves as an advance notification of the upcoming change to __g1_start;
+     waiters interpret it as if __g1_start was larger than their waiter
+     sequence position.  This allows us to change __g1_start after waiting
+     for all existing waiters with group references to leave, which in turn
+     makes recovery after stealing a signal simpler because it then can be
+     skipped if __g1_start indicates that the group is closed (otherwise,
+     we would have to recover always because waiters don't know how big their
+     groups are).  Relaxed MO is fine.  */
+  atomic_fetch_or_relaxed (cond->__data.__g_signals + g1, 1);
+
+  /* Wait until there are no group references anymore.  The fetch-or operation
+     injects us into the modification order of __g_refs; release MO ensures
+     that waiters incrementing __g_refs after our fetch-or see the previous
+     changes to __g_signals and to __g1_start that had to happen before we can
+     switch this G1 and alias with an older group (we have two groups, so
+     aliasing requires switching group roles twice).  Note that nobody else
+     can have set the wake-request flag, so we do not have to act upon it.
+
+     Also note that it is harmless if older waiters or waiters from this G1
+     get a group reference after we have quiesced the group because it will
+     remain closed for them either because of the closed flag in __g_signals
+     or the later update to __g1_start.  New waiters will never arrive here
+     but instead continue to go into the still current G2.  */
+  unsigned r = atomic_fetch_or_release (cond->__data.__g_refs + g1, 0);
+  while ((r >> 1) > 0)
+    {
+      for (unsigned int spin = maxspin; ((r >> 1) > 0) && (spin > 0); spin--)
+	{
+	  /* TODO Back off.  */
+	  r = atomic_load_relaxed (cond->__data.__g_refs + g1);
+	}
+      if ((r >> 1) > 0)
+	{
+	  /* There is still a waiter after spinning.  Set the wake-request
+	     flag and block.  Relaxed MO is fine because this is just about
+	     this futex word.  */
+	  r = atomic_fetch_or_relaxed (cond->__data.__g_refs + g1, 1);
+
+	  if ((r >> 1) > 0)
+	    futex_wait_simple (cond->__data.__g_refs + g1, r, private);
+	  /* Reload here so we eventually see the most recent value even if we
+	     do not spin.   */
+	  r = atomic_load_relaxed (cond->__data.__g_refs + g1);
+	}
+    }
+  /* Acquire MO so that we synchronize with the release operation that waiters
+     use to decrement __g_refs and thus happen after the waiters we waited
+     for.  */
+  atomic_thread_fence_acquire ();
+
+  /* Update __g1_start, which finishes closing this group.  The value we add
+     will never be negative because old_orig_size can only be zero when we
+     switch groups the first time after a condvar was initialized, in which
+     case G1 will be at index 1 and we will add a value of 1.  See above for
+     why this takes place after waiting for quiescence of the group.
+     Relaxed MO is fine because the change comes with no additional
+     constraints that others would have to observe.  */
+  __condvar_add_g1_start_relaxed (cond,
+      (old_orig_size << 1) + (g1 == 1 ? 1 : - 1));
+
+  /* Now reopen the group, thus enabling waiters to again block using the
+     futex controlled by __g_signals.  Release MO so that observers that see
+     no signals (and thus can block) also see the write __g1_start and thus
+     that this is now a new group (see __pthread_cond_wait_common for the
+     matching acquire MO loads).  */
+  atomic_store_release (cond->__data.__g_signals + g1, 0);
+
+  /* At this point, the old G1 is now a valid new G2 (but not in use yet).
+     No old waiter can grab a signal or acquire a reference without
+     noticing that __g1_start is larger.
+     We can now publish the group switch by flipping the G2 index in __wseq.
+     Release MO so that this synchronizes with the acquire MO operation
+     waiters use to obtain a position in the waiter sequence.  */
+  wseq = __condvar_fetch_xor_wseq_release (cond, 1) >> 1;
+  g1 ^= 1;
+  *g1index ^= 1;
+
+  /* These values are just observed by signalers, and thus protected by the
+     lock.  */
+  unsigned int orig_size = wseq - (old_g1_start + old_orig_size);
+  __condvar_set_orig_size (cond, orig_size);
+  /* Use an addition so that we do not lose track of cancellations in what
+     was previously G2.  */
+  cond->__data.__g_size[g1] += orig_size;
+
+  /* The new G1's size may be zero because of cancellations during its time
+     as G2.  If this happens, there are no waiters that have to receive a
+     signal, so we do not need to add any and return false.  */
+  if (cond->__data.__g_size[g1] == 0)
+    return false;
+
+  return true;
+}
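
The S1-S4 overflow scheme documented above can be exercised in isolation.  The
following is a hedged, single-threaded re-implementation of the split counter
using C11 atomics instead of glibc's internal atomic_* wrappers; it
demonstrates the state transitions only and is not the library code:

    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    struct lohi { _Atomic unsigned int low, high; };

    /* Mirror of __condvar_fetch_add_64_relaxed: each half carries 31
       payload bits; the MSB of each half is the overflow flag.  */
    static uint64_t
    fetch_add_64 (struct lohi *lh, unsigned int op)
    {
      unsigned int l = atomic_fetch_add_explicit (&lh->low, op,
                                                  memory_order_relaxed); /* S1 */
      unsigned int h = atomic_load_explicit (&lh->high, memory_order_relaxed);
      uint64_t result = ((uint64_t) h << 31) | l;
      l += op;
      if ((l >> 31) > 0)
        {
          h++;
          atomic_store_explicit (&lh->high, h | (1u << 31),
                                 memory_order_release);               /* S2 */
          atomic_store_explicit (&lh->low, l ^ (1u << 31),
                                 memory_order_release);               /* S3 */
          atomic_store_explicit (&lh->high, h, memory_order_release); /* S4 */
        }
      return result;
    }

    int
    main (void)
    {
      struct lohi c;
      atomic_init (&c.low, (1u << 31) - 2);  /* Just below overflow.  */
      atomic_init (&c.high, 0);
      fetch_add_64 (&c, 4);                  /* Runs the S1..S4 sequence.  */
      /* Prints "high=1 low=2", i.e., the 64b value (1 << 31) + 2.  */
      printf ("high=%u low=%u\n", atomic_load (&c.high), atomic_load (&c.low));
      return 0;
    }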
diff --git a/nptl/pthread_cond_destroy.c b/nptl/pthread_cond_destroy.c
index 1acd8042d8..5845c6a7ad 100644
--- a/nptl/pthread_cond_destroy.c
+++ b/nptl/pthread_cond_destroy.c
@@ -20,66 +20,42 @@
 #include <shlib-compat.h>
 #include "pthreadP.h"
 #include <stap-probe.h>
-
-
+#include <atomic.h>
+#include <futex-internal.h>
+
+#include "pthread_cond_common.c"
+
+/* See __pthread_cond_wait for a high-level description of the algorithm.
+
+   A correct program must make sure that no waiters are blocked on the condvar
+   when it is destroyed, and that there are no concurrent signals or
+   broadcasts.  To wake waiters reliably, the program must signal or
+   broadcast while holding the mutex or after having held the mutex.  It must
+   also ensure that no signal or broadcast is still pending to unblock
+   waiters; IOW, because waiters can wake up spuriously, the program must
+   effectively ensure that destruction happens after the execution of those
+   signal or broadcast calls.
+   Thus, we can assume that all waiters that are still accessing the condvar
+   have been woken.  We wait until they have confirmed to have woken up by
+   decrementing __wrefs.  */
 int
 __pthread_cond_destroy (pthread_cond_t *cond)
 {
-  int pshared = (cond->__data.__mutex == (void *) ~0l)
-		? LLL_SHARED : LLL_PRIVATE;
-
   LIBC_PROBE (cond_destroy, 1, cond);
 
-  /* Make sure we are alone.  */
-  lll_lock (cond->__data.__lock, pshared);
-
-  if (cond->__data.__total_seq > cond->__data.__wakeup_seq)
-    {
-      /* If there are still some waiters which have not been
-	 woken up, this is an application bug.  */
-      lll_unlock (cond->__data.__lock, pshared);
-      return EBUSY;
-    }
-
-  /* Tell pthread_cond_*wait that this condvar is being destroyed.  */
-  cond->__data.__total_seq = -1ULL;
-
-  /* If there are waiters which have been already signalled or
-     broadcasted, but still are using the pthread_cond_t structure,
-     pthread_cond_destroy needs to wait for them.  */
-  unsigned int nwaiters = cond->__data.__nwaiters;
-
-  if (nwaiters >= (1 << COND_NWAITERS_SHIFT))
+  /* Set the wake request flag.  We could also spin, but destruction that is
+     concurrent with still-active waiters is probably neither common nor
+     performance critical.  Acquire MO to synchronize with waiters confirming
+     that they finished.  */
+  unsigned int wrefs = atomic_fetch_or_acquire (&cond->__data.__wrefs, 4);
+  int private = __condvar_get_private (wrefs);
+  while (wrefs >> 3 != 0)
     {
-      /* Wake everybody on the associated mutex in case there are
-	 threads that have been requeued to it.
-	 Without this, pthread_cond_destroy could block potentially
-	 for a long time or forever, as it would depend on other
-	 thread's using the mutex.
-	 When all threads waiting on the mutex are woken up, pthread_cond_wait
-	 only waits for threads to acquire and release the internal
-	 condvar lock.  */
-      if (cond->__data.__mutex != NULL
-	  && cond->__data.__mutex != (void *) ~0l)
-	{
-	  pthread_mutex_t *mut = (pthread_mutex_t *) cond->__data.__mutex;
-	  lll_futex_wake (&mut->__data.__lock, INT_MAX,
-			  PTHREAD_MUTEX_PSHARED (mut));
-	}
-
-      do
-	{
-	  lll_unlock (cond->__data.__lock, pshared);
-
-	  lll_futex_wait (&cond->__data.__nwaiters, nwaiters, pshared);
-
-	  lll_lock (cond->__data.__lock, pshared);
-
-	  nwaiters = cond->__data.__nwaiters;
-	}
-      while (nwaiters >= (1 << COND_NWAITERS_SHIFT));
+      futex_wait_simple (&cond->__data.__wrefs, wrefs, private);
+      /* See above.  */
+      wrefs = atomic_load_acquire (&cond->__data.__wrefs);
     }
-
+  /* The memory the condvar occupies can now be reused.  */
   return 0;
 }
 versioned_symbol (libpthread, __pthread_cond_destroy,
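
The comment above spells out the destruction contract; in practice it amounts
to the following pattern (a sketch with illustrative names):

    #include <pthread.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
    static int done;                 /* Protected by LOCK.  */

    void
    shut_down (void)
    {
      pthread_mutex_lock (&lock);
      done = 1;
      /* Broadcast while holding the mutex: every waiter has either
         returned or only still needs to confirm its wake-up, which
         __pthread_cond_destroy waits for via __wrefs.  */
      pthread_cond_broadcast (&cond);
      pthread_mutex_unlock (&lock);
      pthread_cond_destroy (&cond);  /* Memory may be reused afterwards.  */
    }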
diff --git a/nptl/pthread_cond_init.c b/nptl/pthread_cond_init.c
index 9023370278..c1eac5f779 100644
--- a/nptl/pthread_cond_init.c
+++ b/nptl/pthread_cond_init.c
@@ -19,25 +19,29 @@
 #include <shlib-compat.h>
 #include "pthreadP.h"
 #include <stap-probe.h>
+#include <string.h>
 
 
+/* See __pthread_cond_wait for details.  */
 int
 __pthread_cond_init (pthread_cond_t *cond, const pthread_condattr_t *cond_attr)
 {
   struct pthread_condattr *icond_attr = (struct pthread_condattr *) cond_attr;
 
-  cond->__data.__lock = LLL_LOCK_INITIALIZER;
-  cond->__data.__futex = 0;
-  cond->__data.__nwaiters = (icond_attr != NULL
-			     ? ((icond_attr->value >> 1)
-				& ((1 << COND_NWAITERS_SHIFT) - 1))
-			     : CLOCK_REALTIME);
-  cond->__data.__total_seq = 0;
-  cond->__data.__wakeup_seq = 0;
-  cond->__data.__woken_seq = 0;
-  cond->__data.__mutex = (icond_attr == NULL || (icond_attr->value & 1) == 0
-			  ? NULL : (void *) ~0l);
-  cond->__data.__broadcast_seq = 0;
+  memset (cond, 0, sizeof (pthread_cond_t));
+
+  /* Update the pretty printers if the internal representation of icond_attr
+     is changed.  */
+
+  /* If the shared bit in the attribute is set, this is a process-shared
+     condvar.  */
+  if (icond_attr != NULL && (icond_attr->value & 1) != 0)
+    cond->__data.__wrefs |= __PTHREAD_COND_SHARED_MASK;
+  int clockid = (icond_attr != NULL
+		 ? ((icond_attr->value >> 1) & ((1 << COND_CLOCK_BITS) - 1))
+		 : CLOCK_REALTIME);
+  /* If 0, CLOCK_REALTIME is used; CLOCK_MONOTONIC otherwise.  */
+  if (clockid != CLOCK_REALTIME)
+    cond->__data.__wrefs |= __PTHREAD_COND_CLOCK_MONOTONIC_MASK;
 
   LIBC_PROBE (cond_init, 2, cond, cond_attr);
 
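
The clock bit that __pthread_cond_init stores in __wrefs is selected through
the condattr interface.  A usage sketch:

    #include <pthread.h>
    #include <time.h>

    int
    make_monotonic_cond (pthread_cond_t *cond)
    {
      pthread_condattr_t attr;
      int err = pthread_condattr_init (&attr);
      if (err != 0)
        return err;
      /* Recorded as __PTHREAD_COND_CLOCK_MONOTONIC_MASK in __wrefs.  */
      err = pthread_condattr_setclock (&attr, CLOCK_MONOTONIC);
      if (err == 0)
        err = pthread_cond_init (cond, &attr);
      pthread_condattr_destroy (&attr);
      return err;
    }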
diff --git a/nptl/pthread_cond_signal.c b/nptl/pthread_cond_signal.c
index b3a6d3d2a4..a95d5690af 100644
--- a/nptl/pthread_cond_signal.c
+++ b/nptl/pthread_cond_signal.c
@@ -19,62 +19,79 @@
 #include <endian.h>
 #include <errno.h>
 #include <sysdep.h>
-#include <lowlevellock.h>
+#include <futex-internal.h>
 #include <pthread.h>
 #include <pthreadP.h>
+#include <atomic.h>
+#include <stdint.h>
 
 #include <shlib-compat.h>
-#include <kernel-features.h>
 #include <stap-probe.h>
 
+#include "pthread_cond_common.c"
 
+/* See __pthread_cond_wait for a high-level description of the algorithm.  */
 int
 __pthread_cond_signal (pthread_cond_t *cond)
 {
-  int pshared = (cond->__data.__mutex == (void *) ~0l)
-		? LLL_SHARED : LLL_PRIVATE;
-
   LIBC_PROBE (cond_signal, 1, cond);
 
-  /* Make sure we are alone.  */
-  lll_lock (cond->__data.__lock, pshared);
-
-  /* Are there any waiters to be woken?  */
-  if (cond->__data.__total_seq > cond->__data.__wakeup_seq)
+  /* First check whether there are waiters.  Relaxed MO is fine for that for
+     the same reasons that relaxed MO is fine when observing __wseq (see
+     below).  */
+  unsigned int wrefs = atomic_load_relaxed (&cond->__data.__wrefs);
+  if (wrefs >> 3 == 0)
+    return 0;
+  int private = __condvar_get_private (wrefs);
+
+  __condvar_acquire_lock (cond, private);
+
+  /* Load the waiter sequence number, which represents our relative ordering
+     to any waiters.  Relaxed MO is sufficient for that because:
+     1) We can pick any position that is allowed by external happens-before
+        constraints.  In particular, if another __pthread_cond_wait call
+        happened before us, this waiter must be eligible for being woken by
+        us.  The only way to establish such a happens-before is by signaling
+        while having acquired the mutex associated with the condvar and
+        ensuring that the signal's critical section happens after the waiter.
+        Thus, the mutex ensures that we see that waiter's __wseq increase.
+     2) Once we pick a position, we do not need to communicate this to the
+        program via a happens-before that we set up: First, any wake-up could
+        be a spurious wake-up, so the program must not interpret a wake-up as
+        an indication that the waiter happened before a particular signal;
+        second, a program cannot detect whether a waiter has not yet been
+        woken (i.e., it cannot distinguish between a non-woken waiter and one
+        that has been woken but hasn't resumed execution yet), and thus it
+        cannot try to deduce that a signal happened before a particular
+        waiter.  */
+  unsigned long long int wseq = __condvar_load_wseq_relaxed (cond);
+  unsigned int g1 = (wseq & 1) ^ 1;
+  wseq >>= 1;
+  bool do_futex_wake = false;
+
+  /* If G1 is still receiving signals, we put the signal there.  If not, we
+     check if G2 has waiters, and if so, quiesce and switch G1 to the former
+     G2; if this results in a new G1 with waiters (G2 might have cancellations
+     already, see __condvar_quiesce_and_switch_g1), we put the signal in the
+     new G1.  */
+  if ((cond->__data.__g_size[g1] != 0)
+      || __condvar_quiesce_and_switch_g1 (cond, wseq, &g1, private))
     {
-      /* Yes.  Mark one of them as woken.  */
-      ++cond->__data.__wakeup_seq;
-      ++cond->__data.__futex;
-
-#if (defined lll_futex_cmp_requeue_pi \
-     && defined __ASSUME_REQUEUE_PI)
-      pthread_mutex_t *mut = cond->__data.__mutex;
-
-      if (USE_REQUEUE_PI (mut)
-	/* This can only really fail with a ENOSYS, since nobody can modify
-	   futex while we have the cond_lock.  */
-	  && lll_futex_cmp_requeue_pi (&cond->__data.__futex, 1, 0,
-				       &mut->__data.__lock,
-				       cond->__data.__futex, pshared) == 0)
-	{
-	  lll_unlock (cond->__data.__lock, pshared);
-	  return 0;
-	}
-      else
-#endif
-	/* Wake one.  */
-	if (! __builtin_expect (lll_futex_wake_unlock (&cond->__data.__futex,
-						       1, 1,
-						       &cond->__data.__lock,
-						       pshared), 0))
-	  return 0;
-
-      /* Fallback if neither of them work.  */
-      lll_futex_wake (&cond->__data.__futex, 1, pshared);
+      /* Add a signal.  Relaxed MO is fine because signaling does not need to
+	 establish a happens-before relation (see above).  We do not mask the
+	 release-MO store when initializing a group in
+	 __condvar_quiesce_and_switch_g1 because we use an atomic
+	 read-modify-write and thus extend that store's release sequence.  */
+      atomic_fetch_add_relaxed (cond->__data.__g_signals + g1, 2);
+      cond->__data.__g_size[g1]--;
+      /* TODO Only set it if there are indeed futex waiters.  */
+      do_futex_wake = true;
     }
 
-  /* We are done.  */
-  lll_unlock (cond->__data.__lock, pshared);
+  __condvar_release_lock (cond, private);
+
+  if (do_futex_wake)
+    futex_wake (cond->__data.__g_signals + g1, 1, private);
 
   return 0;
 }
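
Point (1) in the comment above is the reason for the usual rule that the
predicate must be updated under the mutex before signaling; otherwise the fast
path (wrefs >> 3 == 0) may run before the waiter has registered in __wseq.  A
hedged sketch of the broken and the correct variant (illustrative names):

    #include <pthread.h>

    void
    notify_broken (pthread_mutex_t *lock, pthread_cond_t *cond, int *flag)
    {
      (void) lock;                  /* Deliberately unused: the bug.  */
      *flag = 1;                    /* Data race: no mutex held.  */
      pthread_cond_signal (cond);   /* May see no waiter and return before
                                       the waiter increments __wseq.  */
    }

    void
    notify_fixed (pthread_mutex_t *lock, pthread_cond_t *cond, int *flag)
    {
      pthread_mutex_lock (lock);
      *flag = 1;                    /* The waiter's predicate check and its
                                       __wseq increment are now ordered
                                       against this critical section.  */
      pthread_mutex_unlock (lock);
      pthread_cond_signal (cond);
    }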
diff --git a/nptl/pthread_cond_timedwait.c b/nptl/pthread_cond_timedwait.c
deleted file mode 100644
index 711a51de20..0000000000
--- a/nptl/pthread_cond_timedwait.c
+++ /dev/null
@@ -1,268 +0,0 @@
-/* Copyright (C) 2003-2016 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-   Contributed by Martin Schwidefsky <schwidefsky@de.ibm.com>, 2003.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <endian.h>
-#include <errno.h>
-#include <sysdep.h>
-#include <lowlevellock.h>
-#include <pthread.h>
-#include <pthreadP.h>
-#include <sys/time.h>
-#include <kernel-features.h>
-
-#include <shlib-compat.h>
-
-#ifndef HAVE_CLOCK_GETTIME_VSYSCALL
-# undef INTERNAL_VSYSCALL
-# define INTERNAL_VSYSCALL INTERNAL_SYSCALL
-# undef INLINE_VSYSCALL
-# define INLINE_VSYSCALL INLINE_SYSCALL
-#else
-# include <libc-vdso.h>
-#endif
-
-/* Cleanup handler, defined in pthread_cond_wait.c.  */
-extern void __condvar_cleanup (void *arg)
-     __attribute__ ((visibility ("hidden")));
-
-struct _condvar_cleanup_buffer
-{
-  int oldtype;
-  pthread_cond_t *cond;
-  pthread_mutex_t *mutex;
-  unsigned int bc_seq;
-};
-
-int
-__pthread_cond_timedwait (pthread_cond_t *cond, pthread_mutex_t *mutex,
-			  const struct timespec *abstime)
-{
-  struct _pthread_cleanup_buffer buffer;
-  struct _condvar_cleanup_buffer cbuffer;
-  int result = 0;
-
-  /* Catch invalid parameters.  */
-  if (abstime->tv_nsec < 0 || abstime->tv_nsec >= 1000000000)
-    return EINVAL;
-
-  int pshared = (cond->__data.__mutex == (void *) ~0l)
-		? LLL_SHARED : LLL_PRIVATE;
-
-#if (defined lll_futex_timed_wait_requeue_pi \
-     && defined __ASSUME_REQUEUE_PI)
-  int pi_flag = 0;
-#endif
-
-  /* Make sure we are alone.  */
-  lll_lock (cond->__data.__lock, pshared);
-
-  /* Now we can release the mutex.  */
-  int err = __pthread_mutex_unlock_usercnt (mutex, 0);
-  if (err)
-    {
-      lll_unlock (cond->__data.__lock, pshared);
-      return err;
-    }
-
-  /* We have one new user of the condvar.  */
-  ++cond->__data.__total_seq;
-  ++cond->__data.__futex;
-  cond->__data.__nwaiters += 1 << COND_NWAITERS_SHIFT;
-
-  /* Work around the fact that the kernel rejects negative timeout values
-     despite them being valid.  */
-  if (__glibc_unlikely (abstime->tv_sec < 0))
-    goto timeout;
-
-  /* Remember the mutex we are using here.  If there is already a
-     different address store this is a bad user bug.  Do not store
-     anything for pshared condvars.  */
-  if (cond->__data.__mutex != (void *) ~0l)
-    cond->__data.__mutex = mutex;
-
-  /* Prepare structure passed to cancellation handler.  */
-  cbuffer.cond = cond;
-  cbuffer.mutex = mutex;
-
-  /* Before we block we enable cancellation.  Therefore we have to
-     install a cancellation handler.  */
-  __pthread_cleanup_push (&buffer, __condvar_cleanup, &cbuffer);
-
-  /* The current values of the wakeup counter.  The "woken" counter
-     must exceed this value.  */
-  unsigned long long int val;
-  unsigned long long int seq;
-  val = seq = cond->__data.__wakeup_seq;
-  /* Remember the broadcast counter.  */
-  cbuffer.bc_seq = cond->__data.__broadcast_seq;
-
-  while (1)
-    {
-#if (!defined __ASSUME_FUTEX_CLOCK_REALTIME \
-     || !defined lll_futex_timed_wait_bitset)
-      struct timespec rt;
-      {
-# ifdef __NR_clock_gettime
-	INTERNAL_SYSCALL_DECL (err);
-	(void) INTERNAL_VSYSCALL (clock_gettime, err, 2,
-				  (cond->__data.__nwaiters
-				   & ((1 << COND_NWAITERS_SHIFT) - 1)),
-				  &rt);
-	/* Convert the absolute timeout value to a relative timeout.  */
-	rt.tv_sec = abstime->tv_sec - rt.tv_sec;
-	rt.tv_nsec = abstime->tv_nsec - rt.tv_nsec;
-# else
-	/* Get the current time.  So far we support only one clock.  */
-	struct timeval tv;
-	(void) __gettimeofday (&tv, NULL);
-
-	/* Convert the absolute timeout value to a relative timeout.  */
-	rt.tv_sec = abstime->tv_sec - tv.tv_sec;
-	rt.tv_nsec = abstime->tv_nsec - tv.tv_usec * 1000;
-# endif
-      }
-      if (rt.tv_nsec < 0)
-	{
-	  rt.tv_nsec += 1000000000;
-	  --rt.tv_sec;
-	}
-      /* Did we already time out?  */
-      if (__glibc_unlikely (rt.tv_sec < 0))
-	{
-	  if (cbuffer.bc_seq != cond->__data.__broadcast_seq)
-	    goto bc_out;
-
-	  goto timeout;
-	}
-#endif
-
-      unsigned int futex_val = cond->__data.__futex;
-
-      /* Prepare to wait.  Release the condvar futex.  */
-      lll_unlock (cond->__data.__lock, pshared);
-
-      /* Enable asynchronous cancellation.  Required by the standard.  */
-      cbuffer.oldtype = __pthread_enable_asynccancel ();
-
-/* REQUEUE_PI was implemented after FUTEX_CLOCK_REALTIME, so it is sufficient
-   to check just the former.  */
-#if (defined lll_futex_timed_wait_requeue_pi \
-     && defined __ASSUME_REQUEUE_PI)
-      /* If pi_flag remained 1 then it means that we had the lock and the mutex
-	 but a spurious waker raced ahead of us.  Give back the mutex before
-	 going into wait again.  */
-      if (pi_flag)
-	{
-	  __pthread_mutex_cond_lock_adjust (mutex);
-	  __pthread_mutex_unlock_usercnt (mutex, 0);
-	}
-      pi_flag = USE_REQUEUE_PI (mutex);
-
-      if (pi_flag)
-	{
-	  unsigned int clockbit = (cond->__data.__nwaiters & 1
-				   ? 0 : FUTEX_CLOCK_REALTIME);
-	  err = lll_futex_timed_wait_requeue_pi (&cond->__data.__futex,
-						 futex_val, abstime, clockbit,
-						 &mutex->__data.__lock,
-						 pshared);
-	  pi_flag = (err == 0);
-	}
-      else
-#endif
-
-	{
-#if (!defined __ASSUME_FUTEX_CLOCK_REALTIME \
-     || !defined lll_futex_timed_wait_bitset)
-	  /* Wait until woken by signal or broadcast.  */
-	  err = lll_futex_timed_wait (&cond->__data.__futex,
-				      futex_val, &rt, pshared);
-#else
-	  unsigned int clockbit = (cond->__data.__nwaiters & 1
-				   ? 0 : FUTEX_CLOCK_REALTIME);
-	  err = lll_futex_timed_wait_bitset (&cond->__data.__futex, futex_val,
-					     abstime, clockbit, pshared);
-#endif
-	}
-
-      /* Disable asynchronous cancellation.  */
-      __pthread_disable_asynccancel (cbuffer.oldtype);
-
-      /* We are going to look at shared data again, so get the lock.  */
-      lll_lock (cond->__data.__lock, pshared);
-
-      /* If a broadcast happened, we are done.  */
-      if (cbuffer.bc_seq != cond->__data.__broadcast_seq)
-	goto bc_out;
-
-      /* Check whether we are eligible for wakeup.  */
-      val = cond->__data.__wakeup_seq;
-      if (val != seq && cond->__data.__woken_seq != val)
-	break;
-
-      /* Not woken yet.  Maybe the time expired?  */
-      if (__glibc_unlikely (err == -ETIMEDOUT))
-	{
-	timeout:
-	  /* Yep.  Adjust the counters.  */
-	  ++cond->__data.__wakeup_seq;
-	  ++cond->__data.__futex;
-
-	  /* The error value.  */
-	  result = ETIMEDOUT;
-	  break;
-	}
-    }
-
-  /* Another thread woken up.  */
-  ++cond->__data.__woken_seq;
-
- bc_out:
-
-  cond->__data.__nwaiters -= 1 << COND_NWAITERS_SHIFT;
-
-  /* If pthread_cond_destroy was called on this variable already,
-     notify the pthread_cond_destroy caller all waiters have left
-     and it can be successfully destroyed.  */
-  if (cond->__data.__total_seq == -1ULL
-      && cond->__data.__nwaiters < (1 << COND_NWAITERS_SHIFT))
-    lll_futex_wake (&cond->__data.__nwaiters, 1, pshared);
-
-  /* We are done with the condvar.  */
-  lll_unlock (cond->__data.__lock, pshared);
-
-  /* The cancellation handling is back to normal, remove the handler.  */
-  __pthread_cleanup_pop (&buffer, 0);
-
-  /* Get the mutex before returning.  */
-#if (defined lll_futex_timed_wait_requeue_pi \
-     && defined __ASSUME_REQUEUE_PI)
-  if (pi_flag)
-    {
-      __pthread_mutex_cond_lock_adjust (mutex);
-      err = 0;
-    }
-  else
-#endif
-    err = __pthread_mutex_cond_lock (mutex);
-
-  return err ?: result;
-}
-
-versioned_symbol (libpthread, __pthread_cond_timedwait, pthread_cond_timedwait,
-		  GLIBC_2_3_2);
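
The deleted implementation above converted the caller's absolute deadline into
a relative timeout for futexes lacking FUTEX_CLOCK_REALTIME; the API itself
always takes an absolute time.  A sketch of how a caller typically builds one
(LOCK must already be held; names are illustrative):

    #include <errno.h>
    #include <pthread.h>
    #include <time.h>

    int
    wait_with_timeout (pthread_cond_t *cond, pthread_mutex_t *lock,
                       int *flag, long timeout_ns)
    {
      struct timespec deadline;
      /* CLOCK_REALTIME is the default condvar clock unless changed with
         pthread_condattr_setclock.  */
      clock_gettime (CLOCK_REALTIME, &deadline);
      deadline.tv_sec += timeout_ns / 1000000000;
      deadline.tv_nsec += timeout_ns % 1000000000;
      if (deadline.tv_nsec >= 1000000000)
        {
          deadline.tv_nsec -= 1000000000;
          deadline.tv_sec++;
        }

      int err = 0;
      while (!*flag && err != ETIMEDOUT)
        err = pthread_cond_timedwait (cond, lock, &deadline);
      return (*flag) ? 0 : err;
    }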
diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c
index 3f62acc6bd..2b434026c6 100644
--- a/nptl/pthread_cond_wait.c
+++ b/nptl/pthread_cond_wait.c
@@ -19,219 +19,655 @@
 #include <endian.h>
 #include <errno.h>
 #include <sysdep.h>
-#include <lowlevellock.h>
+#include <futex-internal.h>
 #include <pthread.h>
 #include <pthreadP.h>
-#include <kernel-features.h>
+#include <sys/time.h>
+#include <atomic.h>
+#include <stdint.h>
+#include <stdbool.h>
 
 #include <shlib-compat.h>
 #include <stap-probe.h>
+#include <time.h>
+
+#include "pthread_cond_common.c"
+
 
 struct _condvar_cleanup_buffer
 {
-  int oldtype;
+  uint64_t wseq;
   pthread_cond_t *cond;
   pthread_mutex_t *mutex;
-  unsigned int bc_seq;
+  int private;
 };
 
 
-void
-__attribute__ ((visibility ("hidden")))
-__condvar_cleanup (void *arg)
+/* Decrease the waiter reference count.  */
+static void
+__condvar_confirm_wakeup (pthread_cond_t *cond, int private)
 {
-  struct _condvar_cleanup_buffer *cbuffer =
-    (struct _condvar_cleanup_buffer *) arg;
-  unsigned int destroying;
-  int pshared = (cbuffer->cond->__data.__mutex == (void *) ~0l)
-		? LLL_SHARED : LLL_PRIVATE;
+  /* If destruction is pending (i.e., the wake-request flag is nonzero) and we
+     are the last waiter (the prior waiter count in __wrefs was 1), then wake
+     any
+     threads waiting in pthread_cond_destroy.  Release MO to synchronize with
+     these threads.  Don't bother clearing the wake-up request flag.  */
+  if ((atomic_fetch_add_release (&cond->__data.__wrefs, -8) >> 2) == 3)
+    futex_wake (&cond->__data.__wrefs, INT_MAX, private);
+}
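+
+/* A worked example of the check above, using the __wrefs layout described
+   in the overview comment in this file (waiter count in bits 3 and up,
+   wake-request flag in bit 2, clock and pshared flags in bits 1 and 0,
+   which the >> 2 discards): with one waiter left and destruction pending,
+   __wrefs holds (1 << 3) | (1 << 2) plus flag bits 0 and 1, so the
+   fetch-add returns a value in [12, 15] and >> 2 yields exactly 3.  With
+   no pending destruction, the prior value is in [8, 11] and >> 2 yields 2,
+   so no wake-up is issued.  */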
+
 
-  /* We are going to modify shared data.  */
-  lll_lock (cbuffer->cond->__data.__lock, pshared);
+/* Cancel waiting after having registered as a waiter previously.  SEQ is our
+   position and G is our group index.
+   The goal of cancellation is to make our group smaller if that is still
+   possible.  If we are in a closed group, this is not possible anymore; in
+   this case, we need to send a replacement signal for the one we effectively
+   consumed because the signal should have gotten consumed by another waiter
+   instead; we must not both cancel waiting and consume a signal.
+
+   Must not be called while still holding a reference on the group.
+
+   If we effectively consumed a signal, we send a replacement signal
+   before returning.
+
+   On some kinds of timeouts, we may be able to pretend that a signal we
+   effectively consumed happened before the timeout (i.e., similarly to first
+   spinning on signals before actually checking whether the timeout has
+   passed already).  Doing this would allow us to skip sending a replacement
+   signal, but this case might happen rarely because the end of the timeout
+   must race with someone else sending a signal.  Therefore, we don't bother
+   trying to optimize this.  */
+static void
+__condvar_cancel_waiting (pthread_cond_t *cond, uint64_t seq, unsigned int g,
+			  int private)
+{
+  bool consumed_signal = false;
 
-  if (cbuffer->bc_seq == cbuffer->cond->__data.__broadcast_seq)
+  /* No deadlock with group switching is possible here because we do not
+     hold a reference on the group.  */
+  __condvar_acquire_lock (cond, private);
+
+  uint64_t g1_start = __condvar_load_g1_start_relaxed (cond) >> 1;
+  if (g1_start > seq)
+    {
+      /* Our group is closed, so someone provided enough signals for it.
+	 Thus, we effectively consumed a signal.  */
+      consumed_signal = true;
+    }
+  else
     {
-      /* This thread is not waiting anymore.  Adjust the sequence counters
-	 appropriately.  We do not increment WAKEUP_SEQ if this would
-	 bump it over the value of TOTAL_SEQ.  This can happen if a thread
-	 was woken and then canceled.  */
-      if (cbuffer->cond->__data.__wakeup_seq
-	  < cbuffer->cond->__data.__total_seq)
+      if (g1_start + __condvar_get_orig_size (cond) <= seq)
+	{
+	  /* We are in the current G2 and thus cannot have consumed a signal.
+	     Reduce its effective size or handle overflow.  Remember that in
+	     G2, the unsigned size is zero or effectively a negative value.  */
+	  if (cond->__data.__g_size[g] + __PTHREAD_COND_MAX_GROUP_SIZE > 0)
+	    {
+	      cond->__data.__g_size[g]--;
+	    }
+	  else
+	    {
+	      /* Cancellations would overflow the maximum group size.  Just
+		 wake up everyone spuriously to create a clean state.  This
+		 also means we do not consume a signal someone else sent.  */
+	      __condvar_release_lock (cond, private);
+	      __pthread_cond_broadcast (cond);
+	      return;
+	    }
+	}
+      else
 	{
-	  ++cbuffer->cond->__data.__wakeup_seq;
-	  ++cbuffer->cond->__data.__futex;
+	  /* We are in current G1.  If the group's size is zero, someone put
+	     a signal in the group that nobody else but us can consume.  */
+	  if (cond->__data.__g_size[g] == 0)
+	    consumed_signal = true;
+	  else
+	    {
+	      /* Otherwise, we decrease the size of the group.  This is
+		 equivalent to atomically putting in a signal just for us and
+		 consuming it right away.  We do not consume a signal sent
+		 by someone else.  We also cannot have consumed a futex
+		 wake-up because if we were cancelled or timed out in a futex
+		 call, the futex will wake another waiter.  */
+	      cond->__data.__g_size[g]--;
+	    }
 	}
-      ++cbuffer->cond->__data.__woken_seq;
     }
 
-  cbuffer->cond->__data.__nwaiters -= 1 << COND_NWAITERS_SHIFT;
+  __condvar_release_lock (cond, private);
 
-  /* If pthread_cond_destroy was called on this variable already,
-     notify the pthread_cond_destroy caller all waiters have left
-     and it can be successfully destroyed.  */
-  destroying = 0;
-  if (cbuffer->cond->__data.__total_seq == -1ULL
-      && cbuffer->cond->__data.__nwaiters < (1 << COND_NWAITERS_SHIFT))
+  if (consumed_signal)
     {
-      lll_futex_wake (&cbuffer->cond->__data.__nwaiters, 1, pshared);
-      destroying = 1;
+      /* We effectively consumed a signal even though we didn't want to.
+	 Therefore, we need to send a replacement signal.
+	 If we wanted to optimize this, we could do what
+	 pthread_cond_signal does right in the critical section above.  */
+      __pthread_cond_signal (cond);
     }
+}
 
-  /* We are done.  */
-  lll_unlock (cbuffer->cond->__data.__lock, pshared);
-
-  /* Wake everybody to make sure no condvar signal gets lost.  */
-  if (! destroying)
-    lll_futex_wake (&cbuffer->cond->__data.__futex, INT_MAX, pshared);
-
-  /* Get the mutex before returning unless asynchronous cancellation
-     is in effect.  We don't try to get the mutex if we already own it.  */
-  if (!(USE_REQUEUE_PI (cbuffer->mutex))
-      || ((cbuffer->mutex->__data.__lock & FUTEX_TID_MASK)
-	  != THREAD_GETMEM (THREAD_SELF, tid)))
-  {
-    __pthread_mutex_cond_lock (cbuffer->mutex);
-  }
-  else
-    __pthread_mutex_cond_lock_adjust (cbuffer->mutex);
+/* Drop a group reference and wake up any signalers that might be waiting
+   for the group to quiesce.  */
+static void
+__condvar_dec_grefs (pthread_cond_t *cond, unsigned int g, int private)
+{
+  /* Release MO to synchronize-with the acquire load in
+     __condvar_quiesce_and_switch_g1.  */
+  if (atomic_fetch_add_release (cond->__data.__g_refs + g, -2) == 3)
+    {
+      /* Clear the wake-up request flag before waking up.  We do not need more
+	 than relaxed MO and it doesn't matter if we apply this for an aliased
+	 group because we wake all futex waiters right after clearing the
+	 flag.  */
+      atomic_fetch_and_relaxed (cond->__data.__g_refs + g, ~(unsigned int) 1);
+      futex_wake (cond->__data.__g_refs + g, INT_MAX, private);
+    }
 }
 
+/* Clean-up for cancellation of waiters waiting for normal signals.  We cancel
+   our registration as a waiter, confirm we have woken up, and re-acquire the
+   mutex.  */
+static void
+__condvar_cleanup_waiting (void *arg)
+{
+  struct _condvar_cleanup_buffer *cbuffer =
+    (struct _condvar_cleanup_buffer *) arg;
+  pthread_cond_t *cond = cbuffer->cond;
+  unsigned g = cbuffer->wseq & 1;
 
-int
-__pthread_cond_wait (pthread_cond_t *cond, pthread_mutex_t *mutex)
+  __condvar_dec_grefs (cond, g, cbuffer->private);
+
+  __condvar_cancel_waiting (cond, cbuffer->wseq >> 1, g, cbuffer->private);
+  /* FIXME With the current cancellation implementation, it is possible that
+     a thread is cancelled after it has returned from a syscall.  This could
+     result in a cancelled waiter consuming a futex wake-up that is then
+     causing another waiter in the same group to not wake up.  To work around
+     this issue until we have fixed cancellation, just add a futex wake-up
+     conservatively.  */
+  futex_wake (cond->__data.__g_signals + g, 1, cbuffer->private);
+
+  __condvar_confirm_wakeup (cond, cbuffer->private);
+
+  /* XXX If locking the mutex fails, should we just stop execution?  This
+     might be better than silently ignoring the error.  */
+  __pthread_mutex_cond_lock (cbuffer->mutex);
+}
+
+/* This condvar implementation guarantees that all calls to signal and
+   broadcast and all of the three virtually atomic parts of each call to wait
+   (i.e., (1) releasing the mutex and blocking, (2) unblocking, and (3) re-
+   acquiring the mutex) happen in some total order that is consistent with the
+   happens-before relations in the calling program.  However, this order does
+   not necessarily result in additional happens-before relations being
+   established (which aligns well with spurious wake-ups being allowed).
+
+   All waiters acquire a certain position in a 64b waiter sequence (__wseq).
+   This sequence determines which waiters are allowed to consume signals.
+   A broadcast is equal to sending as many signals as are unblocked waiters.
+   When a signal arrives, it samples the current value of __wseq with a
+   relaxed-MO load (i.e., the position the next waiter would get).  (This is
+   sufficient because it is consistent with happens-before; the caller can
+   enforce stronger ordering constraints by calling signal while holding the
+   mutex.)  Only waiters with a position less than the __wseq value observed
+   by the signal are eligible to consume this signal.
+
+   This would be straightforward to implement if waiters would just spin, but
+   we need to let them block using futexes.  Futexes give no guarantee of
+   waking in FIFO order, so we cannot reliably wake eligible waiters if we
+   just use a single futex.  Also, futex words are 32b in size, but we need
+   to distinguish more than 1<<32 states because we need to represent the
+   order of wake-up (and thus which waiters are eligible to consume signals);
+   blocking in a futex is not atomic with a waiter determining its position in
+   the waiter sequence, so we need the futex word to reliably notify waiters
+   that they should not attempt to block anymore because they have been
+   already signaled in the meantime.  While an ABA issue on a 32b value will
+   be rare, ignoring it when we are aware of it is not the right thing to do
+   either.
+
+   Therefore, we use a 64b counter to represent the waiter sequence (on
+   architectures which only support 32b atomics, we use a few bits less).
+   To deal with the blocking using futexes, we maintain two groups of waiters:
+   * Group G1 consists of waiters that are all eligible to consume signals;
+     incoming signals will always signal waiters in this group until all
+     waiters in G1 have been signaled.
+   * Group G2 consists of waiters that arrive when a G1 is present and still
+     contains waiters that have not been signaled.  When all waiters in G1
+     are signaled and a new signal arrives, the new signal will convert G2
+     into the new G1 and create a new G2 for future waiters.
+
+   We cannot allocate new memory because of process-shared condvars, so we
+   have just two slots of groups that change their role between G1 and G2.
+   Each has a separate futex word, a number of signals available for
+   consumption, a size (number of waiters in the group that have not been
+   signaled), and a reference count.
+
+   The group reference count is used to maintain the number of waiters that
+   are using the group's futex.  Before a group can change its role, the
+   reference count must show that no waiters are using the futex anymore; this
+   prevents ABA issues on the futex word.
+
+   To represent which intervals in the waiter sequence the groups cover (and
+   thus also which group slot contains G1 or G2), we use a 64b counter to
+   designate the start position of G1 (inclusive), and a single bit in the
+   waiter sequence counter to represent which group slot currently contains
+   G2.  This allows us to switch group roles atomically wrt. waiters obtaining
+   a position in the waiter sequence.  The G1 start position allows waiters to
+   figure out whether they are in a group that has already been completely
+   signaled (i.e., if the current G1 starts at a later position than the
+   waiter's position).  Waiters cannot determine whether they are currently
+   in G2 or G1 -- but they do not have to because all they are interested in
+   is whether there are available signals, and they always start in G2 (whose
+   group slot they know because of the bit in the waiter sequence).  Signalers
+   will simply fill the right group until it is completely signaled and can
+   be closed (they do not switch group roles until they really have to, to
+   decrease the likelihood of having to wait for waiters still holding a
+   reference on the now-closed G1).
+
+   Signalers maintain the initial size of G1 to be able to determine where
+   G2 starts (G2 is always open-ended until it becomes G1).  They track the
+   remaining size of a group; when waiters cancel waiting (due to PThreads
+   cancellation or timeouts), they will decrease this remaining size as well.
+
+   To implement condvar destruction requirements (i.e., that
+   pthread_cond_destroy can be called as soon as all waiters have been
+   signaled), waiters increment a reference count before starting to wait and
+   decrement it after they stopped waiting but right before they acquire the
+   mutex associated with the condvar.
+
+   pthread_cond_t thus consists of the following fields (some bits are used
+   for flags and are not part of the primary value of a field; they are
+   needed to make some operations atomic or because there was no space for
+   them elsewhere in the data structure):
+
+   __wseq: Waiter sequence counter
+     * LSB is index of current G2.
+     * Waiters fetch-add while having acquired the mutex associated with the
+       condvar.  Signalers load it and fetch-xor it concurrently.
+   __g1_start: Starting position of G1 (inclusive)
+     * LSB is index of current G2.
+     * Modified by signalers while having acquired the condvar-internal lock
+       and observed concurrently by waiters.
+   __g1_orig_size: Initial size of G1
+     * The two least-significant bits represent the condvar-internal lock.
+     * Only accessed while having acquired the condvar-internal lock.
+   __wrefs: Waiter reference counter.
+     * Bit 2 is true if waiters should run futex_wake when they remove the
+       last reference.  pthread_cond_destroy uses this as futex word.
+     * Bit 1 is the clock ID (0 == CLOCK_REALTIME, 1 == CLOCK_MONOTONIC).
+     * Bit 0 is true iff this is a process-shared condvar.
+     * Simple reference count used by both waiters and pthread_cond_destroy.
+     (If the format of __wrefs is changed, update nptl_lock_constants.pysym
+      and the pretty printers.)
+   For each of the two groups, we have:
+   __g_refs: Futex waiter reference count.
+     * LSB is true if waiters should run futex_wake when they remove the
+       last reference.
+     * Reference count used by waiters concurrently with signalers that have
+       acquired the condvar-internal lock.
+   __g_signals: The number of signals that can still be consumed.
+     * Used as a futex word by waiters.  Used concurrently by waiters and
+       signalers.
+     * LSB is true iff this group has been completely signaled (i.e., it is
+       closed).
+   __g_size: Waiters remaining in this group (i.e., which have not been
+     signaled yet).
+     * Accessed by signalers and waiters that cancel waiting (both do so only
+       when having acquired the condvar-internal lock).
+     * The size of G2 is always zero because it cannot be determined until
+       the group becomes G1.
+     * Although this is of unsigned type, we rely on using unsigned overflow
+       rules to make this hold effectively negative values too (in
+       particular, when waiters in G2 cancel waiting).
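+     * Sketch of the cancellation arithmetic (this assumes, as an
+       illustration, that the group switch adds the new G1's size into this
+       field): if two waiters cancel while the slot is G2, __g_size wraps
+       around to (unsigned int) -2; once the slot becomes a G1 of N waiters,
+       adding N yields the N - 2 waiters that still need signals.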
+
+   A PTHREAD_COND_INITIALIZER condvar has all fields set to zero, which yields
+   a condvar that has G2 starting at position 0 and a G1 that is closed.
+
+   Because waiters do not claim ownership of a group right when obtaining a
+   position in __wseq but only reference count the group when using futexes
+   to block, it can happen that a group gets closed before a waiter can
+   increment the reference count.  Therefore, waiters have to check whether
+   their group is already closed using __g1_start.  They also have to perform
+   this check while spinning to grab a signal from __g_signals.
+   Note that for these checks, using relaxed MO to load __g1_start is
+   sufficient because if a waiter can see a sufficiently large value, it could
+   have also consumed a signal in the waiter's group.
+
+   Waiters try to grab a signal from __g_signals without holding a reference
+   count, which can lead to stealing a signal from a more recent group after
+   their own group was already closed.  They cannot always detect whether they
+   in fact did so, because they do not know when they stole, but they can
+   conservatively add a signal back to the group they stole from; if they
+   did so unnecessarily, all that happens is a spurious wake-up.  To make this
+   even less likely, __g1_start contains the index of the current G2 too,
+   which allows waiters to check whether there is aliasing on the group slots;
+   if there was not, they did not steal from the current G1, which means that
+   the G1 they stole from must have been already closed and they do not need
+   to fix anything.
+
+   It is essential that the last field in pthread_cond_t is __g_signals[1]:
+   The previous condvar used a pointer-sized field in pthread_cond_t, so a
+   PTHREAD_COND_INITIALIZER from that condvar implementation might only
+   initialize 4 bytes to zero instead of the 8 bytes we need (i.e., 44 bytes
+   in total instead of the 48 we need).  __g_signals[1] is not accessed before
+   the first group switch (G2 starts at index 0), which will set its value to
+   zero after a harmless fetch-or whose return value is ignored.  This
+   effectively completes initialization.
+
+
+   Limitations:
+   * This condvar isn't designed to allow for more than
+     __PTHREAD_COND_MAX_GROUP_SIZE * (1 << 31) calls to __pthread_cond_wait.
+   * More than __PTHREAD_COND_MAX_GROUP_SIZE concurrent waiters are not
+     supported.
+   * Beyond what is allowed as errors by POSIX or documented, we can also
+     return the following errors:
+     * EPERM if MUTEX is a recursive mutex and the caller doesn't own it.
+     * EOWNERDEAD or ENOTRECOVERABLE when using robust mutexes.  Unlike
+       for other errors, this can happen when we re-acquire the mutex; this
+       isn't allowed by POSIX (which requires all errors to virtually happen
+       before we release the mutex or change the condvar state), but there's
+       nothing we can do really.
+     * When using PTHREAD_MUTEX_PP_* mutexes, we can also return all errors
+       returned by __pthread_tpp_change_priority.  We will already have
+       released the mutex in such cases, so the caller cannot expect to own
+       MUTEX.
+
+   Other notes:
+   * Instead of the normal mutex unlock / lock functions, we use
+     __pthread_mutex_unlock_usercnt(m, 0) / __pthread_mutex_cond_lock(m)
+     because those will not change the mutex-internal users count, so that it
+     can be detected when a condvar is still associated with a particular
+     mutex because there is a waiter blocked on this condvar using this mutex.
+*/
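+
+/* A condensed sketch of the waiter fast path implemented below, ignoring
+   spinning, timeouts, cancellation, the group reference counts in __g_refs,
+   and the signal-stealing repair (memory orders abbreviated):
+
+     wseq = fetch_add_acq (__wseq, 2);
+     g = wseq & 1;  seq = wseq >> 1;      // Our group slot and position.
+     fetch_add_rlx (__wrefs, 8);          // Register for destruction.
+     unlock (mutex);
+     while ((signals = load_acq (__g_signals[g])) == 0
+            && seq >= (load_rlx (__g1_start) >> 1))
+       futex_wait (__g_signals + g, 0);   // Block until signaled or closed.
+     if (signals != 0)                    // Otherwise, our group is closed.
+       CAS_acq (__g_signals + g, signals, signals - 2);
+     fetch_add_rel (__wrefs, -8);         // May wake pthread_cond_destroy.
+     lock (mutex);  */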
+static __always_inline int
+__pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
+    const struct timespec *abstime)
 {
-  struct _pthread_cleanup_buffer buffer;
-  struct _condvar_cleanup_buffer cbuffer;
+  const int maxspin = 0;
   int err;
-  int pshared = (cond->__data.__mutex == (void *) ~0l)
-		? LLL_SHARED : LLL_PRIVATE;
-
-#if (defined lll_futex_wait_requeue_pi \
-     && defined __ASSUME_REQUEUE_PI)
-  int pi_flag = 0;
-#endif
+  int result = 0;
 
   LIBC_PROBE (cond_wait, 2, cond, mutex);
 
-  /* Make sure we are alone.  */
-  lll_lock (cond->__data.__lock, pshared);
-
-  /* Now we can release the mutex.  */
+  /* Acquire a position (SEQ) in the waiter sequence (WSEQ).  We use an
+     atomic operation because signals and broadcasts may update the group
+     switch without acquiring the mutex.  We do not need release MO here
+     because we do not need to establish any happens-before relation with
+     signalers (see __pthread_cond_signal); modification order alone
+     establishes a total order of waiters/signals.  We do need acquire MO
+     to synchronize with group reinitialization in
+     __condvar_quiesce_and_switch_g1.  */
+  uint64_t wseq = __condvar_fetch_add_wseq_acquire (cond, 2);
+  /* Find our group's index.  We always go into what was G2 when we acquired
+     our position.  */
+  unsigned int g = wseq & 1;
+  uint64_t seq = wseq >> 1;
+
+  /* Increase the waiter reference count.  Relaxed MO is sufficient because
+     we only need to synchronize when decrementing the reference count.  */
+  unsigned int flags = atomic_fetch_add_relaxed (&cond->__data.__wrefs, 8);
+  int private = __condvar_get_private (flags);
+
+  /* Now that we are registered as a waiter, we can release the mutex.
+     Waiting on the condvar must be atomic with releasing the mutex, so if
+     the mutex is used to establish a happens-before relation with any
+     signaler, the waiter must be visible to the latter; thus, we release the
+     mutex after registering as waiter.
+     If releasing the mutex fails, we just cancel our registration as a
+     waiter and confirm that we have woken up.  */
   err = __pthread_mutex_unlock_usercnt (mutex, 0);
-  if (__glibc_unlikely (err))
+  if (__glibc_unlikely (err != 0))
     {
-      lll_unlock (cond->__data.__lock, pshared);
+      __condvar_cancel_waiting (cond, seq, g, private);
+      __condvar_confirm_wakeup (cond, private);
       return err;
     }
 
-  /* We have one new user of the condvar.  */
-  ++cond->__data.__total_seq;
-  ++cond->__data.__futex;
-  cond->__data.__nwaiters += 1 << COND_NWAITERS_SHIFT;
-
-  /* Remember the mutex we are using here.  If there is already a
-     different address store this is a bad user bug.  Do not store
-     anything for pshared condvars.  */
-  if (cond->__data.__mutex != (void *) ~0l)
-    cond->__data.__mutex = mutex;
-
-  /* Prepare structure passed to cancellation handler.  */
-  cbuffer.cond = cond;
-  cbuffer.mutex = mutex;
-
-  /* Before we block we enable cancellation.  Therefore we have to
-     install a cancellation handler.  */
-  __pthread_cleanup_push (&buffer, __condvar_cleanup, &cbuffer);
-
-  /* The current values of the wakeup counter.  The "woken" counter
-     must exceed this value.  */
-  unsigned long long int val;
-  unsigned long long int seq;
-  val = seq = cond->__data.__wakeup_seq;
-  /* Remember the broadcast counter.  */
-  cbuffer.bc_seq = cond->__data.__broadcast_seq;
+  /* Now wait until a signal is available in our group or it is closed.
+     Acquire MO so that if we observe a value of zero written after group
+     switching in __condvar_quiesce_and_switch_g1, we synchronize with that
+     store and will see the prior update of __g1_start done while switching
+     groups too.  */
+  unsigned int signals = atomic_load_acquire (cond->__data.__g_signals + g);
 
   do
     {
-      unsigned int futex_val = cond->__data.__futex;
-      /* Prepare to wait.  Release the condvar futex.  */
-      lll_unlock (cond->__data.__lock, pshared);
-
-      /* Enable asynchronous cancellation.  Required by the standard.  */
-      cbuffer.oldtype = __pthread_enable_asynccancel ();
-
-#if (defined lll_futex_wait_requeue_pi \
-     && defined __ASSUME_REQUEUE_PI)
-      /* If pi_flag remained 1 then it means that we had the lock and the mutex
-	 but a spurious waker raced ahead of us.  Give back the mutex before
-	 going into wait again.  */
-      if (pi_flag)
+      while (1)
 	{
-	  __pthread_mutex_cond_lock_adjust (mutex);
-	  __pthread_mutex_unlock_usercnt (mutex, 0);
+	  /* Spin-wait first.
+	     Note that spinning first without checking whether a timeout
+	     passed might lead to what looks like a spurious wake-up even
+	     though we should return ETIMEDOUT (e.g., if the caller provides
+	     an absolute timeout that is clearly in the past).  However,
+	     (1) spurious wake-ups are allowed, (2) it seems unlikely that a
+	     user will (ab)use pthread_cond_wait as a check for whether a
+	     point in time is in the past, and (3) spinning first without
+	     having to compare against the current time seems to be the right
+	     choice from a performance perspective for most use cases.  */
+	  unsigned int spin = maxspin;
+	  while (signals == 0 && spin > 0)
+	    {
+	      /* Check that we are not spinning on a group that's already
+		 closed.  */
+	      if (seq < (__condvar_load_g1_start_relaxed (cond) >> 1))
+		goto done;
+
+	      /* TODO Back off.  */
+
+	      /* Reload signals.  See above for MO.  */
+	      signals = atomic_load_acquire (cond->__data.__g_signals + g);
+	      spin--;
+	    }
+
+	  /* If our group will be closed as indicated by the flag on signals,
+	     don't bother grabbing a signal.  */
+	  if (signals & 1)
+	    goto done;
+
+	  /* If there is an available signal, don't block.  */
+	  if (signals != 0)
+	    break;
+
+	  /* No signals available after spinning, so prepare to block.
+	     We first acquire a group reference and use acquire MO for that so
+	     that we synchronize with the dummy read-modify-write in
+	     __condvar_quiesce_and_switch_g1 if we read from that.  In turn,
+	     in this case this will make us see the closed flag on __g_signals
+	     that designates a concurrent attempt to reuse the group's slot.
+	     We use acquire MO for the __g_signals check to make the
+	     __g1_start check work (see spinning above).
+	     Note that the group reference acquisition will not mask the
+	     release MO when decrementing the reference count because we use
+	     an atomic read-modify-write operation and thus extend the release
+	     sequence.  */
+	  atomic_fetch_add_acquire (cond->__data.__g_refs + g, 2);
+	  if (((atomic_load_acquire (cond->__data.__g_signals + g) & 1) != 0)
+	      || (seq < (__condvar_load_g1_start_relaxed (cond) >> 1)))
+	    {
+	      /* Our group is closed.  Wake up any signalers that might be
+		 waiting.  */
+	      __condvar_dec_grefs (cond, g, private);
+	      goto done;
+	    }
+
+	  /* Now block.  */
+	  struct _pthread_cleanup_buffer buffer;
+	  struct _condvar_cleanup_buffer cbuffer;
+	  cbuffer.wseq = wseq;
+	  cbuffer.cond = cond;
+	  cbuffer.mutex = mutex;
+	  cbuffer.private = private;
+	  __pthread_cleanup_push (&buffer, __condvar_cleanup_waiting, &cbuffer);
+
+	  if (abstime == NULL)
+	    {
+	      /* Block without a timeout.  */
+	      err = futex_wait_cancelable (
+		  cond->__data.__g_signals + g, 0, private);
+	    }
+	  else
+	    {
+	      /* Block, but with a timeout.
+		 Work around the fact that the kernel rejects negative timeout
+		 values despite them being valid.  */
+	      if (__glibc_unlikely (abstime->tv_sec < 0))
+	        err = ETIMEDOUT;
+
+	      else if ((flags & __PTHREAD_COND_CLOCK_MONOTONIC_MASK) != 0)
+		{
+		  /* CLOCK_MONOTONIC is requested.  */
+		  struct timespec rt;
+		  if (__clock_gettime (CLOCK_MONOTONIC, &rt) != 0)
+		    __libc_fatal ("clock_gettime does not support "
+				  "CLOCK_MONOTONIC");
+		  /* Convert the absolute timeout value to a relative
+		     timeout.  */
+		  rt.tv_sec = abstime->tv_sec - rt.tv_sec;
+		  rt.tv_nsec = abstime->tv_nsec - rt.tv_nsec;
+		  if (rt.tv_nsec < 0)
+		    {
+		      rt.tv_nsec += 1000000000;
+		      --rt.tv_sec;
+		    }
+		  /* Did we already time out?  */
+		  if (__glibc_unlikely (rt.tv_sec < 0))
+		    err = ETIMEDOUT;
+		  else
+		    err = futex_reltimed_wait_cancelable
+			(cond->__data.__g_signals + g, 0, &rt, private);
+		}
+	      else
+		{
+		  /* Use CLOCK_REALTIME.  */
+		  err = futex_abstimed_wait_cancelable
+		      (cond->__data.__g_signals + g, 0, abstime, private);
+		}
+	    }
+
+	  __pthread_cleanup_pop (&buffer, 0);
+
+	  if (__glibc_unlikely (err == ETIMEDOUT))
+	    {
+	      __condvar_dec_grefs (cond, g, private);
+	      /* If we timed out, we effectively cancel waiting.  Note that
+		 we have decremented __g_refs before cancellation, so that a
+		 deadlock between waiting for quiescence of our group in
+		 __condvar_quiesce_and_switch_g1 and us trying to acquire
+		 the lock during cancellation is not possible.  */
+	      __condvar_cancel_waiting (cond, seq, g, private);
+	      result = ETIMEDOUT;
+	      goto done;
+	    }
+	  else
+	    __condvar_dec_grefs (cond, g, private);
+
+	  /* Reload signals.  See above for MO.  */
+	  signals = atomic_load_acquire (cond->__data.__g_signals + g);
 	}
-      pi_flag = USE_REQUEUE_PI (mutex);
 
-      if (pi_flag)
+    }
+  /* Try to grab a signal.  Use acquire MO so that we see an up-to-date value
+     of __g1_start below (see spinning above for a similar case).  In
+     particular, if we steal from a more recent group, we will also see a
+     more recent __g1_start below.  */
+  while (!atomic_compare_exchange_weak_acquire (cond->__data.__g_signals + g,
+						&signals, signals - 2));
+
+  /* We consumed a signal but we could have consumed from a more recent group
+     that aliased with ours due to being in the same group slot.  If this
+     might be the case our group must be closed as visible through
+     __g1_start.  */
+  uint64_t g1_start = __condvar_load_g1_start_relaxed (cond);
+  if (seq < (g1_start >> 1))
+    {
+      /* We potentially stole a signal from a more recent group but we do not
+	 know which group we really consumed from.
+	 We do not care about groups older than current G1 because they are
+	 closed; we could have stolen from these, but then we just add a
+	 spurious wake-up for the current groups.
+	 We will never steal a signal from current G2 that was really intended
+	 for G2 because G2 never receives signals (until it becomes G1).  We
+	 could have stolen a signal from G2 that was conservatively added by a
+	 previous waiter that also thought it stole a signal -- but given that
+	 that signal was added unnecessarily, it's not a problem if we steal
+	 it.
+	 Thus, the remaining case is that we could have stolen from the current
+	 G1, where "current" means the __g1_start value we observed.  However,
+	 if the current G1 does not have the same slot index as we do, we did
+	 not steal from it and do not need to undo that.  This is the reason
+	 for putting a bit with G2's index into __g1_start as well.  */
+      if (((g1_start & 1) ^ 1) == g)
 	{
-	  err = lll_futex_wait_requeue_pi (&cond->__data.__futex,
-					   futex_val, &mutex->__data.__lock,
-					   pshared);
-
-	  pi_flag = (err == 0);
+	  /* We have to conservatively undo our potential mistake of stealing
+	     a signal.  We can stop trying to do that when the current G1
+	     changes because other spinning waiters will notice this too and
+	     __condvar_quiesce_and_switch_g1 has checked that there are no
+	     futex waiters anymore before switching G1.
+	     Relaxed MO is fine for the __g1_start load because we need to
+	     merely be able to observe this fact and not have to observe
+	     something else as well.
+	     ??? Would it help to spin for a little while to see whether the
+	     current G1 gets closed?  This might be worthwhile if the group is
+	     small or close to being closed.  */
+	  unsigned int s = atomic_load_relaxed (cond->__data.__g_signals + g);
+	  while (__condvar_load_g1_start_relaxed (cond) == g1_start)
+	    {
+	      /* Try to add a signal.  We don't need to acquire the lock
+		 because at worst we can cause a spurious wake-up.  If the
+		 group is in the process of being closed (LSB is true), this
+		 has an effect similar to us adding a signal.  */
+	      if (((s & 1) != 0)
+		  || atomic_compare_exchange_weak_relaxed
+		       (cond->__data.__g_signals + g, &s, s + 2))
+		{
+		  /* If we added a signal, we also need to add a wake-up on
+		     the futex.  We also need to do that if we skipped adding
+		     a signal because the group is being closed: while
+		     __condvar_quiesce_and_switch_g1 could have closed the
+		     group, it might still be waiting for futex waiters to
+		     leave (and one of those waiters might be the one we stole
+		     the signal from, which would cause it to block using the
+		     futex).  */
+		  futex_wake (cond->__data.__g_signals + g, 1, private);
+		  break;
+		}
+	      /* TODO Back off.  */
+	    }
 	}
-      else
-#endif
-	  /* Wait until woken by signal or broadcast.  */
-	lll_futex_wait (&cond->__data.__futex, futex_val, pshared);
-
-      /* Disable asynchronous cancellation.  */
-      __pthread_disable_asynccancel (cbuffer.oldtype);
-
-      /* We are going to look at shared data again, so get the lock.  */
-      lll_lock (cond->__data.__lock, pshared);
-
-      /* If a broadcast happened, we are done.  */
-      if (cbuffer.bc_seq != cond->__data.__broadcast_seq)
-	goto bc_out;
-
-      /* Check whether we are eligible for wakeup.  */
-      val = cond->__data.__wakeup_seq;
     }
-  while (val == seq || cond->__data.__woken_seq == val);
 
-  /* Another thread woken up.  */
-  ++cond->__data.__woken_seq;
+ done:
 
- bc_out:
+  /* Confirm that we have been woken.  We do that before acquiring the mutex
+     to allow for execution of pthread_cond_destroy while having acquired the
+     mutex.  */
+  __condvar_confirm_wakeup (cond, private);
 
-  cond->__data.__nwaiters -= 1 << COND_NWAITERS_SHIFT;
-
-  /* If pthread_cond_destroy was called on this varaible already,
-     notify the pthread_cond_destroy caller all waiters have left
-     and it can be successfully destroyed.  */
-  if (cond->__data.__total_seq == -1ULL
-      && cond->__data.__nwaiters < (1 << COND_NWAITERS_SHIFT))
-    lll_futex_wake (&cond->__data.__nwaiters, 1, pshared);
+  /* Woken up; now re-acquire the mutex.  If this doesn't fail, return RESULT,
+     which is set to ETIMEDOUT if a timeout occurred, or zero otherwise.  */
+  err = __pthread_mutex_cond_lock (mutex);
+  /* XXX Abort on errors that are disallowed by POSIX?  */
+  return (err != 0) ? err : result;
+}
 
-  /* We are done with the condvar.  */
-  lll_unlock (cond->__data.__lock, pshared);
 
-  /* The cancellation handling is back to normal, remove the handler.  */
-  __pthread_cleanup_pop (&buffer, 0);
+/* See __pthread_cond_wait_common.  */
+int
+__pthread_cond_wait (pthread_cond_t *cond, pthread_mutex_t *mutex)
+{
+  return __pthread_cond_wait_common (cond, mutex, NULL);
+}
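+
+/* Spurious wake-ups are permitted (see the overview comment above), so
+   callers must always re-check their predicate after waking up.  A minimal
+   usage sketch, with a hypothetical predicate READY protected by mutex M:
+
+     pthread_mutex_lock (&m);
+     while (!ready)
+       pthread_cond_wait (&cond, &m);
+     ... consume the state change ...
+     pthread_mutex_unlock (&m);  */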
 
-  /* Get the mutex before returning.  Not needed for PI.  */
-#if (defined lll_futex_wait_requeue_pi \
-     && defined __ASSUME_REQUEUE_PI)
-  if (pi_flag)
-    {
-      __pthread_mutex_cond_lock_adjust (mutex);
-      return 0;
-    }
-  else
-#endif
-    return __pthread_mutex_cond_lock (mutex);
+/* See __pthread_cond_wait_common.  */
+int
+__pthread_cond_timedwait (pthread_cond_t *cond, pthread_mutex_t *mutex,
+    const struct timespec *abstime)
+{
+  /* Check parameter validity.  This should also tell the compiler that
+     it can assume that abstime is not NULL.  */
+  if (abstime->tv_nsec < 0 || abstime->tv_nsec >= 1000000000)
+    return EINVAL;
+  return __pthread_cond_wait_common (cond, mutex, abstime);
 }
 
 versioned_symbol (libpthread, __pthread_cond_wait, pthread_cond_wait,
 		  GLIBC_2_3_2);
+versioned_symbol (libpthread, __pthread_cond_timedwait, pthread_cond_timedwait,
+		  GLIBC_2_3_2);
diff --git a/nptl/pthread_condattr_getclock.c b/nptl/pthread_condattr_getclock.c
index d156302ffb..cecb4aa8a5 100644
--- a/nptl/pthread_condattr_getclock.c
+++ b/nptl/pthread_condattr_getclock.c
@@ -23,6 +23,6 @@ int
 pthread_condattr_getclock (const pthread_condattr_t *attr, clockid_t *clock_id)
 {
   *clock_id = (((((const struct pthread_condattr *) attr)->value) >> 1)
-	       & ((1 << COND_NWAITERS_SHIFT) - 1));
+	       & ((1 << COND_CLOCK_BITS) - 1));
   return 0;
 }
diff --git a/nptl/pthread_condattr_getpshared.c b/nptl/pthread_condattr_getpshared.c
index 5a10f3eeb0..814796690c 100644
--- a/nptl/pthread_condattr_getpshared.c
+++ b/nptl/pthread_condattr_getpshared.c
@@ -22,7 +22,8 @@
 int
 pthread_condattr_getpshared (const pthread_condattr_t *attr, int *pshared)
 {
-  *pshared = ((const struct pthread_condattr *) attr)->value & 1;
+  *pshared = (((const struct pthread_condattr *) attr)->value & 1
+	      ? PTHREAD_PROCESS_SHARED : PTHREAD_PROCESS_PRIVATE);
 
   return 0;
 }
diff --git a/nptl/pthread_condattr_init.c b/nptl/pthread_condattr_init.c
index 0ce42e5dfa..6e5168dadd 100644
--- a/nptl/pthread_condattr_init.c
+++ b/nptl/pthread_condattr_init.c
@@ -23,7 +23,9 @@
 int
 __pthread_condattr_init (pthread_condattr_t *attr)
 {
-  memset (attr, '\0', sizeof (*attr));
+  struct pthread_condattr *iattr = (struct pthread_condattr *) attr;
+  /* Default is not pshared and CLOCK_REALTIME.  */
+  iattr->value = CLOCK_REALTIME << 1;
 
   return 0;
 }
diff --git a/nptl/pthread_condattr_setclock.c b/nptl/pthread_condattr_setclock.c
index 25e2a176a0..3cfad84cda 100644
--- a/nptl/pthread_condattr_setclock.c
+++ b/nptl/pthread_condattr_setclock.c
@@ -18,7 +18,7 @@
 
 #include <assert.h>
 #include <errno.h>
-#include <stdbool.h>
+#include <futex-internal.h>
 #include <time.h>
 #include <sysdep.h>
 #include "pthreadP.h"
@@ -33,12 +33,17 @@ pthread_condattr_setclock (pthread_condattr_t *attr, clockid_t clock_id)
        in the pthread_cond_t structure needs to be adjusted.  */
     return EINVAL;
 
+  /* If we do not support waiting using CLOCK_MONOTONIC, return an error.  */
+  if (clock_id == CLOCK_MONOTONIC
+      && !futex_supports_exact_relative_timeouts ())
+    return ENOTSUP;
+
   /* Make sure the value fits in the bits we reserved.  */
-  assert (clock_id < (1 << COND_NWAITERS_SHIFT));
+  assert (clock_id < (1 << COND_CLOCK_BITS));
 
   int *valuep = &((struct pthread_condattr *) attr)->value;
 
-  *valuep = ((*valuep & ~(((1 << COND_NWAITERS_SHIFT) - 1) << 1))
+  *valuep = ((*valuep & ~(((1 << COND_CLOCK_BITS) - 1) << 1))
 	     | (clock_id << 1));
 
   return 0;
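
A condvar that should measure its timeout against CLOCK_MONOTONIC must have
the clock set on the attribute before pthread_cond_init; pthread_cond_timedwait
then interprets its absolute timeout against that clock (see the
__PTHREAD_COND_CLOCK_MONOTONIC_MASK check in pthread_cond_wait.c above).  A
minimal usage sketch, with mutex setup and error handling elided:

    pthread_condattr_t attr;
    pthread_cond_t cond;
    struct timespec ts;

    pthread_condattr_init (&attr);
    pthread_condattr_setclock (&attr, CLOCK_MONOTONIC);
    pthread_cond_init (&cond, &attr);

    clock_gettime (CLOCK_MONOTONIC, &ts);
    ts.tv_sec += 5;		/* Time out five seconds from now.  */
    pthread_cond_timedwait (&cond, &mutex, &ts);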
diff --git a/nptl/test-cond-printers.py b/nptl/test-cond-printers.py
index af0e12eb97..9e807c9f2c 100644
--- a/nptl/test-cond-printers.py
+++ b/nptl/test-cond-printers.py
@@ -35,7 +35,7 @@ try:
 
     break_at(test_source, 'Test status (destroyed)')
     continue_cmd() # Go to test_status_destroyed
-    test_printer(var, to_string, {'Status': 'Destroyed'})
+    test_printer(var, to_string, {'Threads known to still execute a wait function': '0'})
 
     continue_cmd() # Exit
 
diff --git a/nptl/tst-cond1.c b/nptl/tst-cond1.c
index 75ab9c8d8a..509bbd0be4 100644
--- a/nptl/tst-cond1.c
+++ b/nptl/tst-cond1.c
@@ -73,6 +73,9 @@ do_test (void)
 
   puts ("parent: wait for condition");
 
+  /* This test will fail on spurious wake-ups, which are allowed; however,
+     the current implementation shouldn't produce spurious wake-ups in the
+     scenario we are testing here.  */
   err = pthread_cond_wait (&cond, &mut);
   if (err != 0)
     error (EXIT_FAILURE, err, "parent: cannot wait for signal");
diff --git a/nptl/tst-cond20.c b/nptl/tst-cond20.c
index 918c4adb51..665a66a92e 100644
--- a/nptl/tst-cond20.c
+++ b/nptl/tst-cond20.c
@@ -96,7 +96,10 @@ do_test (void)
 
   for (i = 0; i < ROUNDS; ++i)
     {
-      pthread_cond_wait (&cond2, &mut);
+      /* Make sure we discard spurious wake-ups.  */
+      do
+	pthread_cond_wait (&cond2, &mut);
+      while (count != N);
 
       if (i & 1)
         pthread_mutex_unlock (&mut);
diff --git a/nptl/tst-cond22.c b/nptl/tst-cond22.c
index bd978e50ca..64f19ea0a5 100644
--- a/nptl/tst-cond22.c
+++ b/nptl/tst-cond22.c
@@ -106,10 +106,11 @@ do_test (void)
       status = 1;
     }
 
-  printf ("cond = { %d, %x, %lld, %lld, %lld, %p, %u, %u }\n",
-	  c.__data.__lock, c.__data.__futex, c.__data.__total_seq,
-	  c.__data.__wakeup_seq, c.__data.__woken_seq, c.__data.__mutex,
-	  c.__data.__nwaiters, c.__data.__broadcast_seq);
+  printf ("cond = { %llu, %llu, %u/%u/%u, %u/%u/%u, %u, %u }\n",
+	  c.__data.__wseq, c.__data.__g1_start,
+	  c.__data.__g_signals[0], c.__data.__g_refs[0], c.__data.__g_size[0],
+	  c.__data.__g_signals[1], c.__data.__g_refs[1], c.__data.__g_size[1],
+	  c.__data.__g1_orig_size, c.__data.__wrefs);
 
   if (pthread_create (&th, NULL, tf, (void *) 1l) != 0)
     {
@@ -148,10 +149,11 @@ do_test (void)
       status = 1;
     }
 
-  printf ("cond = { %d, %x, %lld, %lld, %lld, %p, %u, %u }\n",
-	  c.__data.__lock, c.__data.__futex, c.__data.__total_seq,
-	  c.__data.__wakeup_seq, c.__data.__woken_seq, c.__data.__mutex,
-	  c.__data.__nwaiters, c.__data.__broadcast_seq);
+  printf ("cond = { %llu, %llu, %u/%u/%u, %u/%u/%u, %u, %u }\n",
+	  c.__data.__wseq, c.__data.__g1_start,
+	  c.__data.__g_signals[0], c.__data.__g_refs[0], c.__data.__g_size[0],
+	  c.__data.__g_signals[1], c.__data.__g_refs[1], c.__data.__g_size[1],
+	  c.__data.__g1_orig_size, c.__data.__wrefs);
 
   return status;
 }