4 files changed, 100 insertions, 34 deletions
diff --git a/nptl/pthread_spin_init.c b/nptl/pthread_spin_init.c
index 01dec5eea4..fe3091377e 100644
--- a/nptl/pthread_spin_init.c
+++ b/nptl/pthread_spin_init.c
@@ -22,6 +22,7 @@
 int
 pthread_spin_init (pthread_spinlock_t *lock, int pshared)
 {
-  *lock = 0;
+  /* Relaxed MO is fine because this is an initializing store.  */
+  atomic_store_relaxed (lock, 0);
   return 0;
 }
diff --git a/nptl/pthread_spin_lock.c b/nptl/pthread_spin_lock.c
index 4d03b7893a..682af80240 100644
--- a/nptl/pthread_spin_lock.c
+++ b/nptl/pthread_spin_lock.c
@@ -19,27 +19,35 @@
 #include <atomic.h>
 #include "pthreadP.h"
 
-/* A machine-specific version can define SPIN_LOCK_READS_BETWEEN_CMPXCHG
-  to the number of plain reads that it's optimal to spin on between uses
-  of atomic_compare_and_exchange_val_acq.  If spinning forever is optimal
-  then use -1.  If no plain reads here would ever be optimal, use 0.  */
-#ifndef SPIN_LOCK_READS_BETWEEN_CMPXCHG
-# warning machine-dependent file should define SPIN_LOCK_READS_BETWEEN_CMPXCHG
-# define SPIN_LOCK_READS_BETWEEN_CMPXCHG 1000
-#endif
-
 int
 pthread_spin_lock (pthread_spinlock_t *lock)
 {
-  /* atomic_exchange usually takes less instructions than
-     atomic_compare_and_exchange.  On the other hand,
-     atomic_compare_and_exchange potentially generates less bus traffic
-     when the lock is locked.
-     We assume that the first try mostly will be successful, and we use
-     atomic_exchange.  For the subsequent tries we use
-     atomic_compare_and_exchange.  */
-  if (atomic_exchange_acq (lock, 1) == 0)
+  int val = 0;
+
+  /* We assume that the first try mostly will be successful, thus we use
+     atomic_exchange if it is not implemented by a CAS loop (we also assume
+     that atomic_exchange can be faster if it succeeds, see
+     ATOMIC_EXCHANGE_USES_CAS).  Otherwise, we use a weak CAS and not an
+     exchange so we bail out after the first failed attempt to change the
+     state.  For the subsequent attempts we use atomic_compare_and_exchange
+     after we observe that the lock is not acquired.
+     See also comment in pthread_spin_trylock.
+     We use acquire MO to synchronize-with the release MO store in
+     pthread_spin_unlock, and thus ensure that prior critical sections
+     happen-before this critical section.  */
+#if ! ATOMIC_EXCHANGE_USES_CAS
+  /* Try to acquire the lock with an exchange instruction as this architecture
+     has such an instruction and we assume it is faster than a CAS.
+     The acquisition succeeds if the lock is not in an acquired state.  */
+  if (__glibc_likely (atomic_exchange_acquire (lock, 1) == 0))
     return 0;
+#else
+  /* Try to acquire the lock with a CAS instruction as this architecture
+     has no exchange instruction.  The acquisition succeeds if the lock is not
+     acquired.  */
+  if (__glibc_likely (atomic_compare_exchange_weak_acquire (lock, &val, 1)))
+    return 0;
+#endif
 
   do
     {
@@ -47,23 +55,26 @@ pthread_spin_lock (pthread_spinlock_t *lock)
 	 to cmpxchg is not a good idea on many targets as that will force
 	 expensive memory synchronizations among processors and penalize other
 	 running threads.
-	 On the other hand, we do want to update memory state on the local core
-	 once in a while to avoid spinning indefinitely until some event that
-	 will happen to update local memory as a side-effect.  */
-      if (SPIN_LOCK_READS_BETWEEN_CMPXCHG >= 0)
+	 There is no technical reason for throwing in a CAS every now and then,
+	 and so far we have no evidence that it can improve performance.
+	 If that would be the case, we have to adjust other spin-waiting loops
+	 elsewhere, too!
+	 Thus we use relaxed MO reads until we observe the lock to not be
+	 acquired anymore.  */
+      do
 	{
-	  int wait = SPIN_LOCK_READS_BETWEEN_CMPXCHG;
+	  /* TODO Back-off.  */
 
-	  while (*lock != 0 && wait > 0)
-	    --wait;
-	}
-      else
-	{
-	  while (*lock != 0)
-	    ;
+	  atomic_spin_nop ();
+
+	  val = atomic_load_relaxed (lock);
 	}
+      while (val != 0);
+
+      /* We need acquire memory order here for the same reason as mentioned
+	 for the first try to lock the spinlock.  */
     }
-  while (atomic_compare_and_exchange_val_acq (lock, 1, 0) != 0);
+  while (!atomic_compare_exchange_weak_acquire (lock, &val, 1));
 
   return 0;
 }
diff --git a/nptl/pthread_spin_trylock.c b/nptl/pthread_spin_trylock.c
index 593bba3ed8..83921b06b8 100644
--- a/nptl/pthread_spin_trylock.c
+++ b/nptl/pthread_spin_trylock.c
@@ -23,5 +23,57 @@
 int
 pthread_spin_trylock (pthread_spinlock_t *lock)
 {
-  return atomic_exchange_acq (lock, 1) ? EBUSY : 0;
+  /* For the spin try lock, we have the following possibilities:
+
+     1) If we assume that trylock will most likely succeed in practice:
+     * We just do an exchange.
+
+     2) If we want to bias towards cases where trylock succeeds, but don't
+     rule out contention:
+     * If exchange is not implemented by a CAS loop, and exchange is faster
+     than CAS, do an exchange.
+     * If exchange is implemented by a CAS loop, use a weak CAS and not an
+     exchange so we bail out after the first failed attempt to change the state.
+
+     3) If we expect contention to be likely:
+     * If CAS always brings the cache line into an exclusive state even if the
+     spinlock is already acquired, then load the value first with
+     atomic_load_relaxed and test if lock is not acquired. Then do 2).
+
+     We assume that 2) is the common case, and that this won't be slower than
+     1) in the common case.
+
+     We use acquire MO to synchronize-with the release MO store in
+     pthread_spin_unlock, and thus ensure that prior critical sections
+     happen-before this critical section.  */
+#if ! ATOMIC_EXCHANGE_USES_CAS
+  /* Try to acquire the lock with an exchange instruction as this architecture
+     has such an instruction and we assume it is faster than a CAS.
+     The acquisition succeeds if the lock is not in an acquired state.  */
+  if (atomic_exchange_acquire (lock, 1) == 0)
+    return 0;
+#else
+  /* Try to acquire the lock with a CAS instruction as this architecture
+     has no exchange instruction.  The acquisition succeeds if the lock is not
+     acquired.  */
+  do
+    {
+      int val = 0;
+      if (atomic_compare_exchange_weak_acquire (lock, &val, 1))
+	return 0;
+    }
+  /* atomic_compare_exchange_weak_acquire can fail spuriously.  Whereas
+     C++11 and C11 make it clear that trylock operations can fail spuriously,
+     POSIX does not explicitly specify this; it only specifies that failing
+     synchronization operations do not need to have synchronization effects
+     themselves, but a spurious failure is something that could contradict a
+     happens-before established earlier (e.g., that we need to observe that
+     the lock is acquired).  Therefore, we emulate a strong CAS by simply
+     checking with a relaxed MO load that the lock is really acquired before
+     returning EBUSY; the additional overhead this may cause is on the slow
+     path.  */
+  while (atomic_load_relaxed (lock) == 0);
+#endif
+
+  return EBUSY;
 }
diff --git a/nptl/pthread_spin_unlock.c b/nptl/pthread_spin_unlock.c
index 5fd73e578b..f83b69639a 100644
--- a/nptl/pthread_spin_unlock.c
+++ b/nptl/pthread_spin_unlock.c
@@ -23,7 +23,9 @@
 int
 pthread_spin_unlock (pthread_spinlock_t *lock)
 {
-  atomic_full_barrier ();
-  *lock = 0;
+  /* The atomic_store_release synchronizes-with the atomic_exchange_acquire
+     or atomic_compare_exchange_weak_acquire in pthread_spin_lock /
+     pthread_spin_trylock.  */
+  atomic_store_release (lock, 0);
   return 0;
 }