about summary refs log tree commit diff
path: root/stdlib/cxa_thread_atexit_impl.c
diff options
context:
space:
mode:
authorSiddhesh Poyarekar <siddhesh@redhat.com>2015-07-23 11:16:18 +0530
committerSiddhesh Poyarekar <siddhesh@redhat.com>2015-07-23 11:16:18 +0530
commit90b37cac8b5a3e1548c29d91e3e0bff1014d2e5c (patch)
treee0f562cbf253f75ae738e9c0ec088a6096acc60f /stdlib/cxa_thread_atexit_impl.c
parent9c9184b4491461e39008e7d18d5c472570cd0755 (diff)
downloadglibc-90b37cac8b5a3e1548c29d91e3e0bff1014d2e5c.tar.gz
glibc-90b37cac8b5a3e1548c29d91e3e0bff1014d2e5c.tar.xz
glibc-90b37cac8b5a3e1548c29d91e3e0bff1014d2e5c.zip
Also use l_tls_dtor_count to decide on object unload (BZ #18657)
When an TLS destructor is registered, we set the DF_1_NODELETE flag to
signal that the object should not be destroyed.  We then clear the
DF_1_NODELETE flag when all destructors are called, which is wrong -
the flag could have been set by other means too.

This patch replaces this use of the flag by using l_tls_dtor_count
directly to determine whether it is safe to unload the object.  This
change has the added advantage of eliminating the lock taking when
calling the destructors, which could result in a deadlock.  The patch
also fixes the test case tst-tls-atexit - it was making an invalid
dlclose call, which would just return an error silently.

I have also added a detailed note on concurrency which also aims to
justify why I chose the semantics I chose for accesses to
l_tls_dtor_count.  Thanks to Torvald for his help in getting me
started on this and (literally) teaching my how to approach the
problem.

Change verified on x86_64; the test suite does not show any
regressions due to the patch.

ChangeLog:

	[BZ #18657]
	* elf/dl-close.c (_dl_close_worker): Don't unload DSO if there
	are pending TLS destructor calls.
	* include/link.h (struct link_map): Add concurrency note for
	L_TLS_DTOR_COUNT.
	* stdlib/cxa_thread_atexit_impl.c (__cxa_thread_atexit_impl):
	Don't touch the link map flag.  Atomically increment
	l_tls_dtor_count.
	(__call_tls_dtors): Atomically decrement l_tls_dtor_count.
	Avoid taking the load lock and don't touch the link map flag.
	* stdlib/tst-tls-atexit-nodelete.c: New test case.
	* stdlib/Makefile (tests): Use it.
	* stdlib/tst-tls-atexit.c (do_test): dlopen
	tst-tls-atexit-lib.so again before dlclose.  Add conditionals
	to allow tst-tls-atexit-nodelete test case to use it.
Diffstat (limited to 'stdlib/cxa_thread_atexit_impl.c')
-rw-r--r--stdlib/cxa_thread_atexit_impl.c90
1 files changed, 72 insertions, 18 deletions
diff --git a/stdlib/cxa_thread_atexit_impl.c b/stdlib/cxa_thread_atexit_impl.c
index 70c97dabc2..8e26380799 100644
--- a/stdlib/cxa_thread_atexit_impl.c
+++ b/stdlib/cxa_thread_atexit_impl.c
@@ -16,6 +16,61 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
+/* CONCURRENCY NOTES:
+
+   This documents concurrency for the non-POD TLS destructor registration,
+   calling and destruction.  The functions __cxa_thread_atexit_impl,
+   _dl_close_worker and __call_tls_dtors are the three main routines that may
+   run concurrently and access shared data.  The shared data in all possible
+   combinations of all three functions are the link map list, a link map for a
+   DSO and the link map member l_tls_dtor_count.
+
+   __cxa_thread_atexit_impl acquires the load_lock before accessing any shared
+   state and hence multiple of its instances can safely execute concurrently.
+
+   _dl_close_worker acquires the load_lock before accessing any shared state as
+   well and hence can concurrently execute multiple of its own instances as
+   well as those of __cxa_thread_atexit_impl safely.  Not all accesses to
+   l_tls_dtor_count are protected by the load lock, so we need to synchronize
+   using atomics.
+
+   __call_tls_dtors accesses the l_tls_dtor_count without taking the lock; it
+   decrements the value by one.  It does not need the big lock because it does
+   not access any other shared state except for the current DSO link map and
+   its member l_tls_dtor_count.
+
+   Correspondingly, _dl_close_worker loads l_tls_dtor_count and if it is zero,
+   unloads the DSO, thus deallocating the current link map.  This is the goal
+   of maintaining l_tls_dtor_count - to unload the DSO and free resources if
+   there are no pending destructors to be called.
+
+   We want to eliminate the inconsistent state where the DSO is unloaded in
+   _dl_close_worker before it is used in __call_tls_dtors.  This could happen
+   if __call_tls_dtors uses the link map after it sets l_tls_dtor_count to 0,
+   since _dl_close_worker will conclude from the 0 l_tls_dtor_count value that
+   it is safe to unload the DSO.  Hence, to ensure that this does not happen,
+   the following conditions must be met:
+
+   1. In _dl_close_worker, the l_tls_dtor_count load happens before the DSO
+      is unload and its link map is freed
+   2. The link map dereference in __call_tls_dtors happens before the
+      l_tls_dtor_count dereference.
+
+   To ensure this, the l_tls_dtor_count decrement in __call_tls_dtors should
+   have release semantics and the load in _dl_close_worker should have acquire
+   semantics.
+
+   Concurrent executions of __call_tls_dtors should only ensure that the value
+   is accessed atomically; no reordering constraints need to be considered.
+   Likewise for the increment of l_tls_dtor_count in __cxa_thread_atexit_impl.
+
+   There is still a possibility on concurrent execution of _dl_close_worker and
+   __call_tls_dtors where _dl_close_worker reads the value of l_tls_dtor_count
+   as 1, __call_tls_dtors decrements the value of l_tls_dtor_count but
+   _dl_close_worker does not unload the DSO, having read the old value.  This
+   is not very different from a case where __call_tls_dtors is called after
+   _dl_close_worker on the DSO and hence is an accepted execution.  */
+
 #include <stdlib.h>
 #include <ldsodefs.h>
 
@@ -49,9 +104,11 @@ __cxa_thread_atexit_impl (dtor_func func, void *obj, void *dso_symbol)
   new->next = tls_dtor_list;
   tls_dtor_list = new;
 
-  /* See if we already encountered the DSO.  */
+  /* We have to acquire the big lock to prevent a racing dlclose from pulling
+     our DSO from underneath us while we're setting up our destructor.  */
   __rtld_lock_lock_recursive (GL(dl_load_lock));
 
+  /* See if we already encountered the DSO.  */
   if (__glibc_unlikely (dso_symbol_cache != dso_symbol))
     {
       ElfW(Addr) caller = (ElfW(Addr)) dso_symbol;
@@ -62,16 +119,17 @@ __cxa_thread_atexit_impl (dtor_func func, void *obj, void *dso_symbol)
 	 program (we hope).  */
       lm_cache = l ? l : GL(dl_ns)[LM_ID_BASE]._ns_loaded;
     }
-  /* A destructor could result in a thread_local construction and the former
-     could have cleared the flag.  */
-  if (lm_cache->l_type == lt_loaded && lm_cache->l_tls_dtor_count == 0)
-    lm_cache->l_flags_1 |= DF_1_NODELETE;
-
-  new->map = lm_cache;
-  new->map->l_tls_dtor_count++;
 
+  /* This increment may only be concurrently observed either by the decrement
+     in __call_tls_dtors since the other l_tls_dtor_count access in
+     _dl_close_worker is protected by the load lock.  The execution in
+     __call_tls_dtors does not really depend on this value beyond the fact that
+     it should be atomic, so Relaxed MO should be sufficient.  */
+  atomic_fetch_add_relaxed (&lm_cache->l_tls_dtor_count, 1);
   __rtld_lock_unlock_recursive (GL(dl_load_lock));
 
+  new->map = lm_cache;
+
   return 0;
 }
 
@@ -83,19 +141,15 @@ __call_tls_dtors (void)
   while (tls_dtor_list)
     {
       struct dtor_list *cur = tls_dtor_list;
-      tls_dtor_list = tls_dtor_list->next;
 
+      tls_dtor_list = tls_dtor_list->next;
       cur->func (cur->obj);
 
-      __rtld_lock_lock_recursive (GL(dl_load_lock));
-
-      /* Allow DSO unload if count drops to zero.  */
-      cur->map->l_tls_dtor_count--;
-      if (cur->map->l_tls_dtor_count == 0 && cur->map->l_type == lt_loaded)
-        cur->map->l_flags_1 &= ~DF_1_NODELETE;
-
-      __rtld_lock_unlock_recursive (GL(dl_load_lock));
-
+      /* Ensure that the MAP dereference happens before
+	 l_tls_dtor_count decrement.  That way, we protect this access from a
+	 potential DSO unload in _dl_close_worker, which happens when
+	 l_tls_dtor_count is 0.  See CONCURRENCY NOTES for more detail.  */
+      atomic_fetch_add_release (&cur->map->l_tls_dtor_count, -1);
       free (cur);
     }
 }