about summary refs log tree commit diff
path: root/src/ldso
diff options
context:
space:
mode:
author Rich Felker <dalias@aerifal.cx> 2019-02-17 23:22:27 -0500
committer Rich Felker <dalias@aerifal.cx> 2019-02-18 21:01:16 -0500
commit 9d44b6460ab603487dab4d916342d9ba4467e6b9 (patch)
tree c7aa27a062fe7847972b204ced082217b5e8b0ad /src/ldso
parent 805288929fdf511b4044cf07c59e02e2eaa9c546 (diff)
download musl-9d44b6460ab603487dab4d916342d9ba4467e6b9.tar.gz
musl-9d44b6460ab603487dab4d916342d9ba4467e6b9.tar.xz
musl-9d44b6460ab603487dab4d916342d9ba4467e6b9.zip
install dynamic tls synchronously at dlopen, streamline access
previously, dynamic loading of new libraries with thread-local storage
allocated the storage needed for all existing threads at load-time,
precluding late failure that can't be handled, but left installation
in existing threads to take place lazily on first access. this imposed
an additional memory access and branch on every dynamic tls access,
and imposed a requirement, which was not actually met, that the
dynamic tlsdesc asm functions preserve all call-clobbered registers
before calling C code to install new dynamic tls on first access.
the x86[_64] versions of this code wrongly omitted saving and
restoring of fpu/vector registers, assuming the compiler would not
generate anything using them in the called C code. the arm and aarch64
versions saved known existing registers, but failed to be future-proof
against expansion of the register file.

now that we track live threads in a list, it's possible to install the
new dynamic tls for each thread at dlopen time. for the most part,
synchronization is not needed, because if a thread has not
synchronized with completion of the dlopen, there is no way it can
meaningfully request access to a slot past the end of the old dtv,
which remains valid for accessing slots which already existed.
however, it is necessary to ensure that, if a thread sees its new dtv
pointer, it sees correct pointers in each of the slots that existed
prior to the dlopen. my understanding is that, on most real-world
coherency architectures including all the ones we presently support, a
built-in consume order guarantees this; however, don't rely on that.
instead, the SYS_membarrier syscall is used to ensure that all threads
see the stores to the slots of their new dtv prior to the installation
of the new dtv. if it is not supported, the same is implemented in
userspace via signals, using the same mechanism as __synccall.

the __tls_get_addr function, variants, and dynamic tlsdesc asm
functions are all updated to remove the fallback paths for claiming
new dynamic tls, and are now all branch-free.
Diffstat (limited to 'src/ldso')
-rw-r--r-- src/ldso/aarch64/tlsdesc.s 59
-rw-r--r-- src/ldso/arm/tlsdesc.S 19
-rw-r--r-- src/ldso/i386/tlsdesc.s 8
-rw-r--r-- src/ldso/x86_64/tlsdesc.s 21
4 files changed, 3 insertions, 104 deletions
diff --git a/src/ldso/aarch64/tlsdesc.s b/src/ldso/aarch64/tlsdesc.s
index 8e4004d7..c91baa45 100644
--- a/src/ldso/aarch64/tlsdesc.s
+++ b/src/ldso/aarch64/tlsdesc.s
@@ -29,67 +29,10 @@ __tlsdesc_dynamic:
 	ldr x0,[x0,#8]        // p
 	ldr x2,[x0]           // p->modidx
 	ldr x3,[x1,#-8]       // dtv
-	ldr x4,[x3]           // dtv[0]
-	cmp x2,x4
-	b.hi 1f
 	ldr x2,[x3,x2,lsl #3] // dtv[p->modidx]
 	ldr x0,[x0,#8]        // p->off
 	add x0,x0,x2
-2:	sub x0,x0,x1
+	sub x0,x0,x1
 	ldp x3,x4,[sp,#16]
 	ldp x1,x2,[sp],#32
 	ret
-
-	// save all registers __tls_get_new may clobber
-	// update sp in two steps because offset must be in [-512,509]
-1:	stp x29,x30,[sp,#-160]!
-	stp x5,x6,[sp,#16]
-	stp x7,x8,[sp,#32]
-	stp x9,x10,[sp,#48]
-	stp x11,x12,[sp,#64]
-	stp x13,x14,[sp,#80]
-	stp x15,x16,[sp,#96]
-	stp x17,x18,[sp,#112]
-	stp q0,q1,[sp,#128]
-	stp q2,q3,[sp,#-480]!
-	stp q4,q5,[sp,#32]
-	stp q6,q7,[sp,#64]
-	stp q8,q9,[sp,#96]
-	stp q10,q11,[sp,#128]
-	stp q12,q13,[sp,#160]
-	stp q14,q15,[sp,#192]
-	stp q16,q17,[sp,#224]
-	stp q18,q19,[sp,#256]
-	stp q20,q21,[sp,#288]
-	stp q22,q23,[sp,#320]
-	stp q24,q25,[sp,#352]
-	stp q26,q27,[sp,#384]
-	stp q28,q29,[sp,#416]
-	stp q30,q31,[sp,#448]
-	bl __tls_get_new
-	mrs x1,tpidr_el0
-	ldp q4,q5,[sp,#32]
-	ldp q6,q7,[sp,#64]
-	ldp q8,q9,[sp,#96]
-	ldp q10,q11,[sp,#128]
-	ldp q12,q13,[sp,#160]
-	ldp q14,q15,[sp,#192]
-	ldp q16,q17,[sp,#224]
-	ldp q18,q19,[sp,#256]
-	ldp q20,q21,[sp,#288]
-	ldp q22,q23,[sp,#320]
-	ldp q24,q25,[sp,#352]
-	ldp q26,q27,[sp,#384]
-	ldp q28,q29,[sp,#416]
-	ldp q30,q31,[sp,#448]
-	ldp q2,q3,[sp],#480
-	ldp x5,x6,[sp,#16]
-	ldp x7,x8,[sp,#32]
-	ldp x9,x10,[sp,#48]
-	ldp x11,x12,[sp,#64]
-	ldp x13,x14,[sp,#80]
-	ldp x15,x16,[sp,#96]
-	ldp x17,x18,[sp,#112]
-	ldp q0,q1,[sp,#128]
-	ldp x29,x30,[sp],#160
-	b 2b
diff --git a/src/ldso/arm/tlsdesc.S b/src/ldso/arm/tlsdesc.S
index 4e67c3e2..455eac1d 100644
--- a/src/ldso/arm/tlsdesc.S
+++ b/src/ldso/arm/tlsdesc.S
@@ -35,13 +35,9 @@ __tlsdesc_dynamic:
 #endif
 #endif
 	ldr r3,[r0,#-4] // r3 = dtv
-	ldr ip,[r3]     // ip = dtv slot count
-	cmp r1,ip
-	bhi 3f
 	ldr ip,[r3,r1,LSL #2]
 	sub r0,ip,r0
 	add r0,r0,r2    // r0 = r3[r1]-r0+r2
-4:
 #if __ARM_ARCH >= 5
 	pop {r2,r3,ip,pc}
 #else
@@ -49,21 +45,6 @@ __tlsdesc_dynamic:
 	bx lr
 #endif
 
-3:
-#if __ARM_PCS_VFP || !__SOFTFP__
-	.fpu vfp
-	vpush {d0-d7}
-#endif
-	push {r0-r3}
-	add r0,sp,#4
-	bl __tls_get_new
-	pop {r1-r3,ip}
-#if __ARM_PCS_VFP || !__SOFTFP__
-	vpop {d0-d7}
-#endif
-	sub r0,r0,r1    // r0 = retval-tp
-	b 4b
-
 #if ((__ARM_ARCH_6K__ || __ARM_ARCH_6KZ__ || __ARM_ARCH_6ZK__) && !__thumb__) \
  || __ARM_ARCH_7A__ || __ARM_ARCH_7R__ || __ARM_ARCH >= 7
 #else
diff --git a/src/ldso/i386/tlsdesc.s b/src/ldso/i386/tlsdesc.s
index 4a553bce..a5c0100c 100644
--- a/src/ldso/i386/tlsdesc.s
+++ b/src/ldso/i386/tlsdesc.s
@@ -17,15 +17,9 @@ __tlsdesc_dynamic:
 	mov %gs:4,%edx
 	push %ecx
 	mov (%eax),%ecx
-	cmp %ecx,(%edx)
-	jc 1f
 	mov 4(%eax),%eax
 	add (%edx,%ecx,4),%eax
-2:	pop %ecx
+	pop %ecx
 	sub %gs:0,%eax
 	pop %edx
 	ret
-1:	push %eax
-	call __tls_get_new
-	pop %ecx
-	jmp 2b
diff --git a/src/ldso/x86_64/tlsdesc.s b/src/ldso/x86_64/tlsdesc.s
index 8238c3eb..0151d15c 100644
--- a/src/ldso/x86_64/tlsdesc.s
+++ b/src/ldso/x86_64/tlsdesc.s
@@ -17,28 +17,9 @@ __tlsdesc_dynamic:
 	mov %fs:8,%rdx
 	push %rcx
 	mov (%rax),%rcx
-	cmp %rcx,(%rdx)
-	jc 1f
 	mov 8(%rax),%rax
 	add (%rdx,%rcx,8),%rax
-2:	pop %rcx
+	pop %rcx
 	sub %fs:0,%rax
 	pop %rdx
 	ret
-1:	push %rdi
-	push %rdi
-	push %rsi
-	push %r8
-	push %r9
-	push %r10
-	push %r11
-	mov %rax,%rdi
-	call __tls_get_new
-	pop %r11
-	pop %r10
-	pop %r9
-	pop %r8
-	pop %rsi
-	pop %rdi
-	pop %rdi
-	jmp 2b