about summary refs log tree commit diff
diff options
context:
space:
mode:
author Rich Felker <dalias@aerifal.cx> 2016-01-21 19:08:54 +0000
committer Rich Felker <dalias@aerifal.cx> 2016-01-21 19:08:54 +0000
commit 1315596b510189b5159e742110b504177bdd4932 (patch)
tree 27159b7b95b944671454b11f36ee13308241f4b5
parent ce3e24eaae91e7a90f87eb7f1edea8df5942de11 (diff)
download musl-1315596b510189b5159e742110b504177bdd4932.tar.gz
musl-1315596b510189b5159e742110b504177bdd4932.tar.xz
musl-1315596b510189b5159e742110b504177bdd4932.zip
refactor internal atomic.h
rather than having each arch provide its own atomic.h, there is a new
shared atomic.h in src/internal which pulls arch-specific definitions
from arch/$(ARCH)/atomic_arch.h. the latter can be extremely minimal,
defining only a_cas or new ll/sc type primitives which the shared
atomic.h will use to construct everything else.

this commit avoids making heavy changes to the individual archs'
atomic implementations. definitions which are identical or
near-identical to what the new shared atomic.h would produce have been
removed, but otherwise the changes made are just hooking up the
arch-specific files to the new infrastructure. major changes to take
advantage of the new system will come in subsequent commits.
-rw-r--r-- arch/aarch64/atomic_arch.h (renamed from arch/aarch64/atomic.h) 32
-rw-r--r-- arch/arm/atomic_arch.h (renamed from arch/arm/atomic.h) 115
-rw-r--r-- arch/i386/atomic_arch.h (renamed from arch/i386/atomic.h) 33
-rw-r--r-- arch/microblaze/atomic.h 143
-rw-r--r-- arch/microblaze/atomic_arch.h 53
-rw-r--r-- arch/mips/atomic.h 205
-rw-r--r-- arch/mips/atomic_arch.h 61
-rw-r--r-- arch/or1k/atomic.h 120
-rw-r--r-- arch/or1k/atomic_arch.h 14
-rw-r--r-- arch/powerpc/atomic.h 126
-rw-r--r-- arch/powerpc/atomic_arch.h 15
-rw-r--r-- arch/sh/atomic_arch.h (renamed from arch/sh/atomic.h) 72
-rw-r--r-- arch/x32/atomic_arch.h (renamed from arch/x32/atomic.h) 31
-rw-r--r-- arch/x86_64/atomic_arch.h (renamed from arch/x86_64/atomic.h) 30
-rw-r--r-- src/internal/atomic.h 275
15 files changed, 491 insertions, 834 deletions
diff --git a/arch/aarch64/atomic.h b/arch/aarch64/atomic_arch.h
index e7c82c2e..0755534f 100644
--- a/arch/aarch64/atomic.h
+++ b/arch/aarch64/atomic_arch.h
@@ -1,8 +1,4 @@
-#ifndef _INTERNAL_ATOMIC_H
-#define _INTERNAL_ATOMIC_H
-
-#include <stdint.h>
-
+#define a_ctz_64 a_ctz_64
 static inline int a_ctz_64(uint64_t x)
 {
 	__asm__(
@@ -12,16 +8,13 @@ static inline int a_ctz_64(uint64_t x)
 	return x;
 }
 
-static inline int a_ctz_l(unsigned long x)
-{
-	return a_ctz_64(x);
-}
-
+#define a_barrier a_barrier
 static inline void a_barrier()
 {
 	__asm__ __volatile__("dmb ish");
 }
 
+#define a_cas_p a_cas_p
 static inline void *a_cas_p(volatile void *p, void *t, void *s)
 {
 	void *old;
@@ -40,6 +33,7 @@ static inline void *a_cas_p(volatile void *p, void *t, void *s)
 	return old;
 }
 
+#define a_cas a_cas
 static inline int a_cas(volatile int *p, int t, int s)
 {
 	int old;
@@ -58,6 +52,7 @@ static inline int a_cas(volatile int *p, int t, int s)
 	return old;
 }
 
+#define a_swap a_swap
 static inline int a_swap(volatile int *x, int v)
 {
 	int old, tmp;
@@ -73,6 +68,7 @@ static inline int a_swap(volatile int *x, int v)
 	return old;
 }
 
+#define a_fetch_add a_fetch_add
 static inline int a_fetch_add(volatile int *x, int v)
 {
 	int old, tmp;
@@ -89,6 +85,7 @@ static inline int a_fetch_add(volatile int *x, int v)
 	return old-v;
 }
 
+#define a_inc a_inc
 static inline void a_inc(volatile int *x)
 {
 	int tmp, tmp2;
@@ -104,6 +101,7 @@ static inline void a_inc(volatile int *x)
 		: "memory", "cc" );
 }
 
+#define a_dec a_dec
 static inline void a_dec(volatile int *x)
 {
 	int tmp, tmp2;
@@ -119,6 +117,7 @@ static inline void a_dec(volatile int *x)
 		: "memory", "cc" );
 }
 
+#define a_and_64 a_and_64
 static inline void a_and_64(volatile uint64_t *p, uint64_t v)
 {
 	int tmp, tmp2;
@@ -134,6 +133,7 @@ static inline void a_and_64(volatile uint64_t *p, uint64_t v)
 		: "memory", "cc" );
 }
 
+#define a_and a_and
 static inline void a_and(volatile int *p, int v)
 {
 	int tmp, tmp2;
@@ -149,6 +149,7 @@ static inline void a_and(volatile int *p, int v)
 		: "memory", "cc" );
 }
 
+#define a_or_64 a_or_64
 static inline void a_or_64(volatile uint64_t *p, uint64_t v)
 {
 	int tmp, tmp2;
@@ -164,11 +165,13 @@ static inline void a_or_64(volatile uint64_t *p, uint64_t v)
 		: "memory", "cc" );
 }
 
+#define a_or_l a_or_l
 static inline void a_or_l(volatile void *p, long v)
 {
 	return a_or_64(p, v);
 }
 
+#define a_or a_or
 static inline void a_or(volatile int *p, int v)
 {
 	int tmp, tmp2;
@@ -184,6 +187,7 @@ static inline void a_or(volatile int *p, int v)
 		: "memory", "cc" );
 }
 
+#define a_store a_store
 static inline void a_store(volatile int *p, int x)
 {
 	__asm__ __volatile__(
@@ -196,11 +200,3 @@ static inline void a_store(volatile int *p, int x)
 }
 
 #define a_spin a_barrier
-
-static inline void a_crash()
-{
-	*(volatile char *)0=0;
-}
-
-
-#endif
diff --git a/arch/arm/atomic.h b/arch/arm/atomic_arch.h
index 8ae35bb7..5ab20a55 100644
--- a/arch/arm/atomic.h
+++ b/arch/arm/atomic_arch.h
@@ -1,34 +1,12 @@
-#ifndef _INTERNAL_ATOMIC_H
-#define _INTERNAL_ATOMIC_H
-
-#include <stdint.h>
-
-static inline int a_ctz_l(unsigned long x)
-{
-	static const char debruijn32[32] = {
-		0, 1, 23, 2, 29, 24, 19, 3, 30, 27, 25, 11, 20, 8, 4, 13,
-		31, 22, 28, 18, 26, 10, 7, 12, 21, 17, 9, 6, 16, 5, 15, 14
-	};
-	return debruijn32[(x&-x)*0x076be629 >> 27];
-}
-
-static inline int a_ctz_64(uint64_t x)
-{
-	uint32_t y = x;
-	if (!y) {
-		y = x>>32;
-		return 32 + a_ctz_l(y);
-	}
-	return a_ctz_l(y);
-}
-
 #if __ARM_ARCH_7A__ || __ARM_ARCH_7R__ ||  __ARM_ARCH >= 7
 
+#define a_barrier a_barrier
 static inline void a_barrier()
 {
 	__asm__ __volatile__("dmb ish");
 }
 
+#define a_cas a_cas
 static inline int a_cas(volatile int *p, int t, int s)
 {
 	int old;
@@ -48,6 +26,7 @@ static inline int a_cas(volatile int *p, int t, int s)
 	return old;
 }
 
+#define a_swap a_swap
 static inline int a_swap(volatile int *x, int v)
 {
 	int old, tmp;
@@ -64,6 +43,7 @@ static inline int a_swap(volatile int *x, int v)
 	return old;
 }
 
+#define a_fetch_add a_fetch_add
 static inline int a_fetch_add(volatile int *x, int v)
 {
 	int old, tmp;
@@ -81,6 +61,7 @@ static inline int a_fetch_add(volatile int *x, int v)
 	return old-v;
 }
 
+#define a_inc a_inc
 static inline void a_inc(volatile int *x)
 {
 	int tmp, tmp2;
@@ -97,6 +78,7 @@ static inline void a_inc(volatile int *x)
 		: "memory", "cc" );
 }
 
+#define a_dec a_dec
 static inline void a_dec(volatile int *x)
 {
 	int tmp, tmp2;
@@ -113,6 +95,7 @@ static inline void a_dec(volatile int *x)
 		: "memory", "cc" );
 }
 
+#define a_and a_and
 static inline void a_and(volatile int *x, int v)
 {
 	int tmp, tmp2;
@@ -129,6 +112,7 @@ static inline void a_and(volatile int *x, int v)
 		: "memory", "cc" );
 }
 
+#define a_or a_or
 static inline void a_or(volatile int *x, int v)
 {
 	int tmp, tmp2;
@@ -145,6 +129,7 @@ static inline void a_or(volatile int *x, int v)
 		: "memory", "cc" );
 }
 
+#define a_store a_store
 static inline void a_store(volatile int *p, int x)
 {
 	__asm__ __volatile__(
@@ -161,12 +146,14 @@ static inline void a_store(volatile int *p, int x)
 int __a_cas(int, int, volatile int *) __attribute__((__visibility__("hidden")));
 #define __k_cas __a_cas
 
+#define a_barrier a_barrier
 static inline void a_barrier()
 {
 	__asm__ __volatile__("bl __a_barrier"
 		: : : "memory", "cc", "ip", "lr" );
 }
 
+#define a_cas a_cas
 static inline int a_cas(volatile int *p, int t, int s)
 {
 	int old;
@@ -178,84 +165,4 @@ static inline int a_cas(volatile int *p, int t, int s)
 	}
 }
 
-static inline int a_swap(volatile int *x, int v)
-{
-	int old;
-	do old = *x;
-	while (__k_cas(old, v, x));
-	return old;
-}
-
-static inline int a_fetch_add(volatile int *x, int v)
-{
-	int old;
-	do old = *x;
-	while (__k_cas(old, old+v, x));
-	return old;
-}
-
-static inline void a_inc(volatile int *x)
-{
-	a_fetch_add(x, 1);
-}
-
-static inline void a_dec(volatile int *x)
-{
-	a_fetch_add(x, -1);
-}
-
-static inline void a_store(volatile int *p, int x)
-{
-	a_barrier();
-	*p = x;
-	a_barrier();
-}
-
-static inline void a_and(volatile int *p, int v)
-{
-	int old;
-	do old = *p;
-	while (__k_cas(old, old&v, p));
-}
-
-static inline void a_or(volatile int *p, int v)
-{
-	int old;
-	do old = *p;
-	while (__k_cas(old, old|v, p));
-}
-
-#endif
-
-static inline void *a_cas_p(volatile void *p, void *t, void *s)
-{
-	return (void *)a_cas(p, (int)t, (int)s);
-}
-
-#define a_spin a_barrier
-
-static inline void a_crash()
-{
-	*(volatile char *)0=0;
-}
-
-static inline void a_or_l(volatile void *p, long v)
-{
-	a_or(p, v);
-}
-
-static inline void a_and_64(volatile uint64_t *p, uint64_t v)
-{
-	union { uint64_t v; uint32_t r[2]; } u = { v };
-	a_and((int *)p, u.r[0]);
-	a_and((int *)p+1, u.r[1]);
-}
-
-static inline void a_or_64(volatile uint64_t *p, uint64_t v)
-{
-	union { uint64_t v; uint32_t r[2]; } u = { v };
-	a_or((int *)p, u.r[0]);
-	a_or((int *)p+1, u.r[1]);
-}
-
 #endif
diff --git a/arch/i386/atomic.h b/arch/i386/atomic_arch.h
index fd222eae..5de862ed 100644
--- a/arch/i386/atomic.h
+++ b/arch/i386/atomic_arch.h
@@ -1,8 +1,4 @@
-#ifndef _INTERNAL_ATOMIC_H
-#define _INTERNAL_ATOMIC_H
-
-#include <stdint.h>
-
+#define a_ctz_64 a_ctz_64
 static inline int a_ctz_64(uint64_t x)
 {
 	int r;
@@ -11,6 +7,7 @@ static inline int a_ctz_64(uint64_t x)
 	return r;
 }
 
+#define a_ctz_l a_ctz_l
 static inline int a_ctz_l(unsigned long x)
 {
 	long r;
@@ -18,31 +15,28 @@ static inline int a_ctz_l(unsigned long x)
 	return r;
 }
 
+#define a_and_64 a_and_64
 static inline void a_and_64(volatile uint64_t *p, uint64_t v)
 {
 	__asm__( "lock ; andl %1, (%0) ; lock ; andl %2, 4(%0)"
 		: : "r"((long *)p), "r"((unsigned)v), "r"((unsigned)(v>>32)) : "memory" );
 }
 
+#define a_or_64 a_or_64
 static inline void a_or_64(volatile uint64_t *p, uint64_t v)
 {
 	__asm__( "lock ; orl %1, (%0) ; lock ; orl %2, 4(%0)"
 		: : "r"((long *)p), "r"((unsigned)v), "r"((unsigned)(v>>32)) : "memory" );
 }
 
+#define a_or_l a_or_l
 static inline void a_or_l(volatile void *p, long v)
 {
 	__asm__( "lock ; orl %1, %0"
 		: "=m"(*(long *)p) : "r"(v) : "memory" );
 }
 
-static inline void *a_cas_p(volatile void *p, void *t, void *s)
-{
-	__asm__( "lock ; cmpxchg %3, %1"
-		: "=a"(t), "=m"(*(long *)p) : "a"(t), "r"(s) : "memory" );
-	return t;
-}
-
+#define a_cas a_cas
 static inline int a_cas(volatile int *p, int t, int s)
 {
 	__asm__( "lock ; cmpxchg %3, %1"
@@ -50,61 +44,66 @@ static inline int a_cas(volatile int *p, int t, int s)
 	return t;
 }
 
+#define a_or a_or
 static inline void a_or(volatile int *p, int v)
 {
 	__asm__( "lock ; orl %1, %0"
 		: "=m"(*p) : "r"(v) : "memory" );
 }
 
+#define a_and a_and
 static inline void a_and(volatile int *p, int v)
 {
 	__asm__( "lock ; andl %1, %0"
 		: "=m"(*p) : "r"(v) : "memory" );
 }
 
+#define a_swap a_swap
 static inline int a_swap(volatile int *x, int v)
 {
 	__asm__( "xchg %0, %1" : "=r"(v), "=m"(*x) : "0"(v) : "memory" );
 	return v;
 }
 
-#define a_xchg a_swap
-
+#define a_fetch_add a_fetch_add
 static inline int a_fetch_add(volatile int *x, int v)
 {
 	__asm__( "lock ; xadd %0, %1" : "=r"(v), "=m"(*x) : "0"(v) : "memory" );
 	return v;
 }
 
+#define a_inc a_inc
 static inline void a_inc(volatile int *x)
 {
 	__asm__( "lock ; incl %0" : "=m"(*x) : "m"(*x) : "memory" );
 }
 
+#define a_dec a_dec
 static inline void a_dec(volatile int *x)
 {
 	__asm__( "lock ; decl %0" : "=m"(*x) : "m"(*x) : "memory" );
 }
 
+#define a_store a_store
 static inline void a_store(volatile int *p, int x)
 {
 	__asm__( "movl %1, %0 ; lock ; orl $0,(%%esp)" : "=m"(*p) : "r"(x) : "memory" );
 }
 
+#define a_spin a_spin
 static inline void a_spin()
 {
 	__asm__ __volatile__( "pause" : : : "memory" );
 }
 
+#define a_barrier a_barrier
 static inline void a_barrier()
 {
 	__asm__ __volatile__( "" : : : "memory" );
 }
 
+#define a_crash a_crash
 static inline void a_crash()
 {
 	__asm__ __volatile__( "hlt" : : : "memory" );
 }
-
-
-#endif
diff --git a/arch/microblaze/atomic.h b/arch/microblaze/atomic.h
deleted file mode 100644
index 93404b94..00000000
--- a/arch/microblaze/atomic.h
+++ /dev/null
@@ -1,143 +0,0 @@
-#ifndef _INTERNAL_ATOMIC_H
-#define _INTERNAL_ATOMIC_H
-
-#include <stdint.h>
-
-static inline int a_ctz_l(unsigned long x)
-{
-	static const char debruijn32[32] = {
-		0, 1, 23, 2, 29, 24, 19, 3, 30, 27, 25, 11, 20, 8, 4, 13,
-		31, 22, 28, 18, 26, 10, 7, 12, 21, 17, 9, 6, 16, 5, 15, 14
-	};
-	return debruijn32[(x&-x)*0x076be629 >> 27];
-}
-
-static inline int a_ctz_64(uint64_t x)
-{
-	uint32_t y = x;
-	if (!y) {
-		y = x>>32;
-		return 32 + a_ctz_l(y);
-	}
-	return a_ctz_l(y);
-}
-
-static inline int a_cas(volatile int *p, int t, int s)
-{
-	register int old, tmp;
-	__asm__ __volatile__ (
-		"	addi %0, r0, 0\n"
-		"1:	lwx %0, %2, r0\n"
-		"	rsubk %1, %0, %3\n"
-		"	bnei %1, 1f\n"
-		"	swx %4, %2, r0\n"
-		"	addic %1, r0, 0\n"
-		"	bnei %1, 1b\n"
-		"1:	"
-		: "=&r"(old), "=&r"(tmp)
-		: "r"(p), "r"(t), "r"(s)
-		: "cc", "memory" );
-	return old;
-}
-
-static inline void *a_cas_p(volatile void *p, void *t, void *s)
-{
-	return (void *)a_cas(p, (int)t, (int)s);
-}
-
-static inline int a_swap(volatile int *x, int v)
-{
-	register int old, tmp;
-	__asm__ __volatile__ (
-		"	addi %0, r0, 0\n"
-		"1:	lwx %0, %2, r0\n"
-		"	swx %3, %2, r0\n"
-		"	addic %1, r0, 0\n"
-		"	bnei %1, 1b\n"
-		"1:	"
-		: "=&r"(old), "=&r"(tmp)
-		: "r"(x), "r"(v)
-		: "cc", "memory" );
-	return old;
-}
-
-static inline int a_fetch_add(volatile int *x, int v)
-{
-	register int new, tmp;
-	__asm__ __volatile__ (
-		"	addi %0, r0, 0\n"
-		"1:	lwx %0, %2, r0\n"
-		"	addk %0, %0, %3\n"
-		"	swx %0, %2, r0\n"
-		"	addic %1, r0, 0\n"
-		"	bnei %1, 1b\n"
-		"1:	"
-		: "=&r"(new), "=&r"(tmp)
-		: "r"(x), "r"(v)
-		: "cc", "memory" );
-	return new-v;
-}
-
-static inline void a_inc(volatile int *x)
-{
-	a_fetch_add(x, 1);
-}
-
-static inline void a_dec(volatile int *x)
-{
-	a_fetch_add(x, -1);
-}
-
-static inline void a_store(volatile int *p, int x)
-{
-	__asm__ __volatile__ (
-		"swi %1, %0"
-		: "=m"(*p) : "r"(x) : "memory" );
-}
-
-#define a_spin a_barrier
-
-static inline void a_barrier()
-{
-	a_cas(&(int){0}, 0, 0);
-}
-
-static inline void a_crash()
-{
-	*(volatile char *)0=0;
-}
-
-static inline void a_and(volatile int *p, int v)
-{
-	int old;
-	do old = *p;
-	while (a_cas(p, old, old&v) != old);
-}
-
-static inline void a_or(volatile int *p, int v)
-{
-	int old;
-	do old = *p;
-	while (a_cas(p, old, old|v) != old);
-}
-
-static inline void a_or_l(volatile void *p, long v)
-{
-	a_or(p, v);
-}
-
-static inline void a_and_64(volatile uint64_t *p, uint64_t v)
-{
-	union { uint64_t v; uint32_t r[2]; } u = { v };
-	a_and((int *)p, u.r[0]);
-	a_and((int *)p+1, u.r[1]);
-}
-
-static inline void a_or_64(volatile uint64_t *p, uint64_t v)
-{
-	union { uint64_t v; uint32_t r[2]; } u = { v };
-	a_or((int *)p, u.r[0]);
-	a_or((int *)p+1, u.r[1]);
-}
-
-#endif
diff --git a/arch/microblaze/atomic_arch.h b/arch/microblaze/atomic_arch.h
new file mode 100644
index 00000000..1152e8cd
--- /dev/null
+++ b/arch/microblaze/atomic_arch.h
@@ -0,0 +1,53 @@
+#define a_cas a_cas
+static inline int a_cas(volatile int *p, int t, int s)
+{
+	register int old, tmp;
+	__asm__ __volatile__ (
+		"	addi %0, r0, 0\n"
+		"1:	lwx %0, %2, r0\n"
+		"	rsubk %1, %0, %3\n"
+		"	bnei %1, 1f\n"
+		"	swx %4, %2, r0\n"
+		"	addic %1, r0, 0\n"
+		"	bnei %1, 1b\n"
+		"1:	"
+		: "=&r"(old), "=&r"(tmp)
+		: "r"(p), "r"(t), "r"(s)
+		: "cc", "memory" );
+	return old;
+}
+
+#define a_swap a_swap
+static inline int a_swap(volatile int *x, int v)
+{
+	register int old, tmp;
+	__asm__ __volatile__ (
+		"	addi %0, r0, 0\n"
+		"1:	lwx %0, %2, r0\n"
+		"	swx %3, %2, r0\n"
+		"	addic %1, r0, 0\n"
+		"	bnei %1, 1b\n"
+		"1:	"
+		: "=&r"(old), "=&r"(tmp)
+		: "r"(x), "r"(v)
+		: "cc", "memory" );
+	return old;
+}
+
+#define a_fetch_add a_fetch_add
+static inline int a_fetch_add(volatile int *x, int v)
+{
+	register int new, tmp;
+	__asm__ __volatile__ (
+		"	addi %0, r0, 0\n"
+		"1:	lwx %0, %2, r0\n"
+		"	addk %0, %0, %3\n"
+		"	swx %0, %2, r0\n"
+		"	addic %1, r0, 0\n"
+		"	bnei %1, 1b\n"
+		"1:	"
+		: "=&r"(new), "=&r"(tmp)
+		: "r"(x), "r"(v)
+		: "cc", "memory" );
+	return new-v;
+}
diff --git a/arch/mips/atomic.h b/arch/mips/atomic.h
deleted file mode 100644
index c82046a8..00000000
--- a/arch/mips/atomic.h
+++ /dev/null
@@ -1,205 +0,0 @@
-#ifndef _INTERNAL_ATOMIC_H
-#define _INTERNAL_ATOMIC_H
-
-#include <stdint.h>
-
-static inline int a_ctz_l(unsigned long x)
-{
-	static const char debruijn32[32] = {
-		0, 1, 23, 2, 29, 24, 19, 3, 30, 27, 25, 11, 20, 8, 4, 13,
-		31, 22, 28, 18, 26, 10, 7, 12, 21, 17, 9, 6, 16, 5, 15, 14
-	};
-	return debruijn32[(x&-x)*0x076be629 >> 27];
-}
-
-static inline int a_ctz_64(uint64_t x)
-{
-	uint32_t y = x;
-	if (!y) {
-		y = x>>32;
-		return 32 + a_ctz_l(y);
-	}
-	return a_ctz_l(y);
-}
-
-static inline int a_cas(volatile int *p, int t, int s)
-{
-	int dummy;
-	__asm__ __volatile__(
-		".set push\n"
-		".set mips2\n"
-		".set noreorder\n"
-		"	sync\n"
-		"1:	ll %0, %2\n"
-		"	bne %0, %3, 1f\n"
-		"	addu %1, %4, $0\n"
-		"	sc %1, %2\n"
-		"	beq %1, $0, 1b\n"
-		"	nop\n"
-		"	sync\n"
-		"1:	\n"
-		".set pop\n"
-		: "=&r"(t), "=&r"(dummy), "+m"(*p) : "r"(t), "r"(s) : "memory" );
-        return t;
-}
-
-static inline void *a_cas_p(volatile void *p, void *t, void *s)
-{
-	return (void *)a_cas(p, (int)t, (int)s);
-}
-
-static inline int a_swap(volatile int *x, int v)
-{
-	int old, dummy;
-	__asm__ __volatile__(
-		".set push\n"
-		".set mips2\n"
-		".set noreorder\n"
-		"	sync\n"
-		"1:	ll %0, %2\n"
-		"	addu %1, %3, $0\n"
-		"	sc %1, %2\n"
-		"	beq %1, $0, 1b\n"
-		"	nop\n"
-		"	sync\n"
-		".set pop\n"
-		: "=&r"(old), "=&r"(dummy), "+m"(*x) : "r"(v) : "memory" );
-        return old;
-}
-
-static inline int a_fetch_add(volatile int *x, int v)
-{
-	int old, dummy;
-	__asm__ __volatile__(
-		".set push\n"
-		".set mips2\n"
-		".set noreorder\n"
-		"	sync\n"
-		"1:	ll %0, %2\n"
-		"	addu %1, %0, %3\n"
-		"	sc %1, %2\n"
-		"	beq %1, $0, 1b\n"
-		"	nop\n"
-		"	sync\n"
-		".set pop\n"
-		: "=&r"(old), "=&r"(dummy), "+m"(*x) : "r"(v) : "memory" );
-        return old;
-}
-
-static inline void a_inc(volatile int *x)
-{
-	int dummy;
-	__asm__ __volatile__(
-		".set push\n"
-		".set mips2\n"
-		".set noreorder\n"
-		"	sync\n"
-		"1:	ll %0, %1\n"
-		"	addu %0, %0, 1\n"
-		"	sc %0, %1\n"
-		"	beq %0, $0, 1b\n"
-		"	nop\n"
-		"	sync\n"
-		".set pop\n"
-		: "=&r"(dummy), "+m"(*x) : : "memory" );
-}
-
-static inline void a_dec(volatile int *x)
-{
-	int dummy;
-	__asm__ __volatile__(
-		".set push\n"
-		".set mips2\n"
-		".set noreorder\n"
-		"	sync\n"
-		"1:	ll %0, %1\n"
-		"	subu %0, %0, 1\n"
-		"	sc %0, %1\n"
-		"	beq %0, $0, 1b\n"
-		"	nop\n"
-		"	sync\n"
-		".set pop\n"
-		: "=&r"(dummy), "+m"(*x) : : "memory" );
-}
-
-static inline void a_store(volatile int *p, int x)
-{
-	__asm__ __volatile__(
-		".set push\n"
-		".set mips2\n"
-		".set noreorder\n"
-		"	sync\n"
-		"	sw %1, %0\n"
-		"	sync\n"
-		".set pop\n"
-		: "+m"(*p) : "r"(x) : "memory" );
-}
-
-#define a_spin a_barrier
-
-static inline void a_barrier()
-{
-	a_cas(&(int){0}, 0, 0);
-}
-
-static inline void a_crash()
-{
-	*(volatile char *)0=0;
-}
-
-static inline void a_and(volatile int *p, int v)
-{
-	int dummy;
-	__asm__ __volatile__(
-		".set push\n"
-		".set mips2\n"
-		".set noreorder\n"
-		"	sync\n"
-		"1:	ll %0, %1\n"
-		"	and %0, %0, %2\n"
-		"	sc %0, %1\n"
-		"	beq %0, $0, 1b\n"
-		"	nop\n"
-		"	sync\n"
-		".set pop\n"
-		: "=&r"(dummy), "+m"(*p) : "r"(v) : "memory" );
-}
-
-static inline void a_or(volatile int *p, int v)
-{
-	int dummy;
-	__asm__ __volatile__(
-		".set push\n"
-		".set mips2\n"
-		".set noreorder\n"
-		"	sync\n"
-		"1:	ll %0, %1\n"
-		"	or %0, %0, %2\n"
-		"	sc %0, %1\n"
-		"	beq %0, $0, 1b\n"
-		"	nop\n"
-		"	sync\n"
-		".set pop\n"
-		: "=&r"(dummy), "+m"(*p) : "r"(v) : "memory" );
-}
-
-static inline void a_or_l(volatile void *p, long v)
-{
-	a_or(p, v);
-}
-
-static inline void a_and_64(volatile uint64_t *p, uint64_t v)
-{
-	union { uint64_t v; uint32_t r[2]; } u = { v };
-	a_and((int *)p, u.r[0]);
-	a_and((int *)p+1, u.r[1]);
-}
-
-static inline void a_or_64(volatile uint64_t *p, uint64_t v)
-{
-	union { uint64_t v; uint32_t r[2]; } u = { v };
-	a_or((int *)p, u.r[0]);
-	a_or((int *)p+1, u.r[1]);
-}
-
-#endif
diff --git a/arch/mips/atomic_arch.h b/arch/mips/atomic_arch.h
new file mode 100644
index 00000000..b111c894
--- /dev/null
+++ b/arch/mips/atomic_arch.h
@@ -0,0 +1,61 @@
+#define a_cas a_cas
+static inline int a_cas(volatile int *p, int t, int s)
+{
+	int dummy;
+	__asm__ __volatile__(
+		".set push\n"
+		".set mips2\n"
+		".set noreorder\n"
+		"	sync\n"
+		"1:	ll %0, %2\n"
+		"	bne %0, %3, 1f\n"
+		"	addu %1, %4, $0\n"
+		"	sc %1, %2\n"
+		"	beq %1, $0, 1b\n"
+		"	nop\n"
+		"	sync\n"
+		"1:	\n"
+		".set pop\n"
+		: "=&r"(t), "=&r"(dummy), "+m"(*p) : "r"(t), "r"(s) : "memory" );
+        return t;
+}
+
+#define a_swap a_swap
+static inline int a_swap(volatile int *x, int v)
+{
+	int old, dummy;
+	__asm__ __volatile__(
+		".set push\n"
+		".set mips2\n"
+		".set noreorder\n"
+		"	sync\n"
+		"1:	ll %0, %2\n"
+		"	addu %1, %3, $0\n"
+		"	sc %1, %2\n"
+		"	beq %1, $0, 1b\n"
+		"	nop\n"
+		"	sync\n"
+		".set pop\n"
+		: "=&r"(old), "=&r"(dummy), "+m"(*x) : "r"(v) : "memory" );
+        return old;
+}
+
+#define a_fetch_add a_fetch_add
+static inline int a_fetch_add(volatile int *x, int v)
+{
+	int old, dummy;
+	__asm__ __volatile__(
+		".set push\n"
+		".set mips2\n"
+		".set noreorder\n"
+		"	sync\n"
+		"1:	ll %0, %2\n"
+		"	addu %1, %0, %3\n"
+		"	sc %1, %2\n"
+		"	beq %1, $0, 1b\n"
+		"	nop\n"
+		"	sync\n"
+		".set pop\n"
+		: "=&r"(old), "=&r"(dummy), "+m"(*x) : "r"(v) : "memory" );
+        return old;
+}
diff --git a/arch/or1k/atomic.h b/arch/or1k/atomic.h
deleted file mode 100644
index 640ff430..00000000
--- a/arch/or1k/atomic.h
+++ /dev/null
@@ -1,120 +0,0 @@
-#ifndef _INTERNAL_ATOMIC_H
-#define _INTERNAL_ATOMIC_H
-
-#include <stdint.h>
-
-static inline int a_ctz_l(unsigned long x)
-{
-	static const char debruijn32[32] = {
-		0, 1, 23, 2, 29, 24, 19, 3, 30, 27, 25, 11, 20, 8, 4, 13,
-		31, 22, 28, 18, 26, 10, 7, 12, 21, 17, 9, 6, 16, 5, 15, 14
-	};
-	return debruijn32[(x&-x)*0x076be629 >> 27];
-}
-
-static inline int a_ctz_64(uint64_t x)
-{
-	uint32_t y = x;
-	if (!y) {
-		y = x>>32;
-		return 32 + a_ctz_l(y);
-	}
-	return a_ctz_l(y);
-}
-
-static inline int a_cas(volatile int *p, int t, int s)
-{
-	__asm__("1:	l.lwa %0, %1\n"
-		"	l.sfeq %0, %2\n"
-		"	l.bnf 1f\n"
-		"	 l.nop\n"
-		"	l.swa %1, %3\n"
-		"	l.bnf 1b\n"
-		"	 l.nop\n"
-		"1:	\n"
-		: "=&r"(t), "+m"(*p) : "r"(t), "r"(s) : "cc", "memory" );
-        return t;
-}
-
-static inline void *a_cas_p(volatile void *p, void *t, void *s)
-{
-	return (void *)a_cas(p, (int)t, (int)s);
-}
-
-static inline int a_swap(volatile int *x, int v)
-{
-	int old;
-	do old = *x;
-	while (a_cas(x, old, v) != old);
-	return old;
-}
-
-static inline int a_fetch_add(volatile int *x, int v)
-{
-	int old;
-	do old = *x;
-	while (a_cas(x, old, old+v) != old);
-	return old;
-}
-
-static inline void a_inc(volatile int *x)
-{
-	a_fetch_add(x, 1);
-}
-
-static inline void a_dec(volatile int *x)
-{
-	a_fetch_add(x, -1);
-}
-
-static inline void a_store(volatile int *p, int x)
-{
-	a_swap(p, x);
-}
-
-#define a_spin a_barrier
-
-static inline void a_barrier()
-{
-	a_cas(&(int){0}, 0, 0);
-}
-
-static inline void a_crash()
-{
-	*(volatile char *)0=0;
-}
-
-static inline void a_and(volatile int *p, int v)
-{
-	int old;
-	do old = *p;
-	while (a_cas(p, old, old&v) != old);
-}
-
-static inline void a_or(volatile int *p, int v)
-{
-	int old;
-	do old = *p;
-	while (a_cas(p, old, old|v) != old);
-}
-
-static inline void a_or_l(volatile void *p, long v)
-{
-	a_or(p, v);
-}
-
-static inline void a_and_64(volatile uint64_t *p, uint64_t v)
-{
-	union { uint64_t v; uint32_t r[2]; } u = { v };
-	a_and((int *)p, u.r[0]);
-	a_and((int *)p+1, u.r[1]);
-}
-
-static inline void a_or_64(volatile uint64_t *p, uint64_t v)
-{
-	union { uint64_t v; uint32_t r[2]; } u = { v };
-	a_or((int *)p, u.r[0]);
-	a_or((int *)p+1, u.r[1]);
-}
-
-#endif
diff --git a/arch/or1k/atomic_arch.h b/arch/or1k/atomic_arch.h
new file mode 100644
index 00000000..11a54292
--- /dev/null
+++ b/arch/or1k/atomic_arch.h
@@ -0,0 +1,14 @@
+#define a_cas a_cas
+static inline int a_cas(volatile int *p, int t, int s)
+{
+	__asm__("1:	l.lwa %0, %1\n"
+		"	l.sfeq %0, %2\n"
+		"	l.bnf 1f\n"
+		"	 l.nop\n"
+		"	l.swa %1, %3\n"
+		"	l.bnf 1b\n"
+		"	 l.nop\n"
+		"1:	\n"
+		: "=&r"(t), "+m"(*p) : "r"(t), "r"(s) : "cc", "memory" );
+        return t;
+}
diff --git a/arch/powerpc/atomic.h b/arch/powerpc/atomic.h
deleted file mode 100644
index f706543a..00000000
--- a/arch/powerpc/atomic.h
+++ /dev/null
@@ -1,126 +0,0 @@
-#ifndef _INTERNAL_ATOMIC_H
-#define _INTERNAL_ATOMIC_H
-
-#include <stdint.h>
-#include <endian.h>
-
-static inline int a_ctz_l(unsigned long x)
-{
-	static const char debruijn32[32] = {
-		0, 1, 23, 2, 29, 24, 19, 3, 30, 27, 25, 11, 20, 8, 4, 13,
-		31, 22, 28, 18, 26, 10, 7, 12, 21, 17, 9, 6, 16, 5, 15, 14
-	};
-	return debruijn32[(x&-x)*0x076be629 >> 27];
-}
-
-static inline int a_ctz_64(uint64_t x)
-{
-	uint32_t y = x;
-	if (!y) {
-		y = x>>32;
-		return 32 + a_ctz_l(y);
-	}
-	return a_ctz_l(y);
-}
-
-static inline int a_cas(volatile int *p, int t, int s)
-{
-	__asm__("\n"
-		"	sync\n"
-		"1:	lwarx %0, 0, %4\n"
-		"	cmpw %0, %2\n"
-		"	bne 1f\n"
-		"	stwcx. %3, 0, %4\n"
-		"	bne- 1b\n"
-		"	isync\n"
-		"1:	\n"
-		: "=&r"(t), "+m"(*p) : "r"(t), "r"(s), "r"(p) : "cc", "memory" );
-        return t;
-}
-
-static inline void *a_cas_p(volatile void *p, void *t, void *s)
-{
-	return (void *)a_cas(p, (int)t, (int)s);
-}
-
-static inline int a_swap(volatile int *x, int v)
-{
-	int old;
-	do old = *x;
-	while (a_cas(x, old, v) != old);
-	return old;
-}
-
-static inline int a_fetch_add(volatile int *x, int v)
-{
-	int old;
-	do old = *x;
-	while (a_cas(x, old, old+v) != old);
-	return old;
-}
-
-static inline void a_inc(volatile int *x)
-{
-	a_fetch_add(x, 1);
-}
-
-static inline void a_dec(volatile int *x)
-{
-	a_fetch_add(x, -1);
-}
-
-static inline void a_store(volatile int *p, int x)
-{
-	__asm__ __volatile__ ("\n"
-		"	sync\n"
-		"	stw %1, %0\n"
-		"	isync\n"
-		: "=m"(*p) : "r"(x) : "memory" );
-}
-
-#define a_spin a_barrier
-
-static inline void a_barrier()
-{
-	a_cas(&(int){0}, 0, 0);
-}
-
-static inline void a_crash()
-{
-	*(volatile char *)0=0;
-}
-
-static inline void a_and(volatile int *p, int v)
-{
-	int old;
-	do old = *p;
-	while (a_cas(p, old, old&v) != old);
-}
-
-static inline void a_or(volatile int *p, int v)
-{
-	int old;
-	do old = *p;
-	while (a_cas(p, old, old|v) != old);
-}
-
-static inline void a_or_l(volatile void *p, long v)
-{
-	a_or(p, v);
-}
-
-static inline void a_and_64(volatile uint64_t *p, uint64_t v)
-{
-	union { uint64_t v; uint32_t r[2]; } u = { v };
-	a_and((int *)p, u.r[0]);
-	a_and((int *)p+1, u.r[1]);
-}
-
-static inline void a_or_64(volatile uint64_t *p, uint64_t v)
-{
-	union { uint64_t v; uint32_t r[2]; } u = { v };
-	a_or((int *)p, u.r[0]);
-	a_or((int *)p+1, u.r[1]);
-}
-
-#endif
diff --git a/arch/powerpc/atomic_arch.h b/arch/powerpc/atomic_arch.h
new file mode 100644
index 00000000..f014e3b8
--- /dev/null
+++ b/arch/powerpc/atomic_arch.h
@@ -0,0 +1,15 @@
+#define a_cas a_cas
+static inline int a_cas(volatile int *p, int t, int s)
+{
+	__asm__("\n"
+		"	sync\n"
+		"1:	lwarx %0, 0, %4\n"
+		"	cmpw %0, %2\n"
+		"	bne 1f\n"
+		"	stwcx. %3, 0, %4\n"
+		"	bne- 1b\n"
+		"	isync\n"
+		"1:	\n"
+		: "=&r"(t), "+m"(*p) : "r"(t), "r"(s), "r"(p) : "cc", "memory" );
+        return t;
+}
diff --git a/arch/sh/atomic.h b/arch/sh/atomic_arch.h
index f2e6dacb..2ac77246 100644
--- a/arch/sh/atomic.h
+++ b/arch/sh/atomic_arch.h
@@ -1,27 +1,3 @@
-#ifndef _INTERNAL_ATOMIC_H
-#define _INTERNAL_ATOMIC_H
-
-#include <stdint.h>
-
-static inline int a_ctz_l(unsigned long x)
-{
-	static const char debruijn32[32] = {
-		0, 1, 23, 2, 29, 24, 19, 3, 30, 27, 25, 11, 20, 8, 4, 13,
-		31, 22, 28, 18, 26, 10, 7, 12, 21, 17, 9, 6, 16, 5, 15, 14
-	};
-	return debruijn32[(x&-x)*0x076be629 >> 27];
-}
-
-static inline int a_ctz_64(uint64_t x)
-{
-	uint32_t y = x;
-	if (!y) {
-		y = x>>32;
-		return 32 + a_ctz_l(y);
-	}
-	return a_ctz_l(y);
-}
-
 #define LLSC_CLOBBERS "r0", "t", "memory"
 #define LLSC_START(mem) "synco\n"  \
 	"0:	movli.l @" mem ", r0\n"
@@ -118,51 +94,3 @@ void __sh_or(volatile int *, int);
 #define a_and(x,v)       __sh_and(x, v)
 #define a_or(x,v)        __sh_or(x, v)
 #endif
-
-static inline void *a_cas_p(volatile void *p, void *t, void *s)
-{
-	return (void *)a_cas(p, (int)t, (int)s);
-}
-
-static inline void a_inc(volatile int *x)
-{
-	a_fetch_add(x, 1);
-}
-
-static inline void a_dec(volatile int *x)
-{
-	a_fetch_add(x, -1);
-}
-
-#define a_spin a_barrier
-
-static inline void a_barrier()
-{
-	a_cas(&(int){0}, 0, 0);
-}
-
-static inline void a_crash()
-{
-	*(volatile char *)0=0;
-}
-
-static inline void a_or_l(volatile void *p, long v)
-{
-	a_or(p, v);
-}
-
-static inline void a_and_64(volatile uint64_t *p, uint64_t v)
-{
-	union { uint64_t v; uint32_t r[2]; } u = { v };
-	a_and((int *)p,   u.r[0]);
-	a_and((int *)p+1, u.r[1]);
-}
-
-static inline void a_or_64(volatile uint64_t *p, uint64_t v)
-{
-	union { uint64_t v; uint32_t r[2]; } u = { v };
-	a_or((int *)p,   u.r[0]);
-	a_or((int *)p+1, u.r[1]);
-}
-
-#endif
diff --git a/arch/x32/atomic.h b/arch/x32/atomic_arch.h
index 7690183d..8d1a03e5 100644
--- a/arch/x32/atomic.h
+++ b/arch/x32/atomic_arch.h
@@ -1,45 +1,39 @@
-#ifndef _INTERNAL_ATOMIC_H
-#define _INTERNAL_ATOMIC_H
-
-#include <stdint.h>
-
+#define a_ctz_64 a_ctz_64
 static inline int a_ctz_64(uint64_t x)
 {
 	__asm__( "bsf %1,%0" : "=r"(x) : "r"(x) );
 	return x;
 }
 
+#define a_ctz_l a_ctz_l
 static inline int a_ctz_l(unsigned long x)
 {
 	__asm__( "bsf %1,%0" : "=r"(x) : "r"(x) );
 	return x;
 }
 
+#define a_and_64 a_and_64
 static inline void a_and_64(volatile uint64_t *p, uint64_t v)
 {
 	__asm__( "lock ; and %1, %0"
 			 : "=m"(*p) : "r"(v) : "memory" );
 }
 
+#define a_or_64 a_or_64
 static inline void a_or_64(volatile uint64_t *p, uint64_t v)
 {
 	__asm__( "lock ; or %1, %0"
 			 : "=m"(*p) : "r"(v) : "memory" );
 }
 
+#define a_or_l a_or_l
 static inline void a_or_l(volatile void *p, long v)
 {
 	__asm__( "lock ; or %1, %0"
 		: "=m"(*(long *)p) : "r"(v) : "memory" );
 }
 
-static inline void *a_cas_p(volatile void *p, void *t, void *s)
-{
-	__asm__( "lock ; cmpxchg %3, %1"
-		: "=a"(t), "=m"(*(long *)p) : "a"(t), "r"(s) : "memory" );
-	return t;
-}
-
+#define a_cas a_cas
 static inline int a_cas(volatile int *p, int t, int s)
 {
 	__asm__( "lock ; cmpxchg %3, %1"
@@ -47,59 +41,66 @@ static inline int a_cas(volatile int *p, int t, int s)
 	return t;
 }
 
+#define a_or a_or
 static inline void a_or(volatile int *p, int v)
 {
 	__asm__( "lock ; or %1, %0"
 		: "=m"(*p) : "r"(v) : "memory" );
 }
 
+#define a_and a_and
 static inline void a_and(volatile int *p, int v)
 {
 	__asm__( "lock ; and %1, %0"
 		: "=m"(*p) : "r"(v) : "memory" );
 }
 
+#define a_swap a_swap
 static inline int a_swap(volatile int *x, int v)
 {
 	__asm__( "xchg %0, %1" : "=r"(v), "=m"(*x) : "0"(v) : "memory" );
 	return v;
 }
 
+#define a_fetch_add a_fetch_add
 static inline int a_fetch_add(volatile int *x, int v)
 {
 	__asm__( "lock ; xadd %0, %1" : "=r"(v), "=m"(*x) : "0"(v) : "memory" );
 	return v;
 }
 
+#define a_inc a_inc
 static inline void a_inc(volatile int *x)
 {
 	__asm__( "lock ; incl %0" : "=m"(*x) : "m"(*x) : "memory" );
 }
 
+#define a_dec a_dec
 static inline void a_dec(volatile int *x)
 {
 	__asm__( "lock ; decl %0" : "=m"(*x) : "m"(*x) : "memory" );
 }
 
+#define a_store a_store
 static inline void a_store(volatile int *p, int x)
 {
 	__asm__( "mov %1, %0 ; lock ; orl $0,(%%rsp)" : "=m"(*p) : "r"(x) : "memory" );
 }
 
+#define a_spin a_spin
 static inline void a_spin()
 {
 	__asm__ __volatile__( "pause" : : : "memory" );
 }
 
+#define a_barrier a_barrier
 static inline void a_barrier()
 {
 	__asm__ __volatile__( "" : : : "memory" );
 }
 
+#define a_crash a_crash
 static inline void a_crash()
 {
 	__asm__ __volatile__( "hlt" : : : "memory" );
 }
-
-
-#endif
diff --git a/arch/x86_64/atomic.h b/arch/x86_64/atomic_arch.h
index 7690183d..92bdac52 100644
--- a/arch/x86_64/atomic.h
+++ b/arch/x86_64/atomic_arch.h
@@ -1,38 +1,32 @@
-#ifndef _INTERNAL_ATOMIC_H
-#define _INTERNAL_ATOMIC_H
-
-#include <stdint.h>
-
+#define a_ctz_64 a_ctz_64 /* arch-provided a_ctz_64 (single bsf on the 64-bit argument); suppresses the table-lookup fallback in the shared atomic.h */
 static inline int a_ctz_64(uint64_t x)
 {
 	__asm__( "bsf %1,%0" : "=r"(x) : "r"(x) );
 	return x;
 }
 
-static inline int a_ctz_l(unsigned long x)
-{
-	__asm__( "bsf %1,%0" : "=r"(x) : "r"(x) );
-	return x;
-}
-
+#define a_and_64 a_and_64 /* arch-provided a_and_64 (one lock-prefixed AND on the whole uint64_t); suppresses the two-halves fallback in the shared atomic.h */
 static inline void a_and_64(volatile uint64_t *p, uint64_t v)
 {
 	__asm__( "lock ; and %1, %0"
 			 : "=m"(*p) : "r"(v) : "memory" );
 }
 
+#define a_or_64 a_or_64 /* arch-provided a_or_64 (one lock-prefixed OR on the whole uint64_t); suppresses the two-halves fallback in the shared atomic.h */
 static inline void a_or_64(volatile uint64_t *p, uint64_t v)
 {
 	__asm__( "lock ; or %1, %0"
 			 : "=m"(*p) : "r"(v) : "memory" );
 }
 
+#define a_or_l a_or_l /* arch-provided a_or_l (lock-prefixed OR of v into the long at p); suppresses the sizeof(long) dispatch fallback in the shared atomic.h */
 static inline void a_or_l(volatile void *p, long v)
 {
 	__asm__( "lock ; or %1, %0"
 		: "=m"(*(long *)p) : "r"(v) : "memory" );
 }
 
+#define a_cas_p a_cas_p
 static inline void *a_cas_p(volatile void *p, void *t, void *s)
 {
 	__asm__( "lock ; cmpxchg %3, %1"
@@ -40,6 +34,7 @@ static inline void *a_cas_p(volatile void *p, void *t, void *s)
 	return t;
 }
 
+#define a_cas a_cas
 static inline int a_cas(volatile int *p, int t, int s)
 {
 	__asm__( "lock ; cmpxchg %3, %1"
@@ -47,59 +42,66 @@ static inline int a_cas(volatile int *p, int t, int s)
 	return t;
 }
 
+#define a_or a_or /* arch-provided a_or (lock-prefixed OR of v into *p); the shared atomic.h builds its fallback only under #ifndef a_or */
 static inline void a_or(volatile int *p, int v)
 {
 	__asm__( "lock ; or %1, %0"
 		: "=m"(*p) : "r"(v) : "memory" );
 }
 
+#define a_and a_and /* arch-provided a_and (lock-prefixed AND of v into *p); the shared atomic.h builds its fallback only under #ifndef a_and */
 static inline void a_and(volatile int *p, int v)
 {
 	__asm__( "lock ; and %1, %0"
 		: "=m"(*p) : "r"(v) : "memory" );
 }
 
+#define a_swap a_swap /* arch-provided a_swap (xchg of v with *x, returning what xchg left in v); suppresses the shared atomic.h fallback */
 static inline int a_swap(volatile int *x, int v)
 {
 	__asm__( "xchg %0, %1" : "=r"(v), "=m"(*x) : "0"(v) : "memory" );
 	return v;
 }
 
+#define a_fetch_add a_fetch_add /* arch-provided a_fetch_add (lock xadd of v into *x, returning what xadd left in v); suppresses the shared atomic.h fallback */
 static inline int a_fetch_add(volatile int *x, int v)
 {
 	__asm__( "lock ; xadd %0, %1" : "=r"(v), "=m"(*x) : "0"(v) : "memory" );
 	return v;
 }
 
+#define a_inc a_inc /* arch-provided a_inc (lock incl on *x); the shared atomic.h builds its fallback only under #ifndef a_inc */
 static inline void a_inc(volatile int *x)
 {
 	__asm__( "lock ; incl %0" : "=m"(*x) : "m"(*x) : "memory" );
 }
 
+#define a_dec a_dec /* arch-provided a_dec (lock decl on *x); the shared atomic.h builds its fallback only under #ifndef a_dec */
 static inline void a_dec(volatile int *x)
 {
 	__asm__( "lock ; decl %0" : "=m"(*x) : "m"(*x) : "memory" );
 }
 
+#define a_store a_store /* arch-provided a_store (mov of x into *p, then a locked "orl $0" on (%rsp) -- presumably acting as a fence); suppresses the shared atomic.h fallback */
 static inline void a_store(volatile int *p, int x)
 {
 	__asm__( "mov %1, %0 ; lock ; orl $0,(%%rsp)" : "=m"(*p) : "r"(x) : "memory" );
 }
 
+#define a_spin a_spin /* arch-provided a_spin (issues the pause instruction); without this the shared atomic.h would alias a_spin to a_barrier */
 static inline void a_spin()
 {
 	__asm__ __volatile__( "pause" : : : "memory" );
 }
 
+#define a_barrier a_barrier /* arch-provided a_barrier (empty asm with a "memory" clobber, so no instruction is emitted); suppresses the CAS-based fallback in the shared atomic.h */
 static inline void a_barrier()
 {
 	__asm__ __volatile__( "" : : : "memory" );
 }
 
+#define a_crash a_crash /* arch-provided a_crash (executes hlt -- presumably to fault in user mode); suppresses the null-store fallback in the shared atomic.h */
 static inline void a_crash()
 {
 	__asm__ __volatile__( "hlt" : : : "memory" );
 }
-
-
-#endif
diff --git a/src/internal/atomic.h b/src/internal/atomic.h
new file mode 100644
index 00000000..2097247e
--- /dev/null
+++ b/src/internal/atomic.h
@@ -0,0 +1,275 @@
+#ifndef _ATOMIC_H
+#define _ATOMIC_H
+
+#include <stdint.h>
+
+#include "atomic_arch.h"
+
+#ifdef a_ll
+
+#ifndef a_pre_llsc /* optional arch hook expanded before each ll/sc retry loop in this header; defaults to nothing */
+#define a_pre_llsc()
+#endif
+
+#ifndef a_post_llsc /* optional arch hook expanded after each ll/sc retry loop in this header; defaults to nothing */
+#define a_post_llsc()
+#endif
+
+#ifndef a_cas /* compare-and-swap built from a_ll/a_sc, for archs that define a_ll but no a_cas of their own */
+#define a_cas a_cas
+static inline int a_cas(volatile int *p, int t, int s)
+{
+	int old;
+	a_pre_llsc();
+	do old = a_ll(p);
+	while (old==t && !a_sc(p, s)); /* leave the loop on a mismatch (a_sc not called) or once a_sc returns nonzero */
+	a_post_llsc();
+	return old; /* last value read by a_ll; equals t exactly when the final a_sc returned nonzero */
+}
+#endif
+
+#ifndef a_swap /* exchange built from a_ll/a_sc, used only when the arch did not define a_swap */
+#define a_swap a_swap
+static inline int a_swap(volatile int *p, int v)
+{
+	int old;
+	a_pre_llsc();
+	do old = a_ll(p);
+	while (!a_sc(p, v)); /* retry the load/store pair until a_sc returns nonzero */
+	a_post_llsc();
+	return old; /* value a_ll read on the final pass */
+}
+#endif
+
+#ifndef a_fetch_add /* fetch-and-add built from a_ll/a_sc, used only when the arch did not define a_fetch_add */
+#define a_fetch_add a_fetch_add
+static inline int a_fetch_add(volatile int *p, int v)
+{
+	int old;
+	a_pre_llsc();
+	do old = a_ll(p);
+	while (!a_sc(p, (unsigned)old + v)); /* sum is computed in unsigned arithmetic, so it wraps rather than overflowing a signed int */
+	a_post_llsc();
+	return old; /* value read before the addition was stored */
+}
+#endif
+
+#ifndef a_fetch_and /* fetch-and-AND built from a_ll/a_sc, used only when the arch did not define a_fetch_and */
+#define a_fetch_and a_fetch_and
+static inline int a_fetch_and(volatile int *p, int v)
+{
+	int old;
+	a_pre_llsc();
+	do old = a_ll(p);
+	while (!a_sc(p, old & v)); /* retry until a_sc returns nonzero for the masked value */
+	a_post_llsc();
+	return old; /* value read before the mask was applied */
+}
+#endif
+
+#ifndef a_fetch_or /* fetch-and-OR built from a_ll/a_sc, used only when the arch did not define a_fetch_or */
+#define a_fetch_or a_fetch_or
+static inline int a_fetch_or(volatile int *p, int v)
+{
+	int old;
+	a_pre_llsc();
+	do old = a_ll(p);
+	while (!a_sc(p, old | v)); /* retry until a_sc returns nonzero for the OR-ed value */
+	a_post_llsc();
+	return old; /* value read before the bits were set */
+}
+#endif
+
+#endif
+
+#ifndef a_cas /* the remaining fallbacks in this header are written in terms of a_cas, so stop the build if neither the arch nor the ll/sc section defined it */
+#error missing definition of a_cas
+#endif
+
+#ifndef a_swap /* exchange built as an a_cas retry loop, used when nothing earlier defined a_swap */
+#define a_swap a_swap
+static inline int a_swap(volatile int *p, int v)
+{
+	int old;
+	do old = *p;
+	while (a_cas(p, old, v) != old); /* a_cas returning something other than old means *p changed in between: reread and retry */
+	return old; /* value that was replaced by v */
+}
+#endif
+
+#ifndef a_fetch_add /* fetch-and-add built as an a_cas retry loop, used when nothing earlier defined a_fetch_add */
+#define a_fetch_add a_fetch_add
+static inline int a_fetch_add(volatile int *p, int v)
+{
+	int old;
+	do old = *p;
+	while (a_cas(p, old, (unsigned)old+v) != old); /* unsigned addition wraps instead of overflowing; retry if *p changed since it was read */
+	return old; /* value before the addition */
+}
+#endif
+
+#ifndef a_fetch_and /* fetch-and-AND built as an a_cas retry loop, used when nothing earlier defined a_fetch_and */
+#define a_fetch_and a_fetch_and
+static inline int a_fetch_and(volatile int *p, int v)
+{
+	int old;
+	do old = *p;
+	while (a_cas(p, old, old&v) != old); /* retry if *p changed since it was read */
+	return old; /* value before the mask was applied */
+}
+#endif
+#ifndef a_fetch_or /* fetch-and-OR built as an a_cas retry loop, used when nothing earlier defined a_fetch_or */
+#define a_fetch_or a_fetch_or
+static inline int a_fetch_or(volatile int *p, int v)
+{
+	int old;
+	do old = *p;
+	while (a_cas(p, old, old|v) != old); /* retry if *p changed since it was read */
+	return old; /* value before the bits were set */
+}
+#endif
+
+#ifndef a_and /* default a_and: a_fetch_and with the fetched value discarded */
+#define a_and a_and
+static inline void a_and(volatile int *p, int v)
+{
+	a_fetch_and(p, v);
+}
+#endif
+
+#ifndef a_or /* default a_or: a_fetch_or with the fetched value discarded */
+#define a_or a_or
+static inline void a_or(volatile int *p, int v)
+{
+	a_fetch_or(p, v);
+}
+#endif
+
+#ifndef a_inc /* default a_inc: a_fetch_add of 1 with the fetched value discarded */
+#define a_inc a_inc
+static inline void a_inc(volatile int *p)
+{
+	a_fetch_add(p, 1);
+}
+#endif
+
+#ifndef a_dec /* default a_dec: a_fetch_add of -1 with the fetched value discarded */
+#define a_dec a_dec
+static inline void a_dec(volatile int *p)
+{
+	a_fetch_add(p, -1);
+}
+#endif
+
+#ifndef a_store /* default a_store, used when the arch did not define one */
+#define a_store a_store
+static inline void a_store(volatile int *p, int v)
+{
+#ifdef a_barrier /* this header's own a_barrier fallback is defined further down, so here the test is true only for an arch-provided barrier */
+	a_barrier();
+	*p = v; /* plain store bracketed by a barrier on each side */
+	a_barrier();
+#else
+	a_swap(p, v); /* no arch barrier: store via the atomic exchange and ignore the old value */
+#endif
+}
+#endif
+
+#ifndef a_barrier /* fallback a_barrier for archs that define none: performs an a_cas on a dummy local */
+#define a_barrier a_barrier
+static inline void a_barrier()
+{
+	volatile int tmp = 0;
+	a_cas(&tmp, 0, 0); /* result ignored; presumably issued only for whatever ordering a_cas provides */
+}
+#endif
+
+#ifndef a_spin /* no arch-specific spin hint: a_spin() then expands to a_barrier() */
+#define a_spin a_barrier
+#endif
+
+#ifndef a_and_64 /* default 64-bit AND, carried out as up to two separate 32-bit a_and calls (so not one atomic 64-bit step) */
+#define a_and_64 a_and_64
+static inline void a_and_64(volatile uint64_t *p, uint64_t v)
+{
+	union { uint64_t v; uint32_t r[2]; } u = { v }; /* view v as two 32-bit words laid out the same way as *p */
+	if (u.r[0]+1) a_and((int *)p, u.r[0]); /* word+1 is 0 only for an all-ones word, which would leave the target unchanged, so it is skipped */
+	if (u.r[1]+1) a_and((int *)p+1, u.r[1]);
+}
+#endif
+
+#ifndef a_or_64 /* default 64-bit OR, carried out as up to two separate 32-bit a_or calls (so not one atomic 64-bit step) */
+#define a_or_64 a_or_64
+static inline void a_or_64(volatile uint64_t *p, uint64_t v)
+{
+	union { uint64_t v; uint32_t r[2]; } u = { v }; /* view v as two 32-bit words laid out the same way as *p */
+	if (u.r[0]) a_or((int *)p, u.r[0]); /* a zero word would set no bits, so it is skipped */
+	if (u.r[1]) a_or((int *)p+1, u.r[1]);
+}
+#endif
+
+#ifndef a_cas_p /* default pointer CAS: forwards to the int-sized a_cas */
+#define a_cas_p a_cas_p
+static inline void *a_cas_p(volatile void *p, void *t, void *s)
+{
+	return (void *)a_cas((volatile int *)p, (int)t, (int)s); /* NOTE(review): pointers pass through int, so this is lossless only where a pointer fits in an int; x86_64 in this same commit defines its own a_cas_p */
+}
+#endif
+
+#ifndef a_or_l /* default OR on a long-sized object, routed by the size of long */
+#define a_or_l a_or_l
+static inline void a_or_l(volatile void *p, long v)
+{
+	if (sizeof(long) == sizeof(int)) a_or(p, v); /* long and int are the same size: the int primitive suffices */
+	else a_or_64(p, v); /* otherwise treat the object as 64 bits wide */
+}
+#endif
+
+#ifndef a_crash /* default a_crash for archs that define none */
+#define a_crash a_crash
+static inline void a_crash()
+{
+	*(volatile char *)0=0; /* volatile write through a null pointer -- presumably meant to raise a fault */
+}
+#endif
+
+#ifndef a_ctz_64 /* default count-trailing-zeros for a 64-bit value, done with de Bruijn lookup tables */
+#define a_ctz_64 a_ctz_64
+static inline int a_ctz_64(uint64_t x)
+{
+	static const char debruijn64[64] = {
+		0, 1, 2, 53, 3, 7, 54, 27, 4, 38, 41, 8, 34, 55, 48, 28,
+		62, 5, 39, 46, 44, 42, 22, 9, 24, 35, 59, 56, 49, 18, 29, 11,
+		63, 52, 6, 26, 37, 40, 33, 47, 61, 45, 43, 21, 23, 58, 17, 10,
+		51, 25, 36, 32, 60, 20, 57, 16, 50, 31, 19, 15, 30, 14, 13, 12
+	};
+	static const char debruijn32[32] = {
+		0, 1, 23, 2, 29, 24, 19, 3, 30, 27, 25, 11, 20, 8, 4, 13,
+		31, 22, 28, 18, 26, 10, 7, 12, 21, 17, 9, 6, 16, 5, 15, 14
+	};
+	if (sizeof(long) < 8) { /* long narrower than 64 bits: handle the value as two 32-bit halves */
+		uint32_t y = x; /* low 32 bits of x */
+		if (!y) {
+			y = x>>32; /* low half has no set bit: examine the high half and add 32 to the table result */
+			return 32 + debruijn32[(y&-y)*0x076be629 >> 27];
+		}
+		return debruijn32[(y&-y)*0x076be629 >> 27]; /* y&-y keeps only the lowest set bit; the top 5 bits of the product index the table */
+	}
+	return debruijn64[(x&-x)*0x022fdd63cc95386dull >> 58]; /* same scheme at 64 bits: the top 6 bits of the product index the table */
+}
+#endif
+
+#ifndef a_ctz_l /* default count-trailing-zeros for an unsigned long */
+#define a_ctz_l a_ctz_l
+static inline int a_ctz_l(unsigned long x)
+{
+	static const char debruijn32[32] = {
+		0, 1, 23, 2, 29, 24, 19, 3, 30, 27, 25, 11, 20, 8, 4, 13,
+		31, 22, 28, 18, 26, 10, 7, 12, 21, 17, 9, 6, 16, 5, 15, 14
+	};
+	if (sizeof(long) == 8) return a_ctz_64(x); /* 8-byte long: reuse the 64-bit routine */
+	return debruijn32[(x&-x)*0x076be629 >> 27]; /* otherwise: isolate the lowest set bit and index the 32-entry table with the top 5 bits of the product */
+}
+#endif
+
+#endif