author		Will Deacon <will.deacon@arm.com>		2014-02-21 11:01:48 -0500
committer	Russell King <rmk+kernel@arm.linux.org.uk>	2014-02-25 06:35:08 -0500
commit		db38ee874c48713d0723221d08332242e0088970
tree		065fee530d0374306564ea37606617f007af6c64
parent		d98b90ea22b0a28d9d787769704a9cf1ea5a513a
ARM: 7983/1: atomics: implement a better __atomic_add_unless for v6+
Looking at perf profiles of multi-threaded hackbench runs, a significant
performance hit appears to come from the cmpxchg loop used to implement
the 32-bit atomic_add_unless() function. This can be mitigated by writing
a direct implementation of __atomic_add_unless() which doesn't require
iteration outside of the ll/sc loop itself.
Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
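For context, the implementation being replaced on ARMv6+ (and still kept further down in the header for pre-v6 parts) is a compare-and-swap retry loop. The following is a sketch of that generic pattern, assuming only the atomic_read() and atomic_cmpxchg() helpers already provided by this header; it is illustrative rather than a verbatim copy of the old source:

	/*
	 * cmpxchg-based retry loop this patch avoids on v6+. Every failed
	 * atomic_cmpxchg() bounces back out to the C loop, which re-reads
	 * the counter and retries the whole compare-and-swap.
	 */
	static inline int __atomic_add_unless(atomic_t *v, int a, int u)
	{
		int c, old;

		c = atomic_read(v);
		while (c != u && (old = atomic_cmpxchg(v, c, c + a)) != c)
			c = old;

		return c;	/* old value; callers compare it against u */
	}

The ldrex/strex version added below keeps the retry inside the load-exclusive/store-exclusive sequence itself, so a contended update no longer pays for a separate read-modify-cmpxchg round trip on every attempt.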
-rw-r--r--	arch/arm/include/asm/atomic.h	35
1 file changed, 31 insertions(+), 4 deletions(-)
diff --git a/arch/arm/include/asm/atomic.h b/arch/arm/include/asm/atomic.h
index 6e410090896e..9a92fd7864a8 100644
--- a/arch/arm/include/asm/atomic.h
+++ b/arch/arm/include/asm/atomic.h
@@ -141,6 +141,33 @@ static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new)
 	return oldval;
 }
 
+static inline int __atomic_add_unless(atomic_t *v, int a, int u)
+{
+	int oldval, newval;
+	unsigned long tmp;
+
+	smp_mb();
+	prefetchw(&v->counter);
+
+	__asm__ __volatile__ ("@ atomic_add_unless\n"
+"1:	ldrex	%0, [%4]\n"
+"	teq	%0, %5\n"
+"	beq	2f\n"
+"	add	%1, %0, %6\n"
+"	strex	%2, %1, [%4]\n"
+"	teq	%2, #0\n"
+"	bne	1b\n"
+"2:"
+	: "=&r" (oldval), "=&r" (newval), "=&r" (tmp), "+Qo" (v->counter)
+	: "r" (&v->counter), "r" (u), "r" (a)
+	: "cc");
+
+	if (oldval != u)
+		smp_mb();
+
+	return oldval;
+}
+
 #else /* ARM_ARCH_6 */
 
 #ifdef CONFIG_SMP
@@ -189,10 +216,6 @@ static inline int atomic_cmpxchg(atomic_t *v, int old, int new)
 	return ret;
 }
 
-#endif /* __LINUX_ARM_ARCH__ */
-
-#define atomic_xchg(v, new)	(xchg(&((v)->counter), new))
-
 static inline int __atomic_add_unless(atomic_t *v, int a, int u)
 {
 	int c, old;
@@ -203,6 +226,10 @@ static inline int __atomic_add_unless(atomic_t *v, int a, int u)
 	return c;
 }
 
+#endif /* __LINUX_ARM_ARCH__ */
+
+#define atomic_xchg(v, new)	(xchg(&((v)->counter), new))
+
 #define atomic_inc(v)		atomic_add(1, v)
 #define atomic_dec(v)		atomic_sub(1, v)
 
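As a usage note rather than part of this patch: callers normally reach __atomic_add_unless() through the generic wrappers in include/linux/atomic.h, roughly as sketched below; the wrapper shapes are from the headers of that era, and the refcount caller is purely hypothetical:

	/* __atomic_add_unless() returns the old value; the wrapper turns
	 * that into a "did we actually add?" answer. */
	static inline int atomic_add_unless(atomic_t *v, int a, int u)
	{
		return __atomic_add_unless(v, a, u) != u;
	}
	#define atomic_inc_not_zero(v)	atomic_add_unless((v), 1, 0)

	/* Hypothetical caller: only take a reference if the object is alive. */
	if (!atomic_inc_not_zero(&obj->refcount))
		return NULL;

Note also, in the hunk above, that the trailing smp_mb() is only issued when oldval != u, i.e. when a store actually took place; the early-exit path branches past the strex and skips the second barrier.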