BKL: revert back to the old spinlock implementation

The generic semaphore rewrite had a huge performance regression on AIM7
(and potentially other BKL-heavy benchmarks) because the generic
semaphores had been rewritten to be simple to understand and fair. The
latter, in particular, turns a semaphore-based BKL implementation into a
mess of scheduling.

The attempt to fix the performance regression failed miserably (see the
previous commit 00b41ec2611dc98f87f30753ee00a53db648d662 'Revert
"semaphore: fix"'), and so for now the simple and sane approach is to
instead just go back to the old spinlock-based BKL implementation that
never had any issues like this.

This patch also has the advantage of being reported to fix the
regression completely according to Yanmin Zhang, unlike the semaphore
hack which still left a couple percentage point regression.

As a spinlock, the BKL obviously has the potential to be a latency
issue, but it's not really any different from any other spinlock in that
respect. We do want to get rid of the BKL asap, but that has been the
plan for several years.

These days, the biggest users are in the tty layer (open/release in
particular) and Alan holds out some hope:

  "tty release is probably a few months away from getting cured - I'm
   afraid it will almost certainly be the very last user of the BKL in
   tty to get fixed as it depends on everything else being sanely locked."

so while we're not there yet, we do have a plan of action.

Tested-by: Yanmin Zhang <yanmin_zhang@linux.intel.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Matthew Wilcox <matthew@wil.cx>
Cc: Alexander Viro <viro@ftp.linux.org.uk>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
author: Linus Torvalds <torvalds@linux-foundation.org> 2008-05-10 23:58:02 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2008-05-10 23:58:02 -0400
commit: 8e3e076c5a78519a9f64cd384e8f18bc21882ce0 (patch)
tree: f032258fde3aa4771e86bf4552fe4530c221dec3 /lib
parent: 00b41ec2611dc98f87f30753ee00a53db648d662 (diff)
1 files changed, 81 insertions, 39 deletions
diff --git a/lib/kernel_lock.c b/lib/kernel_lock.c
index cd3e82530b03..01a3c22c1b5a 100644
--- a/lib/kernel_lock.c
+++ b/lib/kernel_lock.c
@@ -11,79 +11,121 @@
 #include <linux/semaphore.h>
 /*
- * The 'big kernel semaphore'
+ * The 'big kernel lock'
 *
- * This mutex is taken and released recursively by lock_kernel()
+ * This spinlock is taken and released recursively by lock_kernel()
 * and unlock_kernel().  It is transparently dropped and reacquired
 * over schedule().  It is used to protect legacy code that hasn't
 * been migrated to a proper locking design yet.
 *
- * Note: code locked by this semaphore will only be serialized against
- * other code using the same locking facility. The code guarantees that
- * the task remains on the same CPU.
- *
 * Don't use in new code.
 */
-static DECLARE_MUTEX(kernel_sem);
+static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(kernel_flag);
 /*
- * Re-acquire the kernel semaphore.
+ * Acquire/release the underlying lock from the scheduler.
 *
- * This function is called with preemption off.
+ * This is called with preemption disabled, and should
+ * return an error value if it cannot get the lock and
+ * TIF_NEED_RESCHED gets set.
 *
- * We are executing in schedule() so the code must be extremely careful
+ * If it successfully gets the lock, it should increment
- * about recursion, both due to the down() and due to the enabling of
+ * the preemption count like any spinlock does.
- * preemption. schedule() will re-check the preemption flag after
+ *
- * reacquiring the semaphore.
+ * (This works on UP too - _raw_spin_trylock will never
+ * return false in that case)
 */
 int __lockfunc __reacquire_kernel_lock(void)
 {
-        struct task_struct *task = current;
+        while (!_raw_spin_trylock(&kernel_flag)) {
-        int saved_lock_depth = task->lock_depth;
+                if (test_thread_flag(TIF_NEED_RESCHED))
+                        return -EAGAIN;
-        BUG_ON(saved_lock_depth < 0);
+                cpu_relax();
+        }
-        task->lock_depth = -1;
-        preempt_enable_no_resched();
-        down(&kernel_sem);
        preempt_disable();
-        task->lock_depth = saved_lock_depth;
        return 0;
 }
 void __lockfunc __release_kernel_lock(void)
 {
-        up(&kernel_sem);
+        _raw_spin_unlock(&kernel_flag);
+        preempt_enable_no_resched();
 }
 /*
- * Getting the big kernel semaphore.
+ * These are the BKL spinlocks - we try to be polite about preemption.
+ * If SMP is not on (ie UP preemption), this all goes away because the
+ * _raw_spin_trylock() will always succeed.
 */
-void __lockfunc lock_kernel(void)
+#ifdef CONFIG_PREEMPT
+static inline void __lock_kernel(void)
 {
-        struct task_struct *task = current;
+        preempt_disable();
-        int depth = task->lock_depth + 1;
+        if (unlikely(!_raw_spin_trylock(&kernel_flag))) {
+                /*
+                 * If preemption was disabled even before this
+                 * was called, there's nothing we can be polite
+                 * about - just spin.
+                 */
+                if (preempt_count() > 1) {
+                        _raw_spin_lock(&kernel_flag);
+                        return;
+                }
-        if (likely(!depth))
                /*
-                 * No recursion worries - we set up lock_depth _after_
+                 * Otherwise, let's wait for the kernel lock
+                 * with preemption enabled..
                 */
-                down(&kernel_sem);
+                do {
+                        preempt_enable();
+                        while (spin_is_locked(&kernel_flag))
+                                cpu_relax();
+                        preempt_disable();
+                } while (!_raw_spin_trylock(&kernel_flag));
+        }
+}
-        task->lock_depth = depth;
+#else
+/*
+ * Non-preemption case - just get the spinlock
+ */
+static inline void __lock_kernel(void)
+{
+        _raw_spin_lock(&kernel_flag);
 }
+#endif
-void __lockfunc unlock_kernel(void)
+static inline void __unlock_kernel(void)
 {
-        struct task_struct *task = current;
+        /*
+         * the BKL is not covered by lockdep, so we open-code the
+         * unlocking sequence (and thus avoid the dep-chain ops):
+         */
+        _raw_spin_unlock(&kernel_flag);
+        preempt_enable();
+}
-        BUG_ON(task->lock_depth < 0);
+/*
+ * Getting the big kernel lock.
+ *
+ * This cannot happen asynchronously, so we only need to
+ * worry about other CPU's.
+ */
+void __lockfunc lock_kernel(void)
+{
+        int depth = current->lock_depth+1;
+        if (likely(!depth))
+                __lock_kernel();
+        current->lock_depth = depth;
+}
-        if (likely(--task->lock_depth < 0))
+void __lockfunc unlock_kernel(void)
-                up(&kernel_sem);
+{
+        BUG_ON(current->lock_depth < 0);
+        if (likely(--current->lock_depth < 0))
+                __unlock_kernel();
 }
 EXPORT_SYMBOL(lock_kernel);
author	Linus Torvalds <torvalds@linux-foundation.org>	2008-05-10 23:58:02 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2008-05-10 23:58:02 -0400
commit	8e3e076c5a78519a9f64cd384e8f18bc21882ce0 (patch)
tree	f032258fde3aa4771e86bf4552fe4530c221dec3 /lib
parent	00b41ec2611dc98f87f30753ee00a53db648d662 (diff)

diff --git a/lib/kernel_lock.c b/lib/kernel_lock.c
index cd3e82530b03..01a3c22c1b5a 100644
--- a/lib/kernel_lock.c
+++ b/lib/kernel_lock.c
@@ -11,79 +11,121 @@
11	#include <linux/semaphore.h>	11	#include <linux/semaphore.h>
12		12
13	/*	13	/*
14	* The 'big kernel semaphore'	14	* The 'big kernel lock'
15	*	15	*
16	* This mutex is taken and released recursively by lock_kernel()	16	* This spinlock is taken and released recursively by lock_kernel()
17	* and unlock_kernel(). It is transparently dropped and reacquired	17	* and unlock_kernel(). It is transparently dropped and reacquired
18	* over schedule(). It is used to protect legacy code that hasn't	18	* over schedule(). It is used to protect legacy code that hasn't
19	* been migrated to a proper locking design yet.	19	* been migrated to a proper locking design yet.
20	*	20	*
21	* Note: code locked by this semaphore will only be serialized against
22	* other code using the same locking facility. The code guarantees that
23	* the task remains on the same CPU.
24	*
25	* Don't use in new code.	21	* Don't use in new code.
26	*/	22	*/
27	static DECLARE_MUTEX(kernel_sem);	23	static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kernel_flag);
		24
28		25
29	/*	26	/*
30	* Re-acquire the kernel semaphore.	27	* Acquire/release the underlying lock from the scheduler.
31	*	28	*
32	* This function is called with preemption off.	29	* This is called with preemption disabled, and should
		30	* return an error value if it cannot get the lock and
		31	* TIF_NEED_RESCHED gets set.
33	*	32	*
34	* We are executing in schedule() so the code must be extremely careful	33	* If it successfully gets the lock, it should increment
35	* about recursion, both due to the down() and due to the enabling of	34	* the preemption count like any spinlock does.
36	* preemption. schedule() will re-check the preemption flag after	35	*
37	* reacquiring the semaphore.	36	* (This works on UP too - _raw_spin_trylock will never
		37	* return false in that case)
38	*/	38	*/
39	int __lockfunc __reacquire_kernel_lock(void)	39	int __lockfunc __reacquire_kernel_lock(void)
40	{	40	{
41	struct task_struct *task = current;	41	while (!_raw_spin_trylock(&kernel_flag)) {
42	int saved_lock_depth = task->lock_depth;	42	if (test_thread_flag(TIF_NEED_RESCHED))
43		43	return -EAGAIN;
44	BUG_ON(saved_lock_depth < 0);	44	cpu_relax();
45		45	}
46	task->lock_depth = -1;
47	preempt_enable_no_resched();
48
49	down(&kernel_sem);
50
51	preempt_disable();	46	preempt_disable();
52	task->lock_depth = saved_lock_depth;
53
54	return 0;	47	return 0;
55	}	48	}
56		49
57	void __lockfunc __release_kernel_lock(void)	50	void __lockfunc __release_kernel_lock(void)
58	{	51	{
59	up(&kernel_sem);	52	_raw_spin_unlock(&kernel_flag);
		53	preempt_enable_no_resched();
60	}	54	}
61		55
62	/*	56	/*
63	* Getting the big kernel semaphore.	57	* These are the BKL spinlocks - we try to be polite about preemption.
		58	* If SMP is not on (ie UP preemption), this all goes away because the
		59	* _raw_spin_trylock() will always succeed.
64	*/	60	*/
65	void __lockfunc lock_kernel(void)	61	#ifdef CONFIG_PREEMPT
		62	static inline void __lock_kernel(void)
66	{	63	{
67	struct task_struct *task = current;	64	preempt_disable();
68	int depth = task->lock_depth + 1;	65	if (unlikely(!_raw_spin_trylock(&kernel_flag))) {
		66	/*
		67	* If preemption was disabled even before this
		68	* was called, there's nothing we can be polite
		69	* about - just spin.
		70	*/
		71	if (preempt_count() > 1) {
		72	_raw_spin_lock(&kernel_flag);
		73	return;
		74	}
69		75
70	if (likely(!depth))
71	/*	76	/*
72	* No recursion worries - we set up lock_depth _after_	77	* Otherwise, let's wait for the kernel lock
		78	* with preemption enabled..
73	*/	79	*/
74	down(&kernel_sem);	80	do {
		81	preempt_enable();
		82	while (spin_is_locked(&kernel_flag))
		83	cpu_relax();
		84	preempt_disable();
		85	} while (!_raw_spin_trylock(&kernel_flag));
		86	}
		87	}
75		88
76	task->lock_depth = depth;	89	#else
		90
		91	/*
		92	* Non-preemption case - just get the spinlock
		93	*/
		94	static inline void __lock_kernel(void)
		95	{
		96	_raw_spin_lock(&kernel_flag);
77	}	97	}
		98	#endif
78		99
79	void __lockfunc unlock_kernel(void)	100	static inline void __unlock_kernel(void)
80	{	101	{
81	struct task_struct *task = current;	102	/*
		103	* the BKL is not covered by lockdep, so we open-code the
		104	* unlocking sequence (and thus avoid the dep-chain ops):
		105	*/
		106	_raw_spin_unlock(&kernel_flag);
		107	preempt_enable();
		108	}
82		109
83	BUG_ON(task->lock_depth < 0);	110	/*
		111	* Getting the big kernel lock.
		112	*
		113	* This cannot happen asynchronously, so we only need to
		114	* worry about other CPU's.
		115	*/
		116	void __lockfunc lock_kernel(void)
		117	{
		118	int depth = current->lock_depth+1;
		119	if (likely(!depth))
		120	__lock_kernel();
		121	current->lock_depth = depth;
		122	}
84		123
85	if (likely(--task->lock_depth < 0))	124	void __lockfunc unlock_kernel(void)
86	up(&kernel_sem);	125	{
		126	BUG_ON(current->lock_depth < 0);
		127	if (likely(--current->lock_depth < 0))
		128	__unlock_kernel();
87	}	129	}
88		130
89	EXPORT_SYMBOL(lock_kernel);	131	EXPORT_SYMBOL(lock_kernel);