author		Linus Torvalds <torvalds@linux-foundation.org>	2008-05-10 23:58:02 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2008-05-10 23:58:02 -0400
commit		8e3e076c5a78519a9f64cd384e8f18bc21882ce0 (patch)
tree		f032258fde3aa4771e86bf4552fe4530c221dec3
parent		00b41ec2611dc98f87f30753ee00a53db648d662 (diff)
BKL: revert back to the old spinlock implementation
The generic semaphore rewrite had a huge performance regression on AIM7
(and potentially other BKL-heavy benchmarks) because the generic
semaphores had been rewritten to be simple to understand and fair.  The
latter, in particular, turns a semaphore-based BKL implementation into a
mess of scheduling.

The attempt to fix the performance regression failed miserably (see the
previous commit 00b41ec2611dc98f87f30753ee00a53db648d662 'Revert
"semaphore: fix"'), and so for now the simple and sane approach is to
instead just go back to the old spinlock-based BKL implementation that
never had any issues like this.

This patch also has the advantage of being reported to fix the regression
completely according to Yanmin Zhang, unlike the semaphore hack which
still left a couple percentage point regression.

As a spinlock, the BKL obviously has the potential to be a latency issue,
but it's not really any different from any other spinlock in that respect.

We do want to get rid of the BKL asap, but that has been the plan for
several years.

These days, the biggest users are in the tty layer (open/release in
particular) and Alan holds out some hope:

  "tty release is probably a few months away from getting cured - I'm
   afraid it will almost certainly be the very last user of the BKL in
   tty to get fixed as it depends on everything else being sanely locked."

so while we're not there yet, we do have a plan of action.

Tested-by: Yanmin Zhang <yanmin_zhang@linux.intel.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Matthew Wilcox <matthew@wil.cx>
Cc: Alexander Viro <viro@ftp.linux.org.uk>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--	arch/mn10300/Kconfig	11
-rw-r--r--	include/linux/hardirq.h	18
-rw-r--r--	kernel/sched.c	27
-rw-r--r--	lib/kernel_lock.c	120
4 files changed, 95 insertions, 81 deletions
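As orientation before the full diff: a minimal sketch of the depth-counted BKL
pattern this commit restores, condensed from the lib/kernel_lock.c changes
below.  Plain spin_lock()/spin_unlock() stand in here for the real
_raw_spin_* calls and preempt-count handling, and the CONFIG_PREEMPT
"polite spin" path is omitted:

	/* Illustrative sketch only - see lib/kernel_lock.c in the diff below. */
	static DEFINE_SPINLOCK(kernel_flag);	/* the single global BKL */

	void lock_kernel(void)
	{
		int depth = current->lock_depth + 1;

		/*
		 * Only the outermost lock_kernel() takes the spinlock;
		 * nested calls just bump the per-task depth counter.
		 */
		if (likely(!depth))
			spin_lock(&kernel_flag);
		current->lock_depth = depth;
	}

	void unlock_kernel(void)
	{
		BUG_ON(current->lock_depth < 0);

		/* The lock is released only when the outermost level unlocks. */
		if (likely(--current->lock_depth < 0))
			spin_unlock(&kernel_flag);
	}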
diff --git a/arch/mn10300/Kconfig b/arch/mn10300/Kconfig
index 6a6409adc564..e856218da90d 100644
--- a/arch/mn10300/Kconfig
+++ b/arch/mn10300/Kconfig
@@ -186,17 +186,6 @@ config PREEMPT
 	  Say Y here if you are building a kernel for a desktop, embedded
 	  or real-time system.  Say N if you are unsure.
 
-config PREEMPT_BKL
-	bool "Preempt The Big Kernel Lock"
-	depends on PREEMPT
-	default y
-	help
-	  This option reduces the latency of the kernel by making the
-	  big kernel lock preemptible.
-
-	  Say Y here if you are building a kernel for a desktop system.
-	  Say N if you are unsure.
-
 config MN10300_CURRENT_IN_E2
 	bool "Hold current task address in E2 register"
 	default y
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index 897f723bd222..181006cc94a0 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -72,6 +72,14 @@
 #define in_softirq()		(softirq_count())
 #define in_interrupt()		(irq_count())
 
+#if defined(CONFIG_PREEMPT)
+# define PREEMPT_INATOMIC_BASE kernel_locked()
+# define PREEMPT_CHECK_OFFSET 1
+#else
+# define PREEMPT_INATOMIC_BASE 0
+# define PREEMPT_CHECK_OFFSET 0
+#endif
+
 /*
  * Are we running in atomic context? WARNING: this macro cannot
  * always detect atomic context; in particular, it cannot know about
@@ -79,17 +87,11 @@
  * used in the general case to determine whether sleeping is possible.
  * Do not use in_atomic() in driver code.
  */
-#define in_atomic()		((preempt_count() & ~PREEMPT_ACTIVE) != 0)
-
-#ifdef CONFIG_PREEMPT
-# define PREEMPT_CHECK_OFFSET 1
-#else
-# define PREEMPT_CHECK_OFFSET 0
-#endif
+#define in_atomic()	((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_INATOMIC_BASE)
 
 /*
  * Check whether we were atomic before we did preempt_disable():
- * (used by the scheduler)
+ * (used by the scheduler, *after* releasing the kernel lock)
  */
 #define in_atomic_preempt_off() \
 		((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_CHECK_OFFSET)
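The in_atomic() change above follows from the spinlock switch: under
CONFIG_PREEMPT, merely holding the kernel lock now raises preempt_count() by
one, yet such code may still sleep because schedule() transparently drops and
reacquires the lock.  A rough illustration of the intended accounting (the
actual macro is in the hunk above):

	/*
	 * Illustrative only: a task holding just the BKL has
	 * preempt_count() == 1, so in_atomic() compares against
	 * kernel_locked() (1 while the BKL is held, 0 otherwise)
	 * instead of against 0:
	 *
	 *	in_atomic() == ((preempt_count() & ~PREEMPT_ACTIVE) != kernel_locked())
	 */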
diff --git a/kernel/sched.c b/kernel/sched.c
index 58fb8af15776..c51b6565e07c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4567,8 +4567,6 @@ EXPORT_SYMBOL(schedule);
 asmlinkage void __sched preempt_schedule(void)
 {
 	struct thread_info *ti = current_thread_info();
-	struct task_struct *task = current;
-	int saved_lock_depth;
 
 	/*
 	 * If there is a non-zero preempt_count or interrupts are disabled,
@@ -4579,16 +4577,7 @@ asmlinkage void __sched preempt_schedule(void)
 
 	do {
 		add_preempt_count(PREEMPT_ACTIVE);
-
-		/*
-		 * We keep the big kernel semaphore locked, but we
-		 * clear ->lock_depth so that schedule() doesnt
-		 * auto-release the semaphore:
-		 */
-		saved_lock_depth = task->lock_depth;
-		task->lock_depth = -1;
 		schedule();
-		task->lock_depth = saved_lock_depth;
 		sub_preempt_count(PREEMPT_ACTIVE);
 
 		/*
@@ -4609,26 +4598,15 @@ EXPORT_SYMBOL(preempt_schedule);
 asmlinkage void __sched preempt_schedule_irq(void)
 {
 	struct thread_info *ti = current_thread_info();
-	struct task_struct *task = current;
-	int saved_lock_depth;
 
 	/* Catch callers which need to be fixed */
 	BUG_ON(ti->preempt_count || !irqs_disabled());
 
 	do {
 		add_preempt_count(PREEMPT_ACTIVE);
-
-		/*
-		 * We keep the big kernel semaphore locked, but we
-		 * clear ->lock_depth so that schedule() doesnt
-		 * auto-release the semaphore:
-		 */
-		saved_lock_depth = task->lock_depth;
-		task->lock_depth = -1;
 		local_irq_enable();
 		schedule();
 		local_irq_disable();
-		task->lock_depth = saved_lock_depth;
 		sub_preempt_count(PREEMPT_ACTIVE);
 
 		/*
@@ -5853,8 +5831,11 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
 	spin_unlock_irqrestore(&rq->lock, flags);
 
 	/* Set the preempt count _outside_ the spinlocks! */
+#if defined(CONFIG_PREEMPT)
+	task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
+#else
 	task_thread_info(idle)->preempt_count = 0;
-
+#endif
 	/*
 	 * The idle tasks have their own, simple scheduling class:
 	 */
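The kernel/sched.c changes follow from the same switch: with a spinlock BKL,
preemption stays disabled for as long as the lock is held, so
preempt_schedule() and preempt_schedule_irq() can no longer be entered with
the BKL held and the save/clear/restore of lock_depth around schedule()
becomes unnecessary.  The init_idle() hunk presumably accounts for the boot
CPU's idle task, which holds the BKL during early boot (lock_depth >= 0) and
therefore must start with a preempt_count of 1 rather than 0 under
CONFIG_PREEMPT.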
diff --git a/lib/kernel_lock.c b/lib/kernel_lock.c
index cd3e82530b03..01a3c22c1b5a 100644
--- a/lib/kernel_lock.c
+++ b/lib/kernel_lock.c
@@ -11,79 +11,121 @@
 #include <linux/semaphore.h>
 
 /*
- * The 'big kernel semaphore'
+ * The 'big kernel lock'
  *
- * This mutex is taken and released recursively by lock_kernel()
+ * This spinlock is taken and released recursively by lock_kernel()
  * and unlock_kernel().  It is transparently dropped and reacquired
  * over schedule().  It is used to protect legacy code that hasn't
  * been migrated to a proper locking design yet.
  *
- * Note: code locked by this semaphore will only be serialized against
- * other code using the same locking facility. The code guarantees that
- * the task remains on the same CPU.
- *
  * Don't use in new code.
  */
-static DECLARE_MUTEX(kernel_sem);
+static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kernel_flag);
+
 
 /*
- * Re-acquire the kernel semaphore.
+ * Acquire/release the underlying lock from the scheduler.
  *
- * This function is called with preemption off.
+ * This is called with preemption disabled, and should
+ * return an error value if it cannot get the lock and
+ * TIF_NEED_RESCHED gets set.
  *
- * We are executing in schedule() so the code must be extremely careful
- * about recursion, both due to the down() and due to the enabling of
- * preemption. schedule() will re-check the preemption flag after
- * reacquiring the semaphore.
+ * If it successfully gets the lock, it should increment
+ * the preemption count like any spinlock does.
+ *
+ * (This works on UP too - _raw_spin_trylock will never
+ * return false in that case)
  */
 int __lockfunc __reacquire_kernel_lock(void)
 {
-	struct task_struct *task = current;
-	int saved_lock_depth = task->lock_depth;
-
-	BUG_ON(saved_lock_depth < 0);
-
-	task->lock_depth = -1;
-	preempt_enable_no_resched();
-
-	down(&kernel_sem);
-
+	while (!_raw_spin_trylock(&kernel_flag)) {
+		if (test_thread_flag(TIF_NEED_RESCHED))
+			return -EAGAIN;
+		cpu_relax();
+	}
 	preempt_disable();
-	task->lock_depth = saved_lock_depth;
-
 	return 0;
 }
 
 void __lockfunc __release_kernel_lock(void)
 {
-	up(&kernel_sem);
+	_raw_spin_unlock(&kernel_flag);
+	preempt_enable_no_resched();
 }
 
 /*
- * Getting the big kernel semaphore.
+ * These are the BKL spinlocks - we try to be polite about preemption.
+ * If SMP is not on (ie UP preemption), this all goes away because the
+ * _raw_spin_trylock() will always succeed.
  */
-void __lockfunc lock_kernel(void)
+#ifdef CONFIG_PREEMPT
+static inline void __lock_kernel(void)
 {
-	struct task_struct *task = current;
-	int depth = task->lock_depth + 1;
+	preempt_disable();
+	if (unlikely(!_raw_spin_trylock(&kernel_flag))) {
+		/*
+		 * If preemption was disabled even before this
+		 * was called, there's nothing we can be polite
+		 * about - just spin.
+		 */
+		if (preempt_count() > 1) {
+			_raw_spin_lock(&kernel_flag);
+			return;
+		}
 
-	if (likely(!depth))
 		/*
-		 * No recursion worries - we set up lock_depth _after_
+		 * Otherwise, let's wait for the kernel lock
+		 * with preemption enabled..
 		 */
-		down(&kernel_sem);
+		do {
+			preempt_enable();
+			while (spin_is_locked(&kernel_flag))
+				cpu_relax();
+			preempt_disable();
+		} while (!_raw_spin_trylock(&kernel_flag));
+	}
+}
 
-	task->lock_depth = depth;
+#else
+
+/*
+ * Non-preemption case - just get the spinlock
+ */
+static inline void __lock_kernel(void)
+{
+	_raw_spin_lock(&kernel_flag);
 }
+#endif
 
-void __lockfunc unlock_kernel(void)
+static inline void __unlock_kernel(void)
 {
-	struct task_struct *task = current;
+	/*
+	 * the BKL is not covered by lockdep, so we open-code the
+	 * unlocking sequence (and thus avoid the dep-chain ops):
+	 */
+	_raw_spin_unlock(&kernel_flag);
+	preempt_enable();
+}
 
-	BUG_ON(task->lock_depth < 0);
+/*
+ * Getting the big kernel lock.
+ *
+ * This cannot happen asynchronously, so we only need to
+ * worry about other CPU's.
+ */
+void __lockfunc lock_kernel(void)
+{
+	int depth = current->lock_depth+1;
+	if (likely(!depth))
+		__lock_kernel();
+	current->lock_depth = depth;
+}
 
-	if (likely(--task->lock_depth < 0))
-		up(&kernel_sem);
+void __lockfunc unlock_kernel(void)
+{
+	BUG_ON(current->lock_depth < 0);
+	if (likely(--current->lock_depth < 0))
+		__unlock_kernel();
 }
 
 EXPORT_SYMBOL(lock_kernel);
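
For context on how this recursive API is used (an illustration, not part of
the patch): legacy code frequently re-enters BKL-protected paths, which is
why lock_kernel()/unlock_kernel() count depth per task instead of deadlocking
on the second acquisition.  A hypothetical call chain:

	/* Hypothetical legacy code, for illustration only. */
	static void legacy_inner(void)
	{
		lock_kernel();		/* lock_depth 0 -> 1: counter bump only, lock already held */
		/* ... touch BKL-protected state ... */
		unlock_kernel();	/* lock_depth 1 -> 0: kernel_flag stays held by the outer level */
	}

	static void legacy_outer(void)
	{
		lock_kernel();		/* lock_depth -1 -> 0: actually takes kernel_flag */
		legacy_inner();
		unlock_kernel();	/* lock_depth 0 -> -1: releases kernel_flag */
	}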