diff options
author | Alex Shi <alex.shi@intel.com> | 2013-02-05 08:11:55 -0500 |
---|---|---|
committer | Ingo Molnar <mingo@kernel.org> | 2013-02-19 02:42:43 -0500 |
commit | ce6711f3d196f09ca0ed29a24dfad42d83912b20 (patch) | |
tree | 6d948f9036bdee10baaa825d86ad4a444d9b22bb /lib | |
parent | 5cd3f5affad2109fd1458aab3f6216f2181e26ea (diff) |
rwsem: Implement writer lock-stealing for better scalability
Commit 5a505085f043 ("mm/rmap: Convert the struct anon_vma::mutex
to an rwsem") changed struct anon_vma::mutex to an rwsem, which
caused aim7 fork_test performance to drop by 50%.
Yuanhan Liu did the following excellent analysis:
https://lkml.org/lkml/2013/1/29/84
and found that the regression is caused by strict, serialized,
FIFO sequential write-ownership of rwsems. Ingo suggested
implementing opportunistic lock-stealing for the front writer
task in the waitqueue.
Yuanhan Liu implemented lock-stealing for spinlock-rwsems,
which indeed recovered much of the regression - confirming
the analysis that the main factor in the regression was the
FIFO writer-fairness of rwsems.
In this patch we allow lock-stealing to happen when the first
waiter is also a writer. With that change in place the
aim7 fork_test performance is fully recovered on my
Intel NHM EP, NHM EX, SNB EP 2S and 4S test-machines.
Reported-by: lkp@linux.intel.com
Reported-by: Yuanhan Liu <yuanhan.liu@linux.intel.com>
Signed-off-by: Alex Shi <alex.shi@intel.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Anton Blanchard <anton@samba.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: paul.gortmaker@windriver.com
Link: https://lkml.org/lkml/2013/1/29/84
Link: http://lkml.kernel.org/r/1360069915-31619-1-git-send-email-alex.shi@intel.com
[ Small stylistic fixes, updated changelog. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'lib')
-rw-r--r-- | lib/rwsem.c | 75 |
1 files changed, 46 insertions, 29 deletions
diff --git a/lib/rwsem.c b/lib/rwsem.c index 8337e1b9bb8d..ad5e0df16ab4 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c | |||
@@ -2,6 +2,8 @@ | |||
2 | * | 2 | * |
3 | * Written by David Howells (dhowells@redhat.com). | 3 | * Written by David Howells (dhowells@redhat.com). |
4 | * Derived from arch/i386/kernel/semaphore.c | 4 | * Derived from arch/i386/kernel/semaphore.c |
5 | * | ||
6 | * Writer lock-stealing by Alex Shi <alex.shi@intel.com> | ||
5 | */ | 7 | */ |
6 | #include <linux/rwsem.h> | 8 | #include <linux/rwsem.h> |
7 | #include <linux/sched.h> | 9 | #include <linux/sched.h> |
@@ -60,7 +62,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type) | |||
60 | struct rwsem_waiter *waiter; | 62 | struct rwsem_waiter *waiter; |
61 | struct task_struct *tsk; | 63 | struct task_struct *tsk; |
62 | struct list_head *next; | 64 | struct list_head *next; |
63 | signed long oldcount, woken, loop, adjustment; | 65 | signed long woken, loop, adjustment; |
64 | 66 | ||
65 | waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); | 67 | waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); |
66 | if (!(waiter->flags & RWSEM_WAITING_FOR_WRITE)) | 68 | if (!(waiter->flags & RWSEM_WAITING_FOR_WRITE)) |
@@ -72,30 +74,8 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type) | |||
72 | */ | 74 | */ |
73 | goto out; | 75 | goto out; |
74 | 76 | ||
75 | /* There's a writer at the front of the queue - try to grant it the | 77 | /* Wake up the writing waiter and let the task grab the sem: */ |
76 | * write lock. However, we only wake this writer if we can transition | 78 | wake_up_process(waiter->task); |
77 | * the active part of the count from 0 -> 1 | ||
78 | */ | ||
79 | adjustment = RWSEM_ACTIVE_WRITE_BIAS; | ||
80 | if (waiter->list.next == &sem->wait_list) | ||
81 | adjustment -= RWSEM_WAITING_BIAS; | ||
82 | |||
83 | try_again_write: | ||
84 | oldcount = rwsem_atomic_update(adjustment, sem) - adjustment; | ||
85 | if (oldcount & RWSEM_ACTIVE_MASK) | ||
86 | /* Someone grabbed the sem already */ | ||
87 | goto undo_write; | ||
88 | |||
89 | /* We must be careful not to touch 'waiter' after we set ->task = NULL. | ||
90 | * It is an allocated on the waiter's stack and may become invalid at | ||
91 | * any time after that point (due to a wakeup from another source). | ||
92 | */ | ||
93 | list_del(&waiter->list); | ||
94 | tsk = waiter->task; | ||
95 | smp_mb(); | ||
96 | waiter->task = NULL; | ||
97 | wake_up_process(tsk); | ||
98 | put_task_struct(tsk); | ||
99 | goto out; | 79 | goto out; |
100 | 80 | ||
101 | readers_only: | 81 | readers_only: |
@@ -157,12 +137,40 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type) | |||
157 | 137 | ||
158 | out: | 138 | out: |
159 | return sem; | 139 | return sem; |
140 | } | ||
141 | |||
142 | /* Try to get write sem, caller holds sem->wait_lock: */ | ||
143 | static int try_get_writer_sem(struct rw_semaphore *sem, | ||
144 | struct rwsem_waiter *waiter) | ||
145 | { | ||
146 | struct rwsem_waiter *fwaiter; | ||
147 | long oldcount, adjustment; | ||
160 | 148 | ||
161 | /* undo the change to the active count, but check for a transition | 149 | /* only steal when first waiter is writing */ |
162 | * 1->0 */ | 150 | fwaiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); |
163 | undo_write: | 151 | if (!(fwaiter->flags & RWSEM_WAITING_FOR_WRITE)) |
152 | return 0; | ||
153 | |||
154 | adjustment = RWSEM_ACTIVE_WRITE_BIAS; | ||
155 | /* Only one waiter in the queue: */ | ||
156 | if (fwaiter == waiter && waiter->list.next == &sem->wait_list) | ||
157 | adjustment -= RWSEM_WAITING_BIAS; | ||
158 | |||
159 | try_again_write: | ||
160 | oldcount = rwsem_atomic_update(adjustment, sem) - adjustment; | ||
161 | if (!(oldcount & RWSEM_ACTIVE_MASK)) { | ||
162 | /* No active lock: */ | ||
163 | struct task_struct *tsk = waiter->task; | ||
164 | |||
165 | list_del(&waiter->list); | ||
166 | smp_mb(); | ||
167 | put_task_struct(tsk); | ||
168 | tsk->state = TASK_RUNNING; | ||
169 | return 1; | ||
170 | } | ||
171 | /* some one grabbed the sem already */ | ||
164 | if (rwsem_atomic_update(-adjustment, sem) & RWSEM_ACTIVE_MASK) | 172 | if (rwsem_atomic_update(-adjustment, sem) & RWSEM_ACTIVE_MASK) |
165 | goto out; | 173 | return 0; |
166 | goto try_again_write; | 174 | goto try_again_write; |
167 | } | 175 | } |
168 | 176 | ||
@@ -210,6 +218,15 @@ rwsem_down_failed_common(struct rw_semaphore *sem, | |||
210 | for (;;) { | 218 | for (;;) { |
211 | if (!waiter.task) | 219 | if (!waiter.task) |
212 | break; | 220 | break; |
221 | |||
222 | raw_spin_lock_irq(&sem->wait_lock); | ||
223 | /* Try to get the writer sem, may steal from the head writer: */ | ||
224 | if (flags == RWSEM_WAITING_FOR_WRITE) | ||
225 | if (try_get_writer_sem(sem, &waiter)) { | ||
226 | raw_spin_unlock_irq(&sem->wait_lock); | ||
227 | return sem; | ||
228 | } | ||
229 | raw_spin_unlock_irq(&sem->wait_lock); | ||
213 | schedule(); | 230 | schedule(); |
214 | set_task_state(tsk, TASK_UNINTERRUPTIBLE); | 231 | set_task_state(tsk, TASK_UNINTERRUPTIBLE); |
215 | } | 232 | } |