aboutsummaryrefslogtreecommitdiffstats
path: root/lib
diff options
context:
space:
mode:
authorAlex Shi <alex.shi@intel.com>2013-02-05 08:11:55 -0500
committerIngo Molnar <mingo@kernel.org>2013-02-19 02:42:43 -0500
commitce6711f3d196f09ca0ed29a24dfad42d83912b20 (patch)
tree6d948f9036bdee10baaa825d86ad4a444d9b22bb /lib
parent5cd3f5affad2109fd1458aab3f6216f2181e26ea (diff)
rwsem: Implement writer lock-stealing for better scalability
Commit 5a505085f043 ("mm/rmap: Convert the struct anon_vma::mutex to an rwsem") changed struct anon_vma::mutex to an rwsem, which caused aim7 fork_test performance to drop by 50%. Yuanhan Liu did the following excellent analysis: https://lkml.org/lkml/2013/1/29/84 and found that the regression is caused by strict, serialized, FIFO sequential write-ownership of rwsems. Ingo suggested implementing opportunistic lock-stealing for the front writer task in the waitqueue. Yuanhan Liu implemented lock-stealing for spinlock-rwsems, which indeed recovered much of the regression - confirming the analysis that the main factor in the regression was the FIFO writer-fairness of rwsems. In this patch we allow lock-stealing to happen when the first waiter is also writer. With that change in place the aim7 fork_test performance is fully recovered on my Intel NHM EP, NHM EX, SNB EP 2S and 4S test-machines. Reported-by: lkp@linux.intel.com Reported-by: Yuanhan Liu <yuanhan.liu@linux.intel.com> Signed-off-by: Alex Shi <alex.shi@intel.com> Cc: David Howells <dhowells@redhat.com> Cc: Michel Lespinasse <walken@google.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Anton Blanchard <anton@samba.org> Cc: Arjan van de Ven <arjan@linux.intel.com> Cc: paul.gortmaker@windriver.com Link: https://lkml.org/lkml/2013/1/29/84 Link: http://lkml.kernel.org/r/1360069915-31619-1-git-send-email-alex.shi@intel.com [ Small stylistic fixes, updated changelog. ] Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'lib')
-rw-r--r--lib/rwsem.c75
1 files changed, 46 insertions, 29 deletions
diff --git a/lib/rwsem.c b/lib/rwsem.c
index 8337e1b9bb8d..ad5e0df16ab4 100644
--- a/lib/rwsem.c
+++ b/lib/rwsem.c
@@ -2,6 +2,8 @@
2 * 2 *
3 * Written by David Howells (dhowells@redhat.com). 3 * Written by David Howells (dhowells@redhat.com).
4 * Derived from arch/i386/kernel/semaphore.c 4 * Derived from arch/i386/kernel/semaphore.c
5 *
6 * Writer lock-stealing by Alex Shi <alex.shi@intel.com>
5 */ 7 */
6#include <linux/rwsem.h> 8#include <linux/rwsem.h>
7#include <linux/sched.h> 9#include <linux/sched.h>
@@ -60,7 +62,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type)
60 struct rwsem_waiter *waiter; 62 struct rwsem_waiter *waiter;
61 struct task_struct *tsk; 63 struct task_struct *tsk;
62 struct list_head *next; 64 struct list_head *next;
63 signed long oldcount, woken, loop, adjustment; 65 signed long woken, loop, adjustment;
64 66
65 waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); 67 waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
66 if (!(waiter->flags & RWSEM_WAITING_FOR_WRITE)) 68 if (!(waiter->flags & RWSEM_WAITING_FOR_WRITE))
@@ -72,30 +74,8 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type)
72 */ 74 */
73 goto out; 75 goto out;
74 76
75 /* There's a writer at the front of the queue - try to grant it the 77 /* Wake up the writing waiter and let the task grab the sem: */
76 * write lock. However, we only wake this writer if we can transition 78 wake_up_process(waiter->task);
77 * the active part of the count from 0 -> 1
78 */
79 adjustment = RWSEM_ACTIVE_WRITE_BIAS;
80 if (waiter->list.next == &sem->wait_list)
81 adjustment -= RWSEM_WAITING_BIAS;
82
83 try_again_write:
84 oldcount = rwsem_atomic_update(adjustment, sem) - adjustment;
85 if (oldcount & RWSEM_ACTIVE_MASK)
86 /* Someone grabbed the sem already */
87 goto undo_write;
88
89 /* We must be careful not to touch 'waiter' after we set ->task = NULL.
90 * It is an allocated on the waiter's stack and may become invalid at
91 * any time after that point (due to a wakeup from another source).
92 */
93 list_del(&waiter->list);
94 tsk = waiter->task;
95 smp_mb();
96 waiter->task = NULL;
97 wake_up_process(tsk);
98 put_task_struct(tsk);
99 goto out; 79 goto out;
100 80
101 readers_only: 81 readers_only:
@@ -157,12 +137,40 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type)
157 137
158 out: 138 out:
159 return sem; 139 return sem;
140}
141
142/* Try to get write sem, caller holds sem->wait_lock: */
143static int try_get_writer_sem(struct rw_semaphore *sem,
144 struct rwsem_waiter *waiter)
145{
146 struct rwsem_waiter *fwaiter;
147 long oldcount, adjustment;
160 148
161 /* undo the change to the active count, but check for a transition 149 /* only steal when first waiter is writing */
162 * 1->0 */ 150 fwaiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
163 undo_write: 151 if (!(fwaiter->flags & RWSEM_WAITING_FOR_WRITE))
152 return 0;
153
154 adjustment = RWSEM_ACTIVE_WRITE_BIAS;
155 /* Only one waiter in the queue: */
156 if (fwaiter == waiter && waiter->list.next == &sem->wait_list)
157 adjustment -= RWSEM_WAITING_BIAS;
158
159try_again_write:
160 oldcount = rwsem_atomic_update(adjustment, sem) - adjustment;
161 if (!(oldcount & RWSEM_ACTIVE_MASK)) {
162 /* No active lock: */
163 struct task_struct *tsk = waiter->task;
164
165 list_del(&waiter->list);
166 smp_mb();
167 put_task_struct(tsk);
168 tsk->state = TASK_RUNNING;
169 return 1;
170 }
171 /* some one grabbed the sem already */
164 if (rwsem_atomic_update(-adjustment, sem) & RWSEM_ACTIVE_MASK) 172 if (rwsem_atomic_update(-adjustment, sem) & RWSEM_ACTIVE_MASK)
165 goto out; 173 return 0;
166 goto try_again_write; 174 goto try_again_write;
167} 175}
168 176
@@ -210,6 +218,15 @@ rwsem_down_failed_common(struct rw_semaphore *sem,
210 for (;;) { 218 for (;;) {
211 if (!waiter.task) 219 if (!waiter.task)
212 break; 220 break;
221
222 raw_spin_lock_irq(&sem->wait_lock);
223 /* Try to get the writer sem, may steal from the head writer: */
224 if (flags == RWSEM_WAITING_FOR_WRITE)
225 if (try_get_writer_sem(sem, &waiter)) {
226 raw_spin_unlock_irq(&sem->wait_lock);
227 return sem;
228 }
229 raw_spin_unlock_irq(&sem->wait_lock);
213 schedule(); 230 schedule();
214 set_task_state(tsk, TASK_UNINTERRUPTIBLE); 231 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
215 } 232 }