author     Yuanhan Liu <yuanhan.liu@linux.intel.com>    2013-02-01 05:59:16 -0500
committer  Ingo Molnar <mingo@kernel.org>               2013-02-19 02:43:39 -0500
commit     41ef8f826692c8f65882bec0a8211bd4d1d2d19a
tree       a59199669e2ffb6f1fffaaee9a98b50fd82d1d96
parent     fe2b05f7ca9f906be61dced5489f63b8b4d7c770
rwsem-spinlock: Implement writer lock-stealing for better scalability
We (the Linux Kernel Performance project) found a regression
introduced by commit:

  5a505085f043 ("mm/rmap: Convert the struct anon_vma::mutex to an rwsem")

which converted all anon_vma::mutex locks into rwsem write locks.
The locking semantics are the same, but the behavioral difference is
quite large in some cases. After investigating it, we found the root
cause: mutexes support lock stealing while rwsems don't.
Here is the link to the detailed regression report:
https://lkml.org/lkml/2013/1/29/84
Ingo suggested adding write lock stealing to rwsems:
"I think we should allow lock-steal between rwsem writers - that
will not hurt fairness as most rwsem fairness concerns relate to
reader vs. writer fairness"
And here is the rwsem-spinlock version.
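To make the change concrete, the core of the new writer slow path is
sketched below. This is a simplified extract of the diff that follows
(waiter setup and the trylock/wakeup paths are omitted): instead of
sleeping until the wakeup path hands the lock over on its behalf, the
waiter re-checks sem->activity itself under sem->wait_lock each time it
runs, so a writer that is already on a CPU can take a free lock
immediately.

	/* simplified extract of the new __down_write_nested() slow path */
	raw_spin_lock_irqsave(&sem->wait_lock, flags);
	list_add_tail(&waiter.list, &sem->wait_list);

	for (;;) {
		if (sem->activity == 0)		/* lock is free: steal it */
			break;
		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
		raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
		schedule();
		raw_spin_lock_irqsave(&sem->wait_lock, flags);
	}

	/* got the lock */
	sem->activity = -1;
	list_del(&waiter.list);
	raw_spin_unlock_irqrestore(&sem->wait_lock, flags);

Correspondingly, the release/wakeup side (__rwsem_do_wake() and
__rwsem_wake_one_writer()) now just wakes the first waiting writer
instead of transferring ownership to it.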
With this patch, we saw roughly a 2x performance improvement on one
test box with the following aim7 workfile:
FILESIZE: 1M
POOLSIZE: 10M
10 fork_test
/usr/bin/time output             w/o patch        with patch
Percent of CPU this job got:     369%             537%
Voluntary context switches:      640595016        157915561

That is a 45% increase in CPU usage (537 / 369 ≈ 1.45) and a saving of
about 3/4 of the voluntary context switches (157915561 / 640595016 ≈ 0.25).
Reported-by: LKP project <lkp@linux.intel.com>
Suggested-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Yuanhan Liu <yuanhan.liu@linux.intel.com>
Cc: Alex Shi <alex.shi@intel.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Anton Blanchard <anton@samba.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: paul.gortmaker@windriver.com
Link: http://lkml.kernel.org/r/1359716356-23865-1-git-send-email-yuanhan.liu@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
-rw-r--r--   lib/rwsem-spinlock.c   69
1 file changed, 24 insertions(+), 45 deletions(-)
diff --git a/lib/rwsem-spinlock.c b/lib/rwsem-spinlock.c
index 7e0d6a58fc83..7542afbb22b3 100644
--- a/lib/rwsem-spinlock.c
+++ b/lib/rwsem-spinlock.c
@@ -73,20 +73,13 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite)
 		goto dont_wake_writers;
 	}
 
-	/* if we are allowed to wake writers try to grant a single write lock
-	 * if there's a writer at the front of the queue
-	 * - we leave the 'waiting count' incremented to signify potential
-	 *   contention
+	/*
+	 * as we support write lock stealing, we can't set sem->activity
+	 * to -1 here to indicate we get the lock. Instead, we wake it up
+	 * to let it go get it again.
 	 */
 	if (waiter->flags & RWSEM_WAITING_FOR_WRITE) {
-		sem->activity = -1;
-		list_del(&waiter->list);
-		tsk = waiter->task;
-		/* Don't touch waiter after ->task has been NULLed */
-		smp_mb();
-		waiter->task = NULL;
-		wake_up_process(tsk);
-		put_task_struct(tsk);
+		wake_up_process(waiter->task);
 		goto out;
 	}
 
@@ -121,18 +114,10 @@ static inline struct rw_semaphore *
 __rwsem_wake_one_writer(struct rw_semaphore *sem)
 {
 	struct rwsem_waiter *waiter;
-	struct task_struct *tsk;
-
-	sem->activity = -1;
 
 	waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
-	list_del(&waiter->list);
+	wake_up_process(waiter->task);
 
-	tsk = waiter->task;
-	smp_mb();
-	waiter->task = NULL;
-	wake_up_process(tsk);
-	put_task_struct(tsk);
 	return sem;
 }
 
@@ -204,7 +189,6 @@ int __down_read_trylock(struct rw_semaphore *sem)
 
 /*
  * get a write lock on the semaphore
- * - we increment the waiting count anyway to indicate an exclusive lock
  */
 void __sched __down_write_nested(struct rw_semaphore *sem, int subclass)
 {
@@ -214,37 +198,32 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass)
 
 	raw_spin_lock_irqsave(&sem->wait_lock, flags);
 
-	if (sem->activity == 0 && list_empty(&sem->wait_list)) {
-		/* granted */
-		sem->activity = -1;
-		raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
-		goto out;
-	}
-
-	tsk = current;
-	set_task_state(tsk, TASK_UNINTERRUPTIBLE);
-
 	/* set up my own style of waitqueue */
+	tsk = current;
 	waiter.task = tsk;
 	waiter.flags = RWSEM_WAITING_FOR_WRITE;
-	get_task_struct(tsk);
-
 	list_add_tail(&waiter.list, &sem->wait_list);
 
-	/* we don't need to touch the semaphore struct anymore */
-	raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
-
-	/* wait to be given the lock */
+	/* wait for someone to release the lock */
 	for (;;) {
-		if (!waiter.task)
+		/*
+		 * That is the key to support write lock stealing: allows the
+		 * task already on CPU to get the lock soon rather than put
+		 * itself into sleep and waiting for system woke it or someone
+		 * else in the head of the wait list up.
+		 */
+		if (sem->activity == 0)
 			break;
-		schedule();
 		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+		raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+		schedule();
+		raw_spin_lock_irqsave(&sem->wait_lock, flags);
 	}
+	/* got the lock */
+	sem->activity = -1;
+	list_del(&waiter.list);
 
-	tsk->state = TASK_RUNNING;
- out:
-	;
+	raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
 }
 
 void __sched __down_write(struct rw_semaphore *sem)
@@ -262,8 +241,8 @@ int __down_write_trylock(struct rw_semaphore *sem)
 
 	raw_spin_lock_irqsave(&sem->wait_lock, flags);
 
-	if (sem->activity == 0 && list_empty(&sem->wait_list)) {
-		/* granted */
+	if (sem->activity == 0) {
+		/* got the lock */
 		sem->activity = -1;
 		ret = 1;
 	}
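For experimentation outside the kernel, here is a hypothetical,
self-contained user-space model of the writer-stealing idea using POSIX
threads. It is not part of the patch and only mirrors the writer path;
the names model_rwsem, model_down_write, model_up_write and writer are
illustrative. A condition variable plays the role of the wait list, and
any writer that finds activity == 0 under the wait_lock takes the lock
immediately, whether or not other writers were queued first.

	/*
	 * Hypothetical user-space model of writer lock stealing.
	 * Not kernel code and not part of the patch above.
	 * Build (assumed file name): cc -pthread writer_steal.c
	 */
	#include <pthread.h>
	#include <stdio.h>

	struct model_rwsem {
		pthread_mutex_t wait_lock;	/* models sem->wait_lock */
		pthread_cond_t  wait;		/* models the wait list + wakeups */
		int             activity;	/* 0 free, -1 writer holds it */
	};

	static void model_down_write(struct model_rwsem *sem)
	{
		pthread_mutex_lock(&sem->wait_lock);
		/*
		 * Lock stealing: any writer that reaches this point and finds
		 * the semaphore free takes it at once, even if other writers
		 * were queued first; writer fairness is traded for throughput.
		 */
		while (sem->activity != 0)
			pthread_cond_wait(&sem->wait, &sem->wait_lock);
		sem->activity = -1;		/* got the write lock */
		pthread_mutex_unlock(&sem->wait_lock);
	}

	static void model_up_write(struct model_rwsem *sem)
	{
		pthread_mutex_lock(&sem->wait_lock);
		sem->activity = 0;
		/* wake a waiter and let it race for the lock, no hand-off */
		pthread_cond_signal(&sem->wait);
		pthread_mutex_unlock(&sem->wait_lock);
	}

	static struct model_rwsem sem = {
		.wait_lock = PTHREAD_MUTEX_INITIALIZER,
		.wait      = PTHREAD_COND_INITIALIZER,
		.activity  = 0,
	};

	static void *writer(void *arg)
	{
		model_down_write(&sem);
		printf("writer %ld got the lock\n", (long)arg);
		model_up_write(&sem);
		return NULL;
	}

	int main(void)
	{
		pthread_t t[4];
		long i;

		for (i = 0; i < 4; i++)
			pthread_create(&t[i], NULL, writer, (void *)i);
		for (i = 0; i < 4; i++)
			pthread_join(t[i], NULL);
		return 0;
	}

Under strict hand-off semantics the releasing thread would instead pick
the head of the queue and mark the lock as owned on its behalf, which is
what the removed sem->activity = -1 / waiter->task = NULL sequence in
__rwsem_do_wake() used to do.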