diff options
| author | Yuanhan Liu <yuanhan.liu@linux.intel.com> | 2013-02-01 05:59:16 -0500 |
|---|---|---|
| committer | Ingo Molnar <mingo@kernel.org> | 2013-02-19 02:43:39 -0500 |
| commit | 41ef8f826692c8f65882bec0a8211bd4d1d2d19a (patch) | |
| tree | a59199669e2ffb6f1fffaaee9a98b50fd82d1d96 | |
| parent | fe2b05f7ca9f906be61dced5489f63b8b4d7c770 (diff) | |
rwsem-spinlock: Implement writer lock-stealing for better scalability
We (Linux Kernel Performance project) found a regression
introduced by commit:
5a505085f043 mm/rmap: Convert the struct anon_vma::mutex to an rwsem
which converted all anon_vma::mutex locks rwsem write locks.
The semantics are the same, but the behavioral difference is
quite huge in some cases. After investigating it we found the
root cause: mutexes support lock stealing while rwsems don't.
Here is the link for the detailed regression report:
https://lkml.org/lkml/2013/1/29/84
Ingo suggested adding write lock stealing to rwsems:
"I think we should allow lock-steal between rwsem writers - that
will not hurt fairness as most rwsem fairness concerns relate to
reader vs. writer fairness"
And here is the rwsem-spinlock version.
With this patch, we got a double performance increase in one
test box with following aim7 workfile:
FILESIZE: 1M
POOLSIZE: 10M
10 fork_test
/usr/bin/time output w/o patch /usr/bin/time_output with patch
-- Percent of CPU this job got: 369% Percent of CPU this job got: 537%
Voluntary context switches: 640595016 Voluntary context switches: 157915561
We got a 45% increase in CPU usage and saved about 3/4 voluntary context switches.
Reported-by: LKP project <lkp@linux.intel.com>
Suggested-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Yuanhan Liu <yuanhan.liu@linux.intel.com>
Cc: Alex Shi <alex.shi@intel.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Anton Blanchard <anton@samba.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: paul.gortmaker@windriver.com
Link: http://lkml.kernel.org/r/1359716356-23865-1-git-send-email-yuanhan.liu@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
| -rw-r--r-- | lib/rwsem-spinlock.c | 69 |
1 files changed, 24 insertions, 45 deletions
diff --git a/lib/rwsem-spinlock.c b/lib/rwsem-spinlock.c index 7e0d6a58fc83..7542afbb22b3 100644 --- a/lib/rwsem-spinlock.c +++ b/lib/rwsem-spinlock.c | |||
| @@ -73,20 +73,13 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) | |||
| 73 | goto dont_wake_writers; | 73 | goto dont_wake_writers; |
| 74 | } | 74 | } |
| 75 | 75 | ||
| 76 | /* if we are allowed to wake writers try to grant a single write lock | 76 | /* |
| 77 | * if there's a writer at the front of the queue | 77 | * as we support write lock stealing, we can't set sem->activity |
| 78 | * - we leave the 'waiting count' incremented to signify potential | 78 | * to -1 here to indicate we get the lock. Instead, we wake it up |
| 79 | * contention | 79 | * to let it go get it again. |
| 80 | */ | 80 | */ |
| 81 | if (waiter->flags & RWSEM_WAITING_FOR_WRITE) { | 81 | if (waiter->flags & RWSEM_WAITING_FOR_WRITE) { |
| 82 | sem->activity = -1; | 82 | wake_up_process(waiter->task); |
| 83 | list_del(&waiter->list); | ||
| 84 | tsk = waiter->task; | ||
| 85 | /* Don't touch waiter after ->task has been NULLed */ | ||
| 86 | smp_mb(); | ||
| 87 | waiter->task = NULL; | ||
| 88 | wake_up_process(tsk); | ||
| 89 | put_task_struct(tsk); | ||
| 90 | goto out; | 83 | goto out; |
| 91 | } | 84 | } |
| 92 | 85 | ||
| @@ -121,18 +114,10 @@ static inline struct rw_semaphore * | |||
| 121 | __rwsem_wake_one_writer(struct rw_semaphore *sem) | 114 | __rwsem_wake_one_writer(struct rw_semaphore *sem) |
| 122 | { | 115 | { |
| 123 | struct rwsem_waiter *waiter; | 116 | struct rwsem_waiter *waiter; |
| 124 | struct task_struct *tsk; | ||
| 125 | |||
| 126 | sem->activity = -1; | ||
| 127 | 117 | ||
| 128 | waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); | 118 | waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); |
| 129 | list_del(&waiter->list); | 119 | wake_up_process(waiter->task); |
| 130 | 120 | ||
| 131 | tsk = waiter->task; | ||
| 132 | smp_mb(); | ||
| 133 | waiter->task = NULL; | ||
| 134 | wake_up_process(tsk); | ||
| 135 | put_task_struct(tsk); | ||
| 136 | return sem; | 121 | return sem; |
| 137 | } | 122 | } |
| 138 | 123 | ||
| @@ -204,7 +189,6 @@ int __down_read_trylock(struct rw_semaphore *sem) | |||
| 204 | 189 | ||
| 205 | /* | 190 | /* |
| 206 | * get a write lock on the semaphore | 191 | * get a write lock on the semaphore |
| 207 | * - we increment the waiting count anyway to indicate an exclusive lock | ||
| 208 | */ | 192 | */ |
| 209 | void __sched __down_write_nested(struct rw_semaphore *sem, int subclass) | 193 | void __sched __down_write_nested(struct rw_semaphore *sem, int subclass) |
| 210 | { | 194 | { |
| @@ -214,37 +198,32 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass) | |||
| 214 | 198 | ||
| 215 | raw_spin_lock_irqsave(&sem->wait_lock, flags); | 199 | raw_spin_lock_irqsave(&sem->wait_lock, flags); |
| 216 | 200 | ||
| 217 | if (sem->activity == 0 && list_empty(&sem->wait_list)) { | ||
| 218 | /* granted */ | ||
| 219 | sem->activity = -1; | ||
| 220 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); | ||
| 221 | goto out; | ||
| 222 | } | ||
| 223 | |||
| 224 | tsk = current; | ||
| 225 | set_task_state(tsk, TASK_UNINTERRUPTIBLE); | ||
| 226 | |||
| 227 | /* set up my own style of waitqueue */ | 201 | /* set up my own style of waitqueue */ |
| 202 | tsk = current; | ||
| 228 | waiter.task = tsk; | 203 | waiter.task = tsk; |
| 229 | waiter.flags = RWSEM_WAITING_FOR_WRITE; | 204 | waiter.flags = RWSEM_WAITING_FOR_WRITE; |
| 230 | get_task_struct(tsk); | ||
| 231 | |||
| 232 | list_add_tail(&waiter.list, &sem->wait_list); | 205 | list_add_tail(&waiter.list, &sem->wait_list); |
| 233 | 206 | ||
| 234 | /* we don't need to touch the semaphore struct anymore */ | 207 | /* wait for someone to release the lock */ |
| 235 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); | ||
| 236 | |||
| 237 | /* wait to be given the lock */ | ||
| 238 | for (;;) { | 208 | for (;;) { |
| 239 | if (!waiter.task) | 209 | /* |
| 210 | * That is the key to support write lock stealing: allows the | ||
| 211 | * task already on CPU to get the lock soon rather than put | ||
| 212 | * itself into sleep and waiting for system woke it or someone | ||
| 213 | * else in the head of the wait list up. | ||
| 214 | */ | ||
| 215 | if (sem->activity == 0) | ||
| 240 | break; | 216 | break; |
| 241 | schedule(); | ||
| 242 | set_task_state(tsk, TASK_UNINTERRUPTIBLE); | 217 | set_task_state(tsk, TASK_UNINTERRUPTIBLE); |
| 218 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); | ||
| 219 | schedule(); | ||
| 220 | raw_spin_lock_irqsave(&sem->wait_lock, flags); | ||
| 243 | } | 221 | } |
| 222 | /* got the lock */ | ||
| 223 | sem->activity = -1; | ||
| 224 | list_del(&waiter.list); | ||
| 244 | 225 | ||
| 245 | tsk->state = TASK_RUNNING; | 226 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); |
| 246 | out: | ||
| 247 | ; | ||
| 248 | } | 227 | } |
| 249 | 228 | ||
| 250 | void __sched __down_write(struct rw_semaphore *sem) | 229 | void __sched __down_write(struct rw_semaphore *sem) |
| @@ -262,8 +241,8 @@ int __down_write_trylock(struct rw_semaphore *sem) | |||
| 262 | 241 | ||
| 263 | raw_spin_lock_irqsave(&sem->wait_lock, flags); | 242 | raw_spin_lock_irqsave(&sem->wait_lock, flags); |
| 264 | 243 | ||
| 265 | if (sem->activity == 0 && list_empty(&sem->wait_list)) { | 244 | if (sem->activity == 0) { |
| 266 | /* granted */ | 245 | /* got the lock */ |
| 267 | sem->activity = -1; | 246 | sem->activity = -1; |
| 268 | ret = 1; | 247 | ret = 1; |
| 269 | } | 248 | } |
