aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/mutex.c
diff options
context:
space:
mode:
authorWaiman Long <Waiman.Long@hp.com>2013-04-17 15:23:13 -0400
committerIngo Molnar <mingo@kernel.org>2013-04-19 03:33:36 -0400
commit2bd2c92cf07cc4a373bf316c75b78ac465fefd35 (patch)
tree0d8e6121b5f9d364a3fa90489c8f8fe32717b63c /kernel/mutex.c
parent0dc8c730c98a06a4d927f8d08bd0dd6de973b8dd (diff)
mutex: Queue mutex spinners with MCS lock to reduce cacheline contention
The current mutex spinning code (with MUTEX_SPIN_ON_OWNER option turned on) allows multiple tasks to spin on a single mutex concurrently. A potential problem with the current approach is that when the mutex becomes available, all the spinning tasks will try to acquire the mutex more or less simultaneously. As a result, there will be a lot of cacheline bouncing especially on systems with a large number of CPUs. This patch tries to reduce this kind of contention by putting the mutex spinners into a queue so that only the first one in the queue will try to acquire the mutex. This will reduce contention and allow all the tasks to move forward faster. The queuing of mutex spinners is done using an MCS lock based implementation which will further reduce contention on the mutex cacheline compared with a similar ticket spinlock based implementation. This patch will add a new field into the mutex data structure for holding the MCS lock. This expands the mutex size by 8 bytes for 64-bit systems and 4 bytes for 32-bit systems. This overhead will be avoided if the MUTEX_SPIN_ON_OWNER option is turned off. The following table shows the jobs per minute (JPM) scalability data on an 8-node 80-core Westmere box with a 3.7.10 kernel. The numactl command is used to restrict the running of the fserver workloads to 1/2/4/8 nodes with hyperthreading off. 
+-----------------+-----------+-----------+-------------+----------+ | Configuration | Mean JPM | Mean JPM | Mean JPM | % Change | | | w/o patch | patch 1 | patches 1&2 | 1->1&2 | +-----------------+------------------------------------------------+ | | User Range 1100 - 2000 | +-----------------+------------------------------------------------+ | 8 nodes, HT off | 227972 | 227237 | 305043 | +34.2% | | 4 nodes, HT off | 393503 | 381558 | 394650 | +3.4% | | 2 nodes, HT off | 334957 | 325240 | 338853 | +4.2% | | 1 node , HT off | 198141 | 197972 | 198075 | +0.1% | +-----------------+------------------------------------------------+ | | User Range 200 - 1000 | +-----------------+------------------------------------------------+ | 8 nodes, HT off | 282325 | 312870 | 332185 | +6.2% | | 4 nodes, HT off | 390698 | 378279 | 393419 | +4.0% | | 2 nodes, HT off | 336986 | 326543 | 340260 | +4.2% | | 1 node , HT off | 197588 | 197622 | 197582 | 0.0% | +-----------------+-----------+-----------+-------------+----------+ At low user range 10-100, the JPM differences were within +/-1%. So they are not that interesting. The fserver workload uses mutex spinning extensively. With just the mutex change in the first patch, there is no noticeable change in performance. Rather, there is a slight drop in performance. This mutex spinning patch more than recovers the lost performance and show a significant increase of +30% at high user load with the full 8 nodes. Similar improvements were also seen in a 3.8 kernel. The table below shows the %time spent by different kernel functions as reported by perf when running the fserver workload at 1500 users with all 8 nodes. 
+-----------------------+-----------+---------+-------------+ | Function | % time | % time | % time | | | w/o patch | patch 1 | patches 1&2 | +-----------------------+-----------+---------+-------------+ | __read_lock_failed | 34.96% | 34.91% | 29.14% | | __write_lock_failed | 10.14% | 10.68% | 7.51% | | mutex_spin_on_owner | 3.62% | 3.42% | 2.33% | | mspin_lock | N/A | N/A | 9.90% | | __mutex_lock_slowpath | 1.46% | 0.81% | 0.14% | | _raw_spin_lock | 2.25% | 2.50% | 1.10% | +-----------------------+-----------+---------+-------------+ The fserver workload for an 8-node system is dominated by the contention in the read/write lock. Mutex contention also plays a role. With the first patch only, mutex contention is down (as shown by the __mutex_lock_slowpath figure) which helps a little bit. We saw only a few percent improvement with that. By applying patch 2 as well, the single mutex_spin_on_owner figure is now split out into an additional mspin_lock figure. The combined time increases from 3.42% to 12.23% (2.33% + 9.90%). It shows a great reduction in contention among the spinners leading to a 30% improvement. The time ratio 9.9/2.33=4.3 indicates that there are on average 4+ spinners waiting in the spin_lock loop for each spinner in the mutex_spin_on_owner loop. Contention in other locking functions also goes down by quite a lot. The table below shows the performance change of both patches 1 & 2 over patch 1 alone in other AIM7 workloads (at 8 nodes, hyperthreading off). 
+--------------+---------------+----------------+-----------------+ | Workload | mean % change | mean % change | mean % change | | | 10-100 users | 200-1000 users | 1100-2000 users | +--------------+---------------+----------------+-----------------+ | alltests | 0.0% | -0.8% | +0.6% | | five_sec | -0.3% | +0.8% | +0.8% | | high_systime | +0.4% | +2.4% | +2.1% | | new_fserver | +0.1% | +14.1% | +34.2% | | shared | -0.5% | -0.3% | -0.4% | | short | -1.7% | -9.8% | -8.3% | +--------------+---------------+----------------+-----------------+ The short workload is the only one that shows a decline in performance probably due to the spinner locking and queuing overhead. Signed-off-by: Waiman Long <Waiman.Long@hp.com> Reviewed-by: Davidlohr Bueso <davidlohr.bueso@hp.com> Acked-by: Rik van Riel <riel@redhat.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Chandramouleeswaran Aswin <aswin@hp.com> Cc: Norton Scott J <scott.norton@hp.com> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: David Howells <dhowells@redhat.com> Cc: Dave Jones <davej@redhat.com> Cc: Clark Williams <williams@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Link: http://lkml.kernel.org/r/1366226594-5506-4-git-send-email-Waiman.Long@hp.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel/mutex.c')
-rw-r--r--kernel/mutex.c91
1 files changed, 90 insertions, 1 deletions
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 70ebd855d9e8..1dbd4210baef 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -55,6 +55,9 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
55 spin_lock_init(&lock->wait_lock); 55 spin_lock_init(&lock->wait_lock);
56 INIT_LIST_HEAD(&lock->wait_list); 56 INIT_LIST_HEAD(&lock->wait_list);
57 mutex_clear_owner(lock); 57 mutex_clear_owner(lock);
58#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
59 lock->spin_mlock = NULL;
60#endif
58 61
59 debug_mutex_init(lock, name, key); 62 debug_mutex_init(lock, name, key);
60} 63}
@@ -108,6 +111,60 @@ EXPORT_SYMBOL(mutex_lock);
108 111
109#ifdef CONFIG_MUTEX_SPIN_ON_OWNER 112#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
110/* 113/*
114 * In order to avoid a stampede of mutex spinners from acquiring the mutex
115 * more or less simultaneously, the spinners need to acquire a MCS lock
116 * first before spinning on the owner field.
117 *
118 * We don't inline mspin_lock() so that perf can correctly account for the
119 * time spent in this lock function.
120 */
/*
 * Queue node for the MCS-style spinner lock.  Each spinning task embeds
 * one of these on its own stack, so each waiter busy-waits on its own
 * CPU-local cacheline instead of on the shared mutex cacheline.
 */
121struct mspin_node {
122 struct mspin_node *next ; /* successor in the spinner queue */
123 int locked; /* 1 if lock acquired */
124};
/* Accessor for the MCS queue-tail pointer stored in the mutex. */
125#define MLOCK(mutex) ((struct mspin_node **)&((mutex)->spin_mlock))
126
/*
 * Acquire the MCS spinner lock.
 *
 * The node is atomically appended to the queue tail with xchg().  If the
 * queue was empty (prev == NULL), the lock is acquired immediately.
 * Otherwise we publish ourselves as the old tail's successor and spin on
 * our own node->locked flag until the predecessor hands the lock down in
 * mspin_unlock() — spinning on a per-task node rather than a shared word.
 *
 * Kept out of line (noinline) so perf attributes spin time to this
 * function rather than folding it into the caller.
 */
127static noinline
128void mspin_lock(struct mspin_node **lock, struct mspin_node *node)
129{
130 struct mspin_node *prev;
131
132 /* Init node */
133 node->locked = 0;
134 node->next = NULL;
135
 /* Become the new queue tail; xchg() returns the previous tail. */
136 prev = xchg(lock, node);
137 if (likely(prev == NULL)) {
138 /* Lock acquired */
139 node->locked = 1;
140 return;
141 }
 /* A predecessor exists: link in behind it so it can find us. */
142 ACCESS_ONCE(prev->next) = node;
143 smp_wmb();
144 /* Wait until the lock holder passes the lock down */
145 while (!ACCESS_ONCE(node->locked))
146 arch_mutex_cpu_relax();
147}
148
/*
 * Release the MCS spinner lock and pass it to the next queued spinner,
 * if any.
 *
 * If no successor is visible yet, try to swing the queue tail back to
 * NULL with cmpxchg().  When the cmpxchg fails, a new spinner has already
 * made itself the tail in mspin_lock() but has not linked itself into
 * node->next yet, so wait for that link to appear before handing the
 * lock down by setting next->locked.
 *
 * NOTE(review): the smp_wmb() follows the next->locked store; confirm
 * this ordering against Documentation/memory-barriers.txt.
 */
149static void mspin_unlock(struct mspin_node **lock, struct mspin_node *node)
150{
151 struct mspin_node *next = ACCESS_ONCE(node->next);
152
153 if (likely(!next)) {
154 /*
155 * Release the lock by setting it to NULL
156 */
157 if (cmpxchg(lock, node, NULL) == node)
158 return;
159 /* Wait until the next pointer is set */
160 while (!(next = ACCESS_ONCE(node->next)))
161 arch_mutex_cpu_relax();
162 }
163 ACCESS_ONCE(next->locked) = 1;
164 smp_wmb();
165}
166
167/*
111 * Mutex spinning code migrated from kernel/sched/core.c 168 * Mutex spinning code migrated from kernel/sched/core.c
112 */ 169 */
113 170
@@ -150,6 +207,24 @@ int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
150 */ 207 */
151 return lock->owner == NULL; 208 return lock->owner == NULL;
152} 209}
210
211/*
212 * Initial check for entering the mutex spinning loop
 *
 * Returns nonzero when optimistic spinning looks worthwhile: either no
 * owner is recorded, or the recorded owner is currently running on a CPU
 * (and so may release the mutex soon).  This is a racy heuristic — the
 * answer may be stale by the time the caller acts on it.
213 */
214static inline int mutex_can_spin_on_owner(struct mutex *lock)
215{
 /* Default to "worth spinning" when no owner is visible. */
216 int retval = 1;
217
 /* RCU keeps the owner task_struct valid while we read ->on_cpu. */
218 rcu_read_lock();
219 if (lock->owner)
220 retval = lock->owner->on_cpu;
221 rcu_read_unlock();
222 /*
223 * if lock->owner is not set, the mutex owner may have just acquired
224 * it and not set the owner yet or the mutex has been released.
225 */
226 return retval;
227}
153#endif 228#endif
154 229
155static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); 230static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count);
@@ -215,26 +290,39 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
215 * 290 *
216 * We can't do this for DEBUG_MUTEXES because that relies on wait_lock 291 * We can't do this for DEBUG_MUTEXES because that relies on wait_lock
217 * to serialize everything. 292 * to serialize everything.
293 *
294 * The mutex spinners are queued up using MCS lock so that only one
295 * spinner can compete for the mutex. However, if mutex spinning isn't
296 * going to happen, there is no point in going through the lock/unlock
297 * overhead.
218 */ 298 */
299 if (!mutex_can_spin_on_owner(lock))
300 goto slowpath;
219 301
220 for (;;) { 302 for (;;) {
221 struct task_struct *owner; 303 struct task_struct *owner;
304 struct mspin_node node;
222 305
223 /* 306 /*
224 * If there's an owner, wait for it to either 307 * If there's an owner, wait for it to either
225 * release the lock or go to sleep. 308 * release the lock or go to sleep.
226 */ 309 */
310 mspin_lock(MLOCK(lock), &node);
227 owner = ACCESS_ONCE(lock->owner); 311 owner = ACCESS_ONCE(lock->owner);
228 if (owner && !mutex_spin_on_owner(lock, owner)) 312 if (owner && !mutex_spin_on_owner(lock, owner)) {
313 mspin_unlock(MLOCK(lock), &node);
229 break; 314 break;
315 }
230 316
231 if ((atomic_read(&lock->count) == 1) && 317 if ((atomic_read(&lock->count) == 1) &&
232 (atomic_cmpxchg(&lock->count, 1, 0) == 1)) { 318 (atomic_cmpxchg(&lock->count, 1, 0) == 1)) {
233 lock_acquired(&lock->dep_map, ip); 319 lock_acquired(&lock->dep_map, ip);
234 mutex_set_owner(lock); 320 mutex_set_owner(lock);
321 mspin_unlock(MLOCK(lock), &node);
235 preempt_enable(); 322 preempt_enable();
236 return 0; 323 return 0;
237 } 324 }
325 mspin_unlock(MLOCK(lock), &node);
238 326
239 /* 327 /*
240 * When there's no owner, we might have preempted between the 328 * When there's no owner, we might have preempted between the
@@ -253,6 +341,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
253 */ 341 */
254 arch_mutex_cpu_relax(); 342 arch_mutex_cpu_relax();
255 } 343 }
344slowpath:
256#endif 345#endif
257 spin_lock_mutex(&lock->wait_lock, flags); 346 spin_lock_mutex(&lock->wait_lock, flags);
258 347