Diffstat (limited to 'kernel/locking/qspinlock.c')
-rw-r--r--   kernel/locking/qspinlock.c | 473
1 files changed, 473 insertions, 0 deletions
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
new file mode 100644
index 000000000000..38c49202d532
--- /dev/null
+++ b/kernel/locking/qspinlock.c
@@ -0,0 +1,473 @@
/*
 * Queued spinlock
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
 * (C) Copyright 2013-2014 Red Hat, Inc.
 * (C) Copyright 2015 Intel Corp.
 *
 * Authors: Waiman Long <waiman.long@hp.com>
 *          Peter Zijlstra <peterz@infradead.org>
 */

#ifndef _GEN_PV_LOCK_SLOWPATH

#include <linux/smp.h>
#include <linux/bug.h>
#include <linux/cpumask.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/mutex.h>
#include <asm/byteorder.h>
#include <asm/qspinlock.h>

/*
 * The basic principle of a queue-based spinlock can best be understood
 * by studying a classic queue-based spinlock implementation called the
 * MCS lock. The paper below provides a good description for this kind
 * of lock.
 *
 * http://www.cise.ufl.edu/tr/DOC/REP-1992-71.pdf
 *
 * This queued spinlock implementation is based on the MCS lock, however to make
 * it fit the 4 bytes we assume spinlock_t to be, and preserve its existing
 * API, we must modify it somehow.
 *
 * In particular; where the traditional MCS lock consists of a tail pointer
 * (8 bytes) and needs the next pointer (another 8 bytes) of its own node to
 * unlock the next pending (next->locked), we compress both these: {tail,
 * next->locked} into a single u32 value.
 *
 * A spinlock cannot be taken recursively within its own context, and there
 * is a limit to the contexts that can nest: task, softirq, hardirq and nmi.
 * As there are at most 4 nesting levels, the nesting level can be encoded
 * in a 2-bit number. We can then encode the tail by combining this 2-bit
 * nesting level with the cpu number. With one byte for the lock value and
 * 3 bytes for the tail, only a 32-bit word is now needed. Even though we
 * only need 1 bit for the lock, we extend it to a full byte to achieve
 * better performance for architectures that support atomic byte writes.
 *
 * We also change the first spinner to spin on the lock bit instead of its
 * node; whereby avoiding the need to carry a node from lock to unlock, and
 * preserving existing lock API. This also makes the unlock code simpler and
 * faster.
 *
 * N.B. The current implementation only supports architectures that allow
 * atomic operations on smaller 8-bit and 16-bit data types.
 *
 */

#include "mcs_spinlock.h"

#ifdef CONFIG_PARAVIRT_SPINLOCKS
#define MAX_NODES	8
#else
#define MAX_NODES	4
#endif

/*
 * Per-CPU queue node structures; we can never have more than 4 nested
 * contexts: task, softirq, hardirq, nmi.
 *
 * Exactly fits one 64-byte cacheline on a 64-bit architecture.
 *
 * PV doubles the storage and uses the second cacheline for PV state.
 */
static DEFINE_PER_CPU_ALIGNED(struct mcs_spinlock, mcs_nodes[MAX_NODES]);

/*
 * We must be able to distinguish between no-tail and the tail at 0:0,
 * therefore increment the cpu number by one.
 */

static inline u32 encode_tail(int cpu, int idx)
{
	u32 tail;

#ifdef CONFIG_DEBUG_SPINLOCK
	BUG_ON(idx > 3);
#endif
	tail  = (cpu + 1) << _Q_TAIL_CPU_OFFSET;
	tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */

	return tail;
}

static inline struct mcs_spinlock *decode_tail(u32 tail)
{
	int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;
	int idx = (tail &  _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;

	return per_cpu_ptr(&mcs_nodes[idx], cpu);
}
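
/*
 * Illustrative example (assuming the usual constants from
 * asm-generic/qspinlock_types.h: with _Q_PENDING_BITS == 8 we have
 * _Q_TAIL_IDX_OFFSET == 16 and _Q_TAIL_CPU_OFFSET == 18):
 *
 *	encode_tail(cpu = 2, idx = 1) == ((2 + 1) << 18) | (1 << 16)
 *				      == 0x000d0000
 *
 * decode_tail(0x000d0000) then returns per_cpu_ptr(&mcs_nodes[1], 2),
 * i.e. nesting level 1 of CPU 2.
 */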
112 | |||
113 | #define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK) | ||
114 | |||
115 | /* | ||
116 | * By using the whole 2nd least significant byte for the pending bit, we | ||
117 | * can allow better optimization of the lock acquisition for the pending | ||
118 | * bit holder. | ||
119 | * | ||
120 | * This internal structure is also used by the set_locked function which | ||
121 | * is not restricted to _Q_PENDING_BITS == 8. | ||
122 | */ | ||
123 | struct __qspinlock { | ||
124 | union { | ||
125 | atomic_t val; | ||
126 | #ifdef __LITTLE_ENDIAN | ||
127 | struct { | ||
128 | u8 locked; | ||
129 | u8 pending; | ||
130 | }; | ||
131 | struct { | ||
132 | u16 locked_pending; | ||
133 | u16 tail; | ||
134 | }; | ||
135 | #else | ||
136 | struct { | ||
137 | u16 tail; | ||
138 | u16 locked_pending; | ||
139 | }; | ||
140 | struct { | ||
141 | u8 reserved[2]; | ||
142 | u8 pending; | ||
143 | u8 locked; | ||
144 | }; | ||
145 | #endif | ||
146 | }; | ||
147 | }; | ||
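
/*
 * For reference, with _Q_PENDING_BITS == 8 the 32-bit lock word breaks
 * down as follows (the layout defined in asm-generic/qspinlock_types.h):
 *
 *	bits  0- 7: locked byte
 *	bits  8-15: pending byte
 *	bits 16-17: tail index (nesting level of the MCS node)
 *	bits 18-31: tail CPU number (+1)
 */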
148 | |||
149 | #if _Q_PENDING_BITS == 8 | ||
150 | /** | ||
151 | * clear_pending_set_locked - take ownership and clear the pending bit. | ||
152 | * @lock: Pointer to queued spinlock structure | ||
153 | * | ||
154 | * *,1,0 -> *,0,1 | ||
155 | * | ||
156 | * Lock stealing is not allowed if this function is used. | ||
157 | */ | ||
158 | static __always_inline void clear_pending_set_locked(struct qspinlock *lock) | ||
159 | { | ||
160 | struct __qspinlock *l = (void *)lock; | ||
161 | |||
162 | WRITE_ONCE(l->locked_pending, _Q_LOCKED_VAL); | ||
163 | } | ||
164 | |||
165 | /* | ||
166 | * xchg_tail - Put in the new queue tail code word & retrieve previous one | ||
167 | * @lock : Pointer to queued spinlock structure | ||
168 | * @tail : The new queue tail code word | ||
169 | * Return: The previous queue tail code word | ||
170 | * | ||
171 | * xchg(lock, tail) | ||
172 | * | ||
173 | * p,*,* -> n,*,* ; prev = xchg(lock, node) | ||
174 | */ | ||
175 | static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) | ||
176 | { | ||
177 | struct __qspinlock *l = (void *)lock; | ||
178 | |||
179 | return (u32)xchg(&l->tail, tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET; | ||
180 | } | ||
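
/*
 * Note that the 16-bit xchg() on l->tail above suffices only because, with
 * _Q_PENDING_BITS == 8, the tail occupies exactly the upper half-word of
 * the lock word; swapping it can therefore never disturb the locked and
 * pending bytes.
 */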
181 | |||
182 | #else /* _Q_PENDING_BITS == 8 */ | ||
183 | |||
184 | /** | ||
185 | * clear_pending_set_locked - take ownership and clear the pending bit. | ||
186 | * @lock: Pointer to queued spinlock structure | ||
187 | * | ||
188 | * *,1,0 -> *,0,1 | ||
189 | */ | ||
190 | static __always_inline void clear_pending_set_locked(struct qspinlock *lock) | ||
191 | { | ||
192 | atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val); | ||
193 | } | ||
194 | |||
195 | /** | ||
196 | * xchg_tail - Put in the new queue tail code word & retrieve previous one | ||
197 | * @lock : Pointer to queued spinlock structure | ||
198 | * @tail : The new queue tail code word | ||
199 | * Return: The previous queue tail code word | ||
200 | * | ||
201 | * xchg(lock, tail) | ||
202 | * | ||
203 | * p,*,* -> n,*,* ; prev = xchg(lock, node) | ||
204 | */ | ||
205 | static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) | ||
206 | { | ||
207 | u32 old, new, val = atomic_read(&lock->val); | ||
208 | |||
209 | for (;;) { | ||
210 | new = (val & _Q_LOCKED_PENDING_MASK) | tail; | ||
211 | old = atomic_cmpxchg(&lock->val, val, new); | ||
212 | if (old == val) | ||
213 | break; | ||
214 | |||
215 | val = old; | ||
216 | } | ||
217 | return old; | ||
218 | } | ||
219 | #endif /* _Q_PENDING_BITS == 8 */ | ||
220 | |||
221 | /** | ||
222 | * set_locked - Set the lock bit and own the lock | ||
223 | * @lock: Pointer to queued spinlock structure | ||
224 | * | ||
225 | * *,*,0 -> *,0,1 | ||
226 | */ | ||
227 | static __always_inline void set_locked(struct qspinlock *lock) | ||
228 | { | ||
229 | struct __qspinlock *l = (void *)lock; | ||
230 | |||
231 | WRITE_ONCE(l->locked, _Q_LOCKED_VAL); | ||
232 | } | ||
233 | |||
234 | |||
235 | /* | ||
236 | * Generate the native code for queued_spin_unlock_slowpath(); provide NOPs for | ||
237 | * all the PV callbacks. | ||
238 | */ | ||
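
/*
 * Note: when CONFIG_PARAVIRT_SPINLOCKS is set, this file is effectively
 * compiled twice. The first pass (this one) emits the native slowpath
 * using the no-op PV hooks below; the #include "qspinlock.c" at the bottom
 * of the file then re-enters it with _GEN_PV_LOCK_SLOWPATH defined and the
 * pv_*() hooks redefined, generating __pv_queued_spin_lock_slowpath().
 */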
239 | |||
240 | static __always_inline void __pv_init_node(struct mcs_spinlock *node) { } | ||
241 | static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { } | ||
242 | static __always_inline void __pv_kick_node(struct mcs_spinlock *node) { } | ||
243 | |||
244 | static __always_inline void __pv_wait_head(struct qspinlock *lock, | ||
245 | struct mcs_spinlock *node) { } | ||
246 | |||
247 | #define pv_enabled() false | ||
248 | |||
249 | #define pv_init_node __pv_init_node | ||
250 | #define pv_wait_node __pv_wait_node | ||
251 | #define pv_kick_node __pv_kick_node | ||
252 | #define pv_wait_head __pv_wait_head | ||
253 | |||
254 | #ifdef CONFIG_PARAVIRT_SPINLOCKS | ||
255 | #define queued_spin_lock_slowpath native_queued_spin_lock_slowpath | ||
256 | #endif | ||
257 | |||
258 | #endif /* _GEN_PV_LOCK_SLOWPATH */ | ||
259 | |||
/**
 * queued_spin_lock_slowpath - acquire the queued spinlock
 * @lock: Pointer to queued spinlock structure
 * @val: Current value of the queued spinlock 32-bit word
 *
 * (queue tail, pending bit, lock value)
 *
 *              fast     :    slow                                  :    unlock
 *                       :                                          :
 * uncontended  (0,0,0) -:--> (0,0,1) ------------------------------:--> (*,*,0)
 *                       :       | ^--------.------.             /  :
 *                       :       v           \      \            |  :
 * pending               :    (0,1,1) +--> (0,1,0)   \            |  :
 *                       :       | ^--'              |            |  :
 *                       :       v                   |            |  :
 * uncontended           :    (n,x,y) +--> (n,0,0) --'             |  :
 *   queue               :       | ^--'                            |  :
 *                       :       v                                 |  :
 * contended             :    (*,x,y) +--> (*,0,0) ---> (*,0,1) -'  :
 *   queue               :         ^--'                             :
 */
void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
{
	struct mcs_spinlock *prev, *next, *node;
	u32 new, old, tail;
	int idx;

	BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));

	if (pv_enabled())
		goto queue;

	if (virt_queued_spin_lock(lock))
		return;

	/*
	 * wait for in-progress pending->locked hand-overs
	 *
	 * 0,1,0 -> 0,0,1
	 */
	if (val == _Q_PENDING_VAL) {
		while ((val = atomic_read(&lock->val)) == _Q_PENDING_VAL)
			cpu_relax();
	}

	/*
	 * trylock || pending
	 *
	 * 0,0,0 -> 0,0,1 ; trylock
	 * 0,0,1 -> 0,1,1 ; pending
	 */
	for (;;) {
		/*
		 * If we observe any contention; queue.
		 */
		if (val & ~_Q_LOCKED_MASK)
			goto queue;

		new = _Q_LOCKED_VAL;
		if (val == new)
			new |= _Q_PENDING_VAL;

		old = atomic_cmpxchg(&lock->val, val, new);
		if (old == val)
			break;

		val = old;
	}

	/*
	 * we won the trylock
	 */
	if (new == _Q_LOCKED_VAL)
		return;

	/*
	 * we're pending, wait for the owner to go away.
	 *
	 * *,1,1 -> *,1,0
	 *
	 * this wait loop must be a load-acquire such that we match the
	 * store-release that clears the locked bit and create lock
	 * sequentiality; this is because not all clear_pending_set_locked()
	 * implementations imply full barriers.
	 */
	while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_MASK)
		cpu_relax();

	/*
	 * take ownership and clear the pending bit.
	 *
	 * *,1,0 -> *,0,1
	 */
	clear_pending_set_locked(lock);
	return;

	/*
	 * End of pending bit optimistic spinning and beginning of MCS
	 * queuing.
	 */
queue:
	node = this_cpu_ptr(&mcs_nodes[0]);
	idx = node->count++;
	tail = encode_tail(smp_processor_id(), idx);

	node += idx;
	node->locked = 0;
	node->next = NULL;
	pv_init_node(node);

	/*
	 * We touched a (possibly) cold cacheline in the per-cpu queue node;
	 * attempt the trylock once more in the hope someone let go while we
	 * weren't watching.
	 */
	if (queued_spin_trylock(lock))
		goto release;

	/*
	 * We have already touched the queueing cacheline; don't bother with
	 * pending stuff.
	 *
	 * p,*,* -> n,*,*
	 */
	old = xchg_tail(lock, tail);

	/*
	 * if there was a previous node; link it and wait until reaching the
	 * head of the waitqueue.
	 */
	if (old & _Q_TAIL_MASK) {
		prev = decode_tail(old);
		WRITE_ONCE(prev->next, node);

		pv_wait_node(node);
		arch_mcs_spin_lock_contended(&node->locked);
	}

	/*
	 * we're at the head of the waitqueue, wait for the owner & pending to
	 * go away.
	 *
	 * *,x,y -> *,0,0
	 *
	 * this wait loop must use a load-acquire such that we match the
	 * store-release that clears the locked bit and create lock
	 * sequentiality; this is because the set_locked() function below
	 * does not imply a full barrier.
	 *
	 */
	pv_wait_head(lock, node);
	while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_PENDING_MASK)
		cpu_relax();

	/*
	 * claim the lock:
	 *
	 * n,0,0 -> 0,0,1 : lock, uncontended
	 * *,0,0 -> *,0,1 : lock, contended
	 *
	 * If the queue head is the only one in the queue (lock value == tail),
	 * clear the tail code and grab the lock. Otherwise, we only need
	 * to grab the lock.
	 */
	for (;;) {
		if (val != tail) {
			set_locked(lock);
			break;
		}
		old = atomic_cmpxchg(&lock->val, val, _Q_LOCKED_VAL);
		if (old == val)
			goto release;	/* No contention */

		val = old;
	}

	/*
	 * contended path; wait for next, release.
	 */
	while (!(next = READ_ONCE(node->next)))
		cpu_relax();

	arch_mcs_spin_unlock_contended(&next->locked);
	pv_kick_node(next);

release:
	/*
	 * release the node
	 */
	this_cpu_dec(mcs_nodes[0].count);
}
EXPORT_SYMBOL(queued_spin_lock_slowpath);

/*
 * Generate the paravirt code for queued_spin_lock_slowpath().
 */
#if !defined(_GEN_PV_LOCK_SLOWPATH) && defined(CONFIG_PARAVIRT_SPINLOCKS)
#define _GEN_PV_LOCK_SLOWPATH

#undef  pv_enabled
#define pv_enabled()	true

#undef pv_init_node
#undef pv_wait_node
#undef pv_kick_node
#undef pv_wait_head

#undef  queued_spin_lock_slowpath
#define queued_spin_lock_slowpath	__pv_queued_spin_lock_slowpath

#include "qspinlock_paravirt.h"
#include "qspinlock.c"

#endif