-rw-r--r--  Documentation/mutex-design.txt        | 252
-rw-r--r--  arch/x86/Kconfig                      |   1
-rw-r--r--  arch/x86/include/asm/qrwlock.h        |  17
-rw-r--r--  arch/x86/include/asm/spinlock.h       |   4
-rw-r--r--  arch/x86/include/asm/spinlock_types.h |   4
-rw-r--r--  include/asm-generic/qrwlock.h         | 166
-rw-r--r--  include/asm-generic/qrwlock_types.h   |  21
-rw-r--r--  include/linux/rwsem.h                 |  25
-rw-r--r--  kernel/Kconfig.locks                  |   7
-rw-r--r--  kernel/locking/Makefile               |   1
-rw-r--r--  kernel/locking/qrwlock.c              | 133
-rw-r--r--  kernel/locking/rwsem-xadd.c           | 225
-rw-r--r--  kernel/locking/rwsem.c                |  31
13 files changed, 737 insertions(+), 150 deletions(-)
diff --git a/Documentation/mutex-design.txt b/Documentation/mutex-design.txt
index 1dfe62c3641d..ee231ed09ec6 100644
--- a/Documentation/mutex-design.txt
+++ b/Documentation/mutex-design.txt
@@ -1,139 +1,157 @@ | |||
1 | Generic Mutex Subsystem | 1 | Generic Mutex Subsystem |
2 | 2 | ||
3 | started by Ingo Molnar <mingo@redhat.com> | 3 | started by Ingo Molnar <mingo@redhat.com> |
4 | updated by Davidlohr Bueso <davidlohr@hp.com> | ||
4 | 5 | ||
5 | "Why on earth do we need a new mutex subsystem, and what's wrong | 6 | What are mutexes? |
6 | with semaphores?" | 7 | ----------------- |
7 | 8 | ||
8 | firstly, there's nothing wrong with semaphores. But if the simpler | 9 | In the Linux kernel, mutexes refer to a particular locking primitive |
9 | mutex semantics are sufficient for your code, then there are a couple | 10 | that enforces serialization on shared memory systems, and not only to |
10 | of advantages of mutexes: | 11 | the generic term referring to 'mutual exclusion' found in academia |
12 | or similar theoretical text books. Mutexes are sleeping locks which | ||
13 | behave similarly to binary semaphores, and were introduced in 2006[1] | ||
14 | as an alternative to these. This new data structure provided a number | ||
15 | of advantages, including simpler interfaces, and at that time smaller | ||
16 | code (see Disadvantages). | ||
11 | 17 | ||
12 | - 'struct mutex' is smaller on most architectures: E.g. on x86, | 18 | [1] http://lwn.net/Articles/164802/ |
13 | 'struct semaphore' is 20 bytes, 'struct mutex' is 16 bytes. | ||
14 | A smaller structure size means less RAM footprint, and better | ||
15 | CPU-cache utilization. | ||
16 | 19 | ||
17 | - tighter code. On x86 i get the following .text sizes when | 20 | Implementation |
18 | switching all mutex-alike semaphores in the kernel to the mutex | 21 | -------------- |
19 | subsystem: | ||
20 | 22 | ||
21 | text data bss dec hex filename | 23 | Mutexes are represented by 'struct mutex', defined in include/linux/mutex.h |
22 | 3280380 868188 396860 4545428 455b94 vmlinux-semaphore | 24 | and implemented in kernel/locking/mutex.c. These locks use a three |
23 | 3255329 865296 396732 4517357 44eded vmlinux-mutex | 25 | state atomic counter (->count) to represent the different possible |
26 | transitions that can occur during the lifetime of a lock: | ||
24 | 27 | ||
25 | that's 25051 bytes of code saved, or a 0.76% win - off the hottest | 28 | 1: unlocked |
26 | codepaths of the kernel. (The .data savings are 2892 bytes, or 0.33%) | 29 | 0: locked, no waiters |
27 | Smaller code means better icache footprint, which is one of the | 30 | negative: locked, with potential waiters |
28 | major optimization goals in the Linux kernel currently. | ||
29 | 31 | ||
30 | - the mutex subsystem is slightly faster and has better scalability for | 32 | In its most basic form it also includes a wait-queue and a spinlock |
31 | contended workloads. On an 8-way x86 system, running a mutex-based | 33 | that serializes access to it. CONFIG_SMP systems can also include |
32 | kernel and testing creat+unlink+close (of separate, per-task files) | 34 | a pointer to the lock task owner (->owner) as well as a spinner MCS |
33 | in /tmp with 16 parallel tasks, the average number of ops/sec is: | 35 | lock (->osq), both described below in (ii). |
34 | 36 | ||
35 | Semaphores: Mutexes: | 37 | When acquiring a mutex, there are three possible paths that can be |
38 | taken, depending on the state of the lock: | ||
36 | 39 | ||
37 | $ ./test-mutex V 16 10 $ ./test-mutex V 16 10 | 40 | (i) fastpath: tries to atomically acquire the lock by decrementing the |
38 | 8 CPUs, running 16 tasks. 8 CPUs, running 16 tasks. | 41 | counter. If it was already taken by another task it goes to the next |
39 | checking VFS performance. checking VFS performance. | 42 | possible path. This logic is architecture specific. On x86-64, the |
40 | avg loops/sec: 34713 avg loops/sec: 84153 | 43 | locking fastpath is 2 instructions: |
41 | CPU utilization: 63% CPU utilization: 22% | ||
42 | 44 | ||
43 | i.e. in this workload, the mutex based kernel was 2.4 times faster | 45 | 0000000000000e10 <mutex_lock>: |
44 | than the semaphore based kernel, _and_ it also had 2.8 times less CPU | 46 | e21: f0 ff 0b lock decl (%rbx) |
45 | utilization. (In terms of 'ops per CPU cycle', the semaphore kernel | 47 | e24: 79 08 jns e2e <mutex_lock+0x1e> |
46 | performed 551 ops/sec per 1% of CPU time used, while the mutex kernel | ||
47 | performed 3825 ops/sec per 1% of CPU time used - it was 6.9 times | ||
48 | more efficient.) | ||
49 | |||
50 | the scalability difference is visible even on a 2-way P4 HT box: | ||
51 | |||
52 | Semaphores: Mutexes: | ||
53 | |||
54 | $ ./test-mutex V 16 10 $ ./test-mutex V 16 10 | ||
55 | 4 CPUs, running 16 tasks. 8 CPUs, running 16 tasks. | ||
56 | checking VFS performance. checking VFS performance. | ||
57 | avg loops/sec: 127659 avg loops/sec: 181082 | ||
58 | CPU utilization: 100% CPU utilization: 34% | ||
59 | |||
60 | (the straight performance advantage of mutexes is 41%, the per-cycle | ||
61 | efficiency of mutexes is 4.1 times better.) | ||
62 | |||
63 | - there are no fastpath tradeoffs, the mutex fastpath is just as tight | ||
64 | as the semaphore fastpath. On x86, the locking fastpath is 2 | ||
65 | instructions: | ||
66 | |||
67 | c0377ccb <mutex_lock>: | ||
68 | c0377ccb: f0 ff 08 lock decl (%eax) | ||
69 | c0377cce: 78 0e js c0377cde <.text..lock.mutex> | ||
70 | c0377cd0: c3 ret | ||
71 | 48 | ||
72 | the unlocking fastpath is equally tight: | 49 | the unlocking fastpath is equally tight: |
73 | 50 | ||
74 | c0377cd1 <mutex_unlock>: | 51 | 0000000000000bc0 <mutex_unlock>: |
75 | c0377cd1: f0 ff 00 lock incl (%eax) | 52 | bc8: f0 ff 07 lock incl (%rdi) |
76 | c0377cd4: 7e 0f jle c0377ce5 <.text..lock.mutex+0x7> | 53 | bcb: 7f 0a jg bd7 <mutex_unlock+0x17> |
77 | c0377cd6: c3 ret | 54 | |
78 | 55 | ||
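To make the fastpath above concrete, here is a minimal C sketch of what the "lock decl + jns" / "lock incl + jg" pairs do to the three-state counter. The toy_mutex_* names and the use of C11 atomics are purely illustrative; the real fastpath is emitted per architecture and falls back to the mid/slowpath helpers in kernel/locking/mutex.c on contention.

    #include <stdatomic.h>

    /* Illustrative stand-in for the arch-specific fastpath; not kernel code. */
    struct toy_mutex {
            atomic_int count;       /* 1: unlocked, 0: locked, <0: locked, waiters */
    };

    static void toy_mutex_lock(struct toy_mutex *m,
                               void (*slowpath)(struct toy_mutex *))
    {
            /* "lock decl + jns": acquired only if the decremented count is >= 0. */
            if (atomic_fetch_sub(&m->count, 1) - 1 < 0)
                    slowpath(m);            /* contended: spin and/or sleep */
    }

    static void toy_mutex_unlock(struct toy_mutex *m,
                                 void (*slowpath)(struct toy_mutex *))
    {
            /* "lock incl + jg": a non-positive result means waiters may exist. */
            if (atomic_fetch_add(&m->count, 1) + 1 <= 0)
                    slowpath(m);            /* wake up a queued waiter */
    }
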
79 | - 'struct mutex' semantics are well-defined and are enforced if | 56 | (ii) midpath: aka optimistic spinning, tries to spin for acquisition |
80 | CONFIG_DEBUG_MUTEXES is turned on. Semaphores on the other hand have | 57 | while the lock owner is running and there are no other tasks ready |
81 | virtually no debugging code or instrumentation. The mutex subsystem | 58 | to run that have higher priority (need_resched). The rationale is |
82 | checks and enforces the following rules: | 59 | that if the lock owner is running, it is likely to release the lock |
83 | 60 | soon. The mutex spinners are queued up using an MCS lock so that only |
84 | * - only one task can hold the mutex at a time | 61 | one spinner can compete for the mutex. |
85 | * - only the owner can unlock the mutex | 62 | |
86 | * - multiple unlocks are not permitted | 63 | The MCS lock (proposed by Mellor-Crummey and Scott) is a simple spinlock |
87 | * - recursive locking is not permitted | 64 | with the desirable properties of being fair and with each cpu trying |
88 | * - a mutex object must be initialized via the API | 65 | to acquire the lock spinning on a local variable. It avoids expensive |
89 | * - a mutex object must not be initialized via memset or copying | 66 | cacheline bouncing that common test-and-set spinlock implementations |
90 | * - task may not exit with mutex held | 67 | incur. An MCS-like lock is specially tailored for optimistic spinning |
91 | * - memory areas where held locks reside must not be freed | 68 | for sleeping-lock implementations. An important feature of the customized |
92 | * - held mutexes must not be reinitialized | 69 | MCS lock is that it has the extra property that spinners are able to exit |
93 | * - mutexes may not be used in hardware or software interrupt | 70 | the MCS spinlock queue when they need to reschedule. This further helps |
94 | * contexts such as tasklets and timers | 71 | avoid situations where MCS spinners that need to reschedule would continue |
95 | 72 | waiting to spin on the mutex owner, only to go directly to the slowpath upon |
96 | furthermore, there are also convenience features in the debugging | 73 | obtaining the MCS lock. |
97 | code: | 74 | |
98 | 75 | ||
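For readers unfamiliar with MCS locks, the sketch below shows the core idea in portable C11 atomics: every waiter enqueues a node of its own and spins only on that node's flag, so the contended cache line is not bounced between CPUs. This is the plain MCS algorithm with invented names, not the cancellable variant used here, which additionally lets a spinner unlink itself from the queue when it needs to reschedule.

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stddef.h>

    struct mcs_node {
            _Atomic(struct mcs_node *) next;
            atomic_bool locked;
    };

    struct mcs_lock {
            _Atomic(struct mcs_node *) tail;
    };

    static void mcs_lock_acquire(struct mcs_lock *lock, struct mcs_node *node)
    {
            struct mcs_node *prev;

            atomic_store(&node->next, NULL);
            atomic_store(&node->locked, false);

            /* Add ourselves to the tail of the waiter queue. */
            prev = atomic_exchange(&lock->tail, node);
            if (!prev)
                    return;                 /* queue was empty: lock acquired */

            /* Link behind the previous waiter, then spin on our own flag only. */
            atomic_store(&prev->next, node);
            while (!atomic_load(&node->locked))
                    ;                       /* local spinning, no cacheline bouncing */
    }

    static void mcs_lock_release(struct mcs_lock *lock, struct mcs_node *node)
    {
            struct mcs_node *next = atomic_load(&node->next);

            if (!next) {
                    /* No successor visible: try to mark the queue empty again. */
                    struct mcs_node *expected = node;

                    if (atomic_compare_exchange_strong(&lock->tail, &expected, NULL))
                            return;
                    /* A new waiter raced with us; wait for it to link itself. */
                    while (!(next = atomic_load(&node->next)))
                            ;
            }
            atomic_store(&next->locked, true);      /* pass the lock to the successor */
    }
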
99 | * - uses symbolic names of mutexes, whenever they are printed in debug output | 76 | (iii) slowpath: last resort, if the lock is still unable to be acquired, |
100 | * - point-of-acquire tracking, symbolic lookup of function names | 77 | the task is added to the wait-queue and sleeps until woken up by the |
101 | * - list of all locks held in the system, printout of them | 78 | unlock path. Under normal circumstances it blocks as TASK_UNINTERRUPTIBLE. |
102 | * - owner tracking | 79 | |
103 | * - detects self-recursing locks and prints out all relevant info | 80 | While formally kernel mutexes are sleepable locks, it is path (ii) that |
104 | * - detects multi-task circular deadlocks and prints out all affected | 81 | makes them more practically a hybrid type. By simply not interrupting a |
105 | * locks and tasks (and only those tasks) | 82 | task and busy-waiting for a few cycles instead of immediately sleeping, |
83 | this lock has been seen to significantly improve the performance of a | ||
84 | number of workloads. Note that this technique is also used for rw-semaphores. | ||
85 | |||
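Putting the three paths together, the acquisition logic has roughly the following shape. Every helper below is a named placeholder for a mechanism described above (the fastpath decrement, the ->owner check, need_resched() and the wait-queue), not actual kernel API; the sketch only illustrates the spin-then-sleep hybrid behaviour.

    #include <stdbool.h>

    /* Placeholder stubs standing in for the mechanisms described above. */
    static bool fastpath_acquire(void)        { return false; } /* assume contended */
    static bool owner_is_on_cpu(void)         { return false; }
    static bool should_reschedule(void)       { return false; }
    static bool trylock_while_spinning(void)  { return false; }
    static void sleep_on_wait_queue(void)     { }

    static void toy_mutex_acquire(void)
    {
            if (fastpath_acquire())                 /* (i) uncontended fastpath */
                    return;

            /* (ii) midpath: busy-wait while the owner is running on a CPU. */
            while (owner_is_on_cpu() && !should_reschedule()) {
                    if (trylock_while_spinning())   /* owner released the lock */
                            return;
            }

            /* (iii) slowpath: queue up and sleep until the unlock path wakes us. */
            sleep_on_wait_queue();
    }
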
86 | Semantics | ||
87 | --------- | ||
88 | |||
89 | The mutex subsystem checks and enforces the following rules: | ||
90 | |||
91 | - Only one task can hold the mutex at a time. | ||
92 | - Only the owner can unlock the mutex. | ||
93 | - Multiple unlocks are not permitted. | ||
94 | - Recursive locking/unlocking is not permitted. | ||
95 | - A mutex must only be initialized via the API (see below). | ||
96 | - A task may not exit with a mutex held. | ||
97 | - Memory areas where held locks reside must not be freed. | ||
98 | - Held mutexes must not be reinitialized. | ||
99 | - Mutexes may not be used in hardware or software interrupt | ||
100 | contexts such as tasklets and timers. | ||
101 | |||
102 | These semantics are fully enforced when CONFIG_DEBUG_MUTEXES is enabled. | ||
103 | In addition, the mutex debugging code also implements a number of other | ||
104 | features that make lock debugging easier and faster: | ||
105 | |||
106 | - Uses symbolic names of mutexes, whenever they are printed | ||
107 | in debug output. | ||
108 | - Point-of-acquire tracking, symbolic lookup of function names, | ||
109 | list of all locks held in the system, printout of them. | ||
110 | - Owner tracking. | ||
111 | - Detects self-recursing locks and prints out all relevant info. | ||
112 | - Detects multi-task circular deadlocks and prints out all affected | ||
113 | locks and tasks (and only those tasks). | ||
114 | |||
115 | |||
116 | Interfaces | ||
117 | ---------- | ||
118 | Statically define the mutex: | ||
119 | DEFINE_MUTEX(name); | ||
120 | |||
121 | Dynamically initialize the mutex: | ||
122 | mutex_init(mutex); | ||
123 | |||
124 | Acquire the mutex, uninterruptible: | ||
125 | void mutex_lock(struct mutex *lock); | ||
126 | void mutex_lock_nested(struct mutex *lock, unsigned int subclass); | ||
127 | int mutex_trylock(struct mutex *lock); | ||
128 | |||
129 | Acquire the mutex, interruptible: | ||
130 | int mutex_lock_interruptible_nested(struct mutex *lock, | ||
131 | unsigned int subclass); | ||
132 | int mutex_lock_interruptible(struct mutex *lock); | ||
133 | |||
134 | Acquire the mutex, interruptible, if the atomic counter decrements to 0: | ||
135 | int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock); | ||
136 | |||
137 | Unlock the mutex: | ||
138 | void mutex_unlock(struct mutex *lock); | ||
139 | |||
140 | Test if the mutex is taken: | ||
141 | int mutex_is_locked(struct mutex *lock); | ||
106 | 142 | ||
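A short usage illustration of the interfaces listed above; the lock, list and function names are invented for the example and are not part of any kernel subsystem:

    #include <linux/mutex.h>
    #include <linux/list.h>

    static DEFINE_MUTEX(example_mutex);     /* statically defined mutex */
    static LIST_HEAD(example_list);         /* the data it protects */

    static int example_add(struct list_head *node)
    {
            int ret;

            /* Interruptible acquisition: returns -EINTR if a signal arrives. */
            ret = mutex_lock_interruptible(&example_mutex);
            if (ret)
                    return ret;

            list_add_tail(node, &example_list);
            mutex_unlock(&example_mutex);
            return 0;
    }

    static bool example_try_add(struct list_head *node)
    {
            /* Trylock variant: never sleeps. */
            if (!mutex_trylock(&example_mutex))
                    return false;

            list_add_tail(node, &example_list);
            mutex_unlock(&example_mutex);
            return true;
    }
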
107 | Disadvantages | 143 | Disadvantages |
108 | ------------- | 144 | ------------- |
109 | 145 | ||
110 | The stricter mutex API means you cannot use mutexes the same way you | 146 | Unlike its original design and purpose, 'struct mutex' is larger than |
111 | can use semaphores: e.g. they cannot be used from an interrupt context, | 147 | most locks in the kernel. E.g., on x86-64 it is 40 bytes, almost twice |
112 | nor can they be unlocked from a different context that which acquired | 148 | as large as 'struct semaphore' (24 bytes) and 8 bytes shy of the |
113 | it. [ I'm not aware of any other (e.g. performance) disadvantages from | 149 | 'struct rw_semaphore' variant. Larger structure sizes mean more CPU |
114 | using mutexes at the moment, please let me know if you find any. ] | 150 | cache and memory footprint. |
115 | |||
116 | Implementation of mutexes | ||
117 | ------------------------- | ||
118 | |||
119 | 'struct mutex' is the new mutex type, defined in include/linux/mutex.h and | ||
120 | implemented in kernel/locking/mutex.c. It is a counter-based mutex with a | ||
121 | spinlock and a wait-list. The counter has 3 states: 1 for "unlocked", 0 for | ||
122 | "locked" and negative numbers (usually -1) for "locked, potential waiters | ||
123 | queued". | ||
124 | |||
125 | the APIs of 'struct mutex' have been streamlined: | ||
126 | |||
127 | DEFINE_MUTEX(name); | ||
128 | 151 | ||
129 | mutex_init(mutex); | 152 | When to use mutexes |
153 | ------------------- | ||
130 | 154 | ||
131 | void mutex_lock(struct mutex *lock); | 155 | Unless the strict semantics of mutexes are unsuitable and/or the critical |
132 | int mutex_lock_interruptible(struct mutex *lock); | 156 | region prevents the lock from being shared, always prefer them to any other |
133 | int mutex_trylock(struct mutex *lock); | 157 | locking primitive. |
134 | void mutex_unlock(struct mutex *lock); | ||
135 | int mutex_is_locked(struct mutex *lock); | ||
136 | void mutex_lock_nested(struct mutex *lock, unsigned int subclass); | ||
137 | int mutex_lock_interruptible_nested(struct mutex *lock, | ||
138 | unsigned int subclass); | ||
139 | int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock); | ||
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b660088c220d..fcefdda5136d 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -121,6 +121,7 @@ config X86 | |||
121 | select MODULES_USE_ELF_RELA if X86_64 | 121 | select MODULES_USE_ELF_RELA if X86_64 |
122 | select CLONE_BACKWARDS if X86_32 | 122 | select CLONE_BACKWARDS if X86_32 |
123 | select ARCH_USE_BUILTIN_BSWAP | 123 | select ARCH_USE_BUILTIN_BSWAP |
124 | select ARCH_USE_QUEUE_RWLOCK | ||
124 | select OLD_SIGSUSPEND3 if X86_32 || IA32_EMULATION | 125 | select OLD_SIGSUSPEND3 if X86_32 || IA32_EMULATION |
125 | select OLD_SIGACTION if X86_32 | 126 | select OLD_SIGACTION if X86_32 |
126 | select COMPAT_OLD_SIGACTION if IA32_EMULATION | 127 | select COMPAT_OLD_SIGACTION if IA32_EMULATION |
diff --git a/arch/x86/include/asm/qrwlock.h b/arch/x86/include/asm/qrwlock.h
new file mode 100644
index 000000000000..70f46f07f94e
--- /dev/null
+++ b/arch/x86/include/asm/qrwlock.h
@@ -0,0 +1,17 @@ | |||
1 | #ifndef _ASM_X86_QRWLOCK_H | ||
2 | #define _ASM_X86_QRWLOCK_H | ||
3 | |||
4 | #include <asm-generic/qrwlock_types.h> | ||
5 | |||
6 | #if !defined(CONFIG_X86_OOSTORE) && !defined(CONFIG_X86_PPRO_FENCE) | ||
7 | #define queue_write_unlock queue_write_unlock | ||
8 | static inline void queue_write_unlock(struct qrwlock *lock) | ||
9 | { | ||
10 | barrier(); | ||
11 | ACCESS_ONCE(*(u8 *)&lock->cnts) = 0; | ||
12 | } | ||
13 | #endif | ||
14 | |||
15 | #include <asm-generic/qrwlock.h> | ||
16 | |||
17 | #endif /* _ASM_X86_QRWLOCK_H */ | ||
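The byte store above is an x86-specific shortcut for the generic queue_write_unlock() further below: the writer state occupies the least significant 8 bits of ->cnts (_QW_WMASK == 0xff), so on a little-endian, store-ordered architecture clearing just that byte releases the write lock without a locked read-modify-write. A stand-alone sketch of the layout this relies on (illustrative values, little-endian assumed):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            /* Low byte: writer state (0xff == write-locked); upper bits: reader count. */
            uint32_t cnts = (2u << 8) | 0xff;       /* writer holds the lock, some reader bias */

            *(uint8_t *)&cnts = 0;                  /* byte-store unlock, as on x86 (little endian) */

            printf("readers=%u writer=0x%02x\n", cnts >> 8, cnts & 0xffu);
            return 0;
    }
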
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 0f62f5482d91..54f1c8068c02 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -187,6 +187,7 @@ static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) | |||
187 | cpu_relax(); | 187 | cpu_relax(); |
188 | } | 188 | } |
189 | 189 | ||
190 | #ifndef CONFIG_QUEUE_RWLOCK | ||
190 | /* | 191 | /* |
191 | * Read-write spinlocks, allowing multiple readers | 192 | * Read-write spinlocks, allowing multiple readers |
192 | * but only one writer. | 193 | * but only one writer. |
@@ -269,6 +270,9 @@ static inline void arch_write_unlock(arch_rwlock_t *rw) | |||
269 | asm volatile(LOCK_PREFIX WRITE_LOCK_ADD(%1) "%0" | 270 | asm volatile(LOCK_PREFIX WRITE_LOCK_ADD(%1) "%0" |
270 | : "+m" (rw->write) : "i" (RW_LOCK_BIAS) : "memory"); | 271 | : "+m" (rw->write) : "i" (RW_LOCK_BIAS) : "memory"); |
271 | } | 272 | } |
273 | #else | ||
274 | #include <asm/qrwlock.h> | ||
275 | #endif /* CONFIG_QUEUE_RWLOCK */ | ||
272 | 276 | ||
273 | #define arch_read_lock_flags(lock, flags) arch_read_lock(lock) | 277 | #define arch_read_lock_flags(lock, flags) arch_read_lock(lock) |
274 | #define arch_write_lock_flags(lock, flags) arch_write_lock(lock) | 278 | #define arch_write_lock_flags(lock, flags) arch_write_lock(lock) |
diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
index 4f1bea19945b..73c4c007200f 100644
--- a/arch/x86/include/asm/spinlock_types.h
+++ b/arch/x86/include/asm/spinlock_types.h
@@ -34,6 +34,10 @@ typedef struct arch_spinlock { | |||
34 | 34 | ||
35 | #define __ARCH_SPIN_LOCK_UNLOCKED { { 0 } } | 35 | #define __ARCH_SPIN_LOCK_UNLOCKED { { 0 } } |
36 | 36 | ||
37 | #ifdef CONFIG_QUEUE_RWLOCK | ||
38 | #include <asm-generic/qrwlock_types.h> | ||
39 | #else | ||
37 | #include <asm/rwlock.h> | 40 | #include <asm/rwlock.h> |
41 | #endif | ||
38 | 42 | ||
39 | #endif /* _ASM_X86_SPINLOCK_TYPES_H */ | 43 | #endif /* _ASM_X86_SPINLOCK_TYPES_H */ |
diff --git a/include/asm-generic/qrwlock.h b/include/asm-generic/qrwlock.h
new file mode 100644
index 000000000000..6383d54bf983
--- /dev/null
+++ b/include/asm-generic/qrwlock.h
@@ -0,0 +1,166 @@ | |||
1 | /* | ||
2 | * Queue read/write lock | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * (C) Copyright 2013-2014 Hewlett-Packard Development Company, L.P. | ||
15 | * | ||
16 | * Authors: Waiman Long <waiman.long@hp.com> | ||
17 | */ | ||
18 | #ifndef __ASM_GENERIC_QRWLOCK_H | ||
19 | #define __ASM_GENERIC_QRWLOCK_H | ||
20 | |||
21 | #include <linux/atomic.h> | ||
22 | #include <asm/barrier.h> | ||
23 | #include <asm/processor.h> | ||
24 | |||
25 | #include <asm-generic/qrwlock_types.h> | ||
26 | |||
27 | /* | ||
28 | * Writer states & reader shift and bias | ||
29 | */ | ||
30 | #define _QW_WAITING 1 /* A writer is waiting */ | ||
31 | #define _QW_LOCKED 0xff /* A writer holds the lock */ | ||
32 | #define _QW_WMASK 0xff /* Writer mask */ | ||
33 | #define _QR_SHIFT 8 /* Reader count shift */ | ||
34 | #define _QR_BIAS (1U << _QR_SHIFT) | ||
35 | |||
36 | /* | ||
37 | * External function declarations | ||
38 | */ | ||
39 | extern void queue_read_lock_slowpath(struct qrwlock *lock); | ||
40 | extern void queue_write_lock_slowpath(struct qrwlock *lock); | ||
41 | |||
42 | /** | ||
43 | * queue_read_can_lock- would read_trylock() succeed? | ||
44 | * @lock: Pointer to queue rwlock structure | ||
45 | */ | ||
46 | static inline int queue_read_can_lock(struct qrwlock *lock) | ||
47 | { | ||
48 | return !(atomic_read(&lock->cnts) & _QW_WMASK); | ||
49 | } | ||
50 | |||
51 | /** | ||
52 | * queue_write_can_lock- would write_trylock() succeed? | ||
53 | * @lock: Pointer to queue rwlock structure | ||
54 | */ | ||
55 | static inline int queue_write_can_lock(struct qrwlock *lock) | ||
56 | { | ||
57 | return !atomic_read(&lock->cnts); | ||
58 | } | ||
59 | |||
60 | /** | ||
61 | * queue_read_trylock - try to acquire read lock of a queue rwlock | ||
62 | * @lock : Pointer to queue rwlock structure | ||
63 | * Return: 1 if lock acquired, 0 if failed | ||
64 | */ | ||
65 | static inline int queue_read_trylock(struct qrwlock *lock) | ||
66 | { | ||
67 | u32 cnts; | ||
68 | |||
69 | cnts = atomic_read(&lock->cnts); | ||
70 | if (likely(!(cnts & _QW_WMASK))) { | ||
71 | cnts = (u32)atomic_add_return(_QR_BIAS, &lock->cnts); | ||
72 | if (likely(!(cnts & _QW_WMASK))) | ||
73 | return 1; | ||
74 | atomic_sub(_QR_BIAS, &lock->cnts); | ||
75 | } | ||
76 | return 0; | ||
77 | } | ||
78 | |||
79 | /** | ||
80 | * queue_write_trylock - try to acquire write lock of a queue rwlock | ||
81 | * @lock : Pointer to queue rwlock structure | ||
82 | * Return: 1 if lock acquired, 0 if failed | ||
83 | */ | ||
84 | static inline int queue_write_trylock(struct qrwlock *lock) | ||
85 | { | ||
86 | u32 cnts; | ||
87 | |||
88 | cnts = atomic_read(&lock->cnts); | ||
89 | if (unlikely(cnts)) | ||
90 | return 0; | ||
91 | |||
92 | return likely(atomic_cmpxchg(&lock->cnts, | ||
93 | cnts, cnts | _QW_LOCKED) == cnts); | ||
94 | } | ||
95 | /** | ||
96 | * queue_read_lock - acquire read lock of a queue rwlock | ||
97 | * @lock: Pointer to queue rwlock structure | ||
98 | */ | ||
99 | static inline void queue_read_lock(struct qrwlock *lock) | ||
100 | { | ||
101 | u32 cnts; | ||
102 | |||
103 | cnts = atomic_add_return(_QR_BIAS, &lock->cnts); | ||
104 | if (likely(!(cnts & _QW_WMASK))) | ||
105 | return; | ||
106 | |||
107 | /* The slowpath will decrement the reader count, if necessary. */ | ||
108 | queue_read_lock_slowpath(lock); | ||
109 | } | ||
110 | |||
111 | /** | ||
112 | * queue_write_lock - acquire write lock of a queue rwlock | ||
113 | * @lock : Pointer to queue rwlock structure | ||
114 | */ | ||
115 | static inline void queue_write_lock(struct qrwlock *lock) | ||
116 | { | ||
117 | /* Optimize for the unfair lock case where the fair flag is 0. */ | ||
118 | if (atomic_cmpxchg(&lock->cnts, 0, _QW_LOCKED) == 0) | ||
119 | return; | ||
120 | |||
121 | queue_write_lock_slowpath(lock); | ||
122 | } | ||
123 | |||
124 | /** | ||
125 | * queue_read_unlock - release read lock of a queue rwlock | ||
126 | * @lock : Pointer to queue rwlock structure | ||
127 | */ | ||
128 | static inline void queue_read_unlock(struct qrwlock *lock) | ||
129 | { | ||
130 | /* | ||
131 | * Atomically decrement the reader count | ||
132 | */ | ||
133 | smp_mb__before_atomic(); | ||
134 | atomic_sub(_QR_BIAS, &lock->cnts); | ||
135 | } | ||
136 | |||
137 | #ifndef queue_write_unlock | ||
138 | /** | ||
139 | * queue_write_unlock - release write lock of a queue rwlock | ||
140 | * @lock : Pointer to queue rwlock structure | ||
141 | */ | ||
142 | static inline void queue_write_unlock(struct qrwlock *lock) | ||
143 | { | ||
144 | /* | ||
145 | * If the writer field is atomic, it can be cleared directly. | ||
146 | * Otherwise, an atomic subtraction will be used to clear it. | ||
147 | */ | ||
148 | smp_mb__before_atomic(); | ||
149 | atomic_sub(_QW_LOCKED, &lock->cnts); | ||
150 | } | ||
151 | #endif | ||
152 | |||
153 | /* | ||
154 | * Remapping rwlock architecture specific functions to the corresponding | ||
155 | * queue rwlock functions. | ||
156 | */ | ||
157 | #define arch_read_can_lock(l) queue_read_can_lock(l) | ||
158 | #define arch_write_can_lock(l) queue_write_can_lock(l) | ||
159 | #define arch_read_lock(l) queue_read_lock(l) | ||
160 | #define arch_write_lock(l) queue_write_lock(l) | ||
161 | #define arch_read_trylock(l) queue_read_trylock(l) | ||
162 | #define arch_write_trylock(l) queue_write_trylock(l) | ||
163 | #define arch_read_unlock(l) queue_read_unlock(l) | ||
164 | #define arch_write_unlock(l) queue_write_unlock(l) | ||
165 | |||
166 | #endif /* __ASM_GENERIC_QRWLOCK_H */ | ||
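These arch_* mappings are what the regular kernel rwlock API resolves to on architectures that select QUEUE_RWLOCK, so existing callers need no changes. A minimal usage sketch with invented names:

    #include <linux/spinlock.h>

    static DEFINE_RWLOCK(example_rwlock);   /* becomes a queue rwlock with CONFIG_QUEUE_RWLOCK */
    static int example_state;

    static int example_read(void)
    {
            int v;

            read_lock(&example_rwlock);     /* many readers may hold the lock at once */
            v = example_state;
            read_unlock(&example_rwlock);
            return v;
    }

    static void example_write(int v)
    {
            write_lock(&example_rwlock);    /* exclusive; contended waiters are queued */
            example_state = v;
            write_unlock(&example_rwlock);
    }
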
diff --git a/include/asm-generic/qrwlock_types.h b/include/asm-generic/qrwlock_types.h
new file mode 100644
index 000000000000..4d76f24df518
--- /dev/null
+++ b/include/asm-generic/qrwlock_types.h
@@ -0,0 +1,21 @@ | |||
1 | #ifndef __ASM_GENERIC_QRWLOCK_TYPES_H | ||
2 | #define __ASM_GENERIC_QRWLOCK_TYPES_H | ||
3 | |||
4 | #include <linux/types.h> | ||
5 | #include <asm/spinlock_types.h> | ||
6 | |||
7 | /* | ||
8 | * The queue read/write lock data structure | ||
9 | */ | ||
10 | |||
11 | typedef struct qrwlock { | ||
12 | atomic_t cnts; | ||
13 | arch_spinlock_t lock; | ||
14 | } arch_rwlock_t; | ||
15 | |||
16 | #define __ARCH_RW_LOCK_UNLOCKED { \ | ||
17 | .cnts = ATOMIC_INIT(0), \ | ||
18 | .lock = __ARCH_SPIN_LOCK_UNLOCKED, \ | ||
19 | } | ||
20 | |||
21 | #endif /* __ASM_GENERIC_QRWLOCK_TYPES_H */ | ||
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 03f3b05e8ec1..8d79708146aa 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -16,6 +16,7 @@ | |||
16 | 16 | ||
17 | #include <linux/atomic.h> | 17 | #include <linux/atomic.h> |
18 | 18 | ||
19 | struct optimistic_spin_queue; | ||
19 | struct rw_semaphore; | 20 | struct rw_semaphore; |
20 | 21 | ||
21 | #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK | 22 | #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK |
@@ -23,9 +24,17 @@ struct rw_semaphore; | |||
23 | #else | 24 | #else |
24 | /* All arch specific implementations share the same struct */ | 25 | /* All arch specific implementations share the same struct */ |
25 | struct rw_semaphore { | 26 | struct rw_semaphore { |
26 | long count; | 27 | long count; |
27 | raw_spinlock_t wait_lock; | 28 | raw_spinlock_t wait_lock; |
28 | struct list_head wait_list; | 29 | struct list_head wait_list; |
30 | #ifdef CONFIG_SMP | ||
31 | /* | ||
32 | * Write owner. Used as a speculative check to see | ||
33 | * if the owner is running on the cpu. | ||
34 | */ | ||
35 | struct task_struct *owner; | ||
36 | struct optimistic_spin_queue *osq; /* spinner MCS lock */ | ||
37 | #endif | ||
29 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 38 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
30 | struct lockdep_map dep_map; | 39 | struct lockdep_map dep_map; |
31 | #endif | 40 | #endif |
@@ -55,11 +64,21 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem) | |||
55 | # define __RWSEM_DEP_MAP_INIT(lockname) | 64 | # define __RWSEM_DEP_MAP_INIT(lockname) |
56 | #endif | 65 | #endif |
57 | 66 | ||
67 | #if defined(CONFIG_SMP) && !defined(CONFIG_RWSEM_GENERIC_SPINLOCK) | ||
68 | #define __RWSEM_INITIALIZER(name) \ | ||
69 | { RWSEM_UNLOCKED_VALUE, \ | ||
70 | __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \ | ||
71 | LIST_HEAD_INIT((name).wait_list), \ | ||
72 | NULL, /* owner */ \ | ||
73 | NULL /* mcs lock */ \ | ||
74 | __RWSEM_DEP_MAP_INIT(name) } | ||
75 | #else | ||
58 | #define __RWSEM_INITIALIZER(name) \ | 76 | #define __RWSEM_INITIALIZER(name) \ |
59 | { RWSEM_UNLOCKED_VALUE, \ | 77 | { RWSEM_UNLOCKED_VALUE, \ |
60 | __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \ | 78 | __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \ |
61 | LIST_HEAD_INIT((name).wait_list) \ | 79 | LIST_HEAD_INIT((name).wait_list) \ |
62 | __RWSEM_DEP_MAP_INIT(name) } | 80 | __RWSEM_DEP_MAP_INIT(name) } |
81 | #endif | ||
63 | 82 | ||
64 | #define DECLARE_RWSEM(name) \ | 83 | #define DECLARE_RWSEM(name) \ |
65 | struct rw_semaphore name = __RWSEM_INITIALIZER(name) | 84 | struct rw_semaphore name = __RWSEM_INITIALIZER(name) |
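For reference, typical rw_semaphore call sites look like the sketch below (names invented for the example). With this patch, the write-side acquisition additionally records current in ->owner on SMP so that would-be writers can spin on it, and the release clears it again.

    #include <linux/rwsem.h>

    static DECLARE_RWSEM(example_rwsem);
    static int example_config;

    static int example_get_config(void)
    {
            int v;

            down_read(&example_rwsem);      /* shared: may sleep, multiple readers allowed */
            v = example_config;
            up_read(&example_rwsem);
            return v;
    }

    static void example_set_config(int v)
    {
            down_write(&example_rwsem);     /* exclusive: records ->owner for spinners (SMP) */
            example_config = v;
            up_write(&example_rwsem);       /* clears ->owner, then releases the lock */
    }
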
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index d2b32ac27a39..35536d9c0964 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -223,3 +223,10 @@ endif | |||
223 | config MUTEX_SPIN_ON_OWNER | 223 | config MUTEX_SPIN_ON_OWNER |
224 | def_bool y | 224 | def_bool y |
225 | depends on SMP && !DEBUG_MUTEXES | 225 | depends on SMP && !DEBUG_MUTEXES |
226 | |||
227 | config ARCH_USE_QUEUE_RWLOCK | ||
228 | bool | ||
229 | |||
230 | config QUEUE_RWLOCK | ||
231 | def_bool y if ARCH_USE_QUEUE_RWLOCK | ||
232 | depends on SMP | ||
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index b8bdcd4785b7..8541bfdfd232 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -24,4 +24,5 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o | |||
24 | obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o | 24 | obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o |
25 | obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o | 25 | obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o |
26 | obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o | 26 | obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o |
27 | obj-$(CONFIG_QUEUE_RWLOCK) += qrwlock.o | ||
27 | obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o | 28 | obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o |
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
new file mode 100644
index 000000000000..fb5b8ac411a5
--- /dev/null
+++ b/kernel/locking/qrwlock.c
@@ -0,0 +1,133 @@ | |||
1 | /* | ||
2 | * Queue read/write lock | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * (C) Copyright 2013-2014 Hewlett-Packard Development Company, L.P. | ||
15 | * | ||
16 | * Authors: Waiman Long <waiman.long@hp.com> | ||
17 | */ | ||
18 | #include <linux/smp.h> | ||
19 | #include <linux/bug.h> | ||
20 | #include <linux/cpumask.h> | ||
21 | #include <linux/percpu.h> | ||
22 | #include <linux/hardirq.h> | ||
23 | #include <linux/mutex.h> | ||
24 | #include <asm/qrwlock.h> | ||
25 | |||
26 | /** | ||
27 | * rspin_until_writer_unlock - inc reader count & spin until writer is gone | ||
28 | * @lock : Pointer to queue rwlock structure | ||
29 | * @writer: Current queue rwlock writer status byte | ||
30 | * | ||
31 | * In interrupt context or at the head of the queue, the reader will just | ||
32 | * increment the reader count & wait until the writer releases the lock. | ||
33 | */ | ||
34 | static __always_inline void | ||
35 | rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts) | ||
36 | { | ||
37 | while ((cnts & _QW_WMASK) == _QW_LOCKED) { | ||
38 | arch_mutex_cpu_relax(); | ||
39 | cnts = smp_load_acquire((u32 *)&lock->cnts); | ||
40 | } | ||
41 | } | ||
42 | |||
43 | /** | ||
44 | * queue_read_lock_slowpath - acquire read lock of a queue rwlock | ||
45 | * @lock: Pointer to queue rwlock structure | ||
46 | */ | ||
47 | void queue_read_lock_slowpath(struct qrwlock *lock) | ||
48 | { | ||
49 | u32 cnts; | ||
50 | |||
51 | /* | ||
52 | * Readers come here when they cannot get the lock without waiting | ||
53 | */ | ||
54 | if (unlikely(in_interrupt())) { | ||
55 | /* | ||
56 | * Readers in interrupt context will spin until the lock is | ||
57 | * available without waiting in the queue. | ||
58 | */ | ||
59 | cnts = smp_load_acquire((u32 *)&lock->cnts); | ||
60 | rspin_until_writer_unlock(lock, cnts); | ||
61 | return; | ||
62 | } | ||
63 | atomic_sub(_QR_BIAS, &lock->cnts); | ||
64 | |||
65 | /* | ||
66 | * Put the reader into the wait queue | ||
67 | */ | ||
68 | arch_spin_lock(&lock->lock); | ||
69 | |||
70 | /* | ||
71 | * At the head of the wait queue now, wait until the writer state | ||
72 | * goes to 0 and then try to increment the reader count and get | ||
73 | * the lock. It is possible that an incoming writer may steal the | ||
74 | * lock in the interim, so it is necessary to check the writer byte | ||
75 | * to make sure that the write lock isn't taken. | ||
76 | */ | ||
77 | while (atomic_read(&lock->cnts) & _QW_WMASK) | ||
78 | arch_mutex_cpu_relax(); | ||
79 | |||
80 | cnts = atomic_add_return(_QR_BIAS, &lock->cnts) - _QR_BIAS; | ||
81 | rspin_until_writer_unlock(lock, cnts); | ||
82 | |||
83 | /* | ||
84 | * Signal the next one in queue to become queue head | ||
85 | */ | ||
86 | arch_spin_unlock(&lock->lock); | ||
87 | } | ||
88 | EXPORT_SYMBOL(queue_read_lock_slowpath); | ||
89 | |||
90 | /** | ||
91 | * queue_write_lock_slowpath - acquire write lock of a queue rwlock | ||
92 | * @lock : Pointer to queue rwlock structure | ||
93 | */ | ||
94 | void queue_write_lock_slowpath(struct qrwlock *lock) | ||
95 | { | ||
96 | u32 cnts; | ||
97 | |||
98 | /* Put the writer into the wait queue */ | ||
99 | arch_spin_lock(&lock->lock); | ||
100 | |||
101 | /* Try to acquire the lock directly if no reader is present */ | ||
102 | if (!atomic_read(&lock->cnts) && | ||
103 | (atomic_cmpxchg(&lock->cnts, 0, _QW_LOCKED) == 0)) | ||
104 | goto unlock; | ||
105 | |||
106 | /* | ||
107 | * Set the waiting flag to notify readers that a writer is pending, | ||
108 | * or wait for a previous writer to go away. | ||
109 | */ | ||
110 | for (;;) { | ||
111 | cnts = atomic_read(&lock->cnts); | ||
112 | if (!(cnts & _QW_WMASK) && | ||
113 | (atomic_cmpxchg(&lock->cnts, cnts, | ||
114 | cnts | _QW_WAITING) == cnts)) | ||
115 | break; | ||
116 | |||
117 | arch_mutex_cpu_relax(); | ||
118 | } | ||
119 | |||
120 | /* When no more readers, set the locked flag */ | ||
121 | for (;;) { | ||
122 | cnts = atomic_read(&lock->cnts); | ||
123 | if ((cnts == _QW_WAITING) && | ||
124 | (atomic_cmpxchg(&lock->cnts, _QW_WAITING, | ||
125 | _QW_LOCKED) == _QW_WAITING)) | ||
126 | break; | ||
127 | |||
128 | arch_mutex_cpu_relax(); | ||
129 | } | ||
130 | unlock: | ||
131 | arch_spin_unlock(&lock->lock); | ||
132 | } | ||
133 | EXPORT_SYMBOL(queue_write_lock_slowpath); | ||
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index b4219ff87b8c..dacc32142fcc 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -5,11 +5,17 @@ | |||
5 | * | 5 | * |
6 | * Writer lock-stealing by Alex Shi <alex.shi@intel.com> | 6 | * Writer lock-stealing by Alex Shi <alex.shi@intel.com> |
7 | * and Michel Lespinasse <walken@google.com> | 7 | * and Michel Lespinasse <walken@google.com> |
8 | * | ||
9 | * Optimistic spinning by Tim Chen <tim.c.chen@intel.com> | ||
10 | * and Davidlohr Bueso <davidlohr@hp.com>. Based on mutexes. | ||
8 | */ | 11 | */ |
9 | #include <linux/rwsem.h> | 12 | #include <linux/rwsem.h> |
10 | #include <linux/sched.h> | 13 | #include <linux/sched.h> |
11 | #include <linux/init.h> | 14 | #include <linux/init.h> |
12 | #include <linux/export.h> | 15 | #include <linux/export.h> |
16 | #include <linux/sched/rt.h> | ||
17 | |||
18 | #include "mcs_spinlock.h" | ||
13 | 19 | ||
14 | /* | 20 | /* |
15 | * Guide to the rw_semaphore's count field for common values. | 21 | * Guide to the rw_semaphore's count field for common values. |
@@ -76,6 +82,10 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, | |||
76 | sem->count = RWSEM_UNLOCKED_VALUE; | 82 | sem->count = RWSEM_UNLOCKED_VALUE; |
77 | raw_spin_lock_init(&sem->wait_lock); | 83 | raw_spin_lock_init(&sem->wait_lock); |
78 | INIT_LIST_HEAD(&sem->wait_list); | 84 | INIT_LIST_HEAD(&sem->wait_list); |
85 | #ifdef CONFIG_SMP | ||
86 | sem->owner = NULL; | ||
87 | sem->osq = NULL; | ||
88 | #endif | ||
79 | } | 89 | } |
80 | 90 | ||
81 | EXPORT_SYMBOL(__init_rwsem); | 91 | EXPORT_SYMBOL(__init_rwsem); |
@@ -190,7 +200,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) | |||
190 | } | 200 | } |
191 | 201 | ||
192 | /* | 202 | /* |
193 | * wait for the read lock to be granted | 203 | * Wait for the read lock to be granted |
194 | */ | 204 | */ |
195 | __visible | 205 | __visible |
196 | struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) | 206 | struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) |
@@ -237,64 +247,221 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) | |||
237 | return sem; | 247 | return sem; |
238 | } | 248 | } |
239 | 249 | ||
250 | static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) | ||
251 | { | ||
252 | if (!(count & RWSEM_ACTIVE_MASK)) { | ||
253 | /* try acquiring the write lock */ | ||
254 | if (sem->count == RWSEM_WAITING_BIAS && | ||
255 | cmpxchg(&sem->count, RWSEM_WAITING_BIAS, | ||
256 | RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) { | ||
257 | if (!list_is_singular(&sem->wait_list)) | ||
258 | rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); | ||
259 | return true; | ||
260 | } | ||
261 | } | ||
262 | return false; | ||
263 | } | ||
264 | |||
265 | #ifdef CONFIG_SMP | ||
240 | /* | 266 | /* |
241 | * wait until we successfully acquire the write lock | 267 | * Try to acquire write lock before the writer has been put on wait queue. |
268 | */ | ||
269 | static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) | ||
270 | { | ||
271 | long old, count = ACCESS_ONCE(sem->count); | ||
272 | |||
273 | while (true) { | ||
274 | if (!(count == 0 || count == RWSEM_WAITING_BIAS)) | ||
275 | return false; | ||
276 | |||
277 | old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS); | ||
278 | if (old == count) | ||
279 | return true; | ||
280 | |||
281 | count = old; | ||
282 | } | ||
283 | } | ||
284 | |||
285 | static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) | ||
286 | { | ||
287 | struct task_struct *owner; | ||
288 | bool on_cpu = true; | ||
289 | |||
290 | if (need_resched()) | ||
291 | return 0; | ||
292 | |||
293 | rcu_read_lock(); | ||
294 | owner = ACCESS_ONCE(sem->owner); | ||
295 | if (owner) | ||
296 | on_cpu = owner->on_cpu; | ||
297 | rcu_read_unlock(); | ||
298 | |||
299 | /* | ||
300 | * If sem->owner is not set, the rwsem owner may have | ||
301 | * just acquired it and not set the owner yet or the rwsem | ||
302 | * has been released. | ||
303 | */ | ||
304 | return on_cpu; | ||
305 | } | ||
306 | |||
307 | static inline bool owner_running(struct rw_semaphore *sem, | ||
308 | struct task_struct *owner) | ||
309 | { | ||
310 | if (sem->owner != owner) | ||
311 | return false; | ||
312 | |||
313 | /* | ||
314 | * Ensure we emit the owner->on_cpu, dereference _after_ checking | ||
315 | * sem->owner still matches owner, if that fails, owner might | ||
316 | * point to free()d memory, if it still matches, the rcu_read_lock() | ||
317 | * ensures the memory stays valid. | ||
318 | */ | ||
319 | barrier(); | ||
320 | |||
321 | return owner->on_cpu; | ||
322 | } | ||
323 | |||
324 | static noinline | ||
325 | bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner) | ||
326 | { | ||
327 | rcu_read_lock(); | ||
328 | while (owner_running(sem, owner)) { | ||
329 | if (need_resched()) | ||
330 | break; | ||
331 | |||
332 | arch_mutex_cpu_relax(); | ||
333 | } | ||
334 | rcu_read_unlock(); | ||
335 | |||
336 | /* | ||
337 | * We break out the loop above on need_resched() or when the | ||
338 | * owner changed, which is a sign for heavy contention. Return | ||
339 | * success only when sem->owner is NULL. | ||
340 | */ | ||
341 | return sem->owner == NULL; | ||
342 | } | ||
343 | |||
344 | static bool rwsem_optimistic_spin(struct rw_semaphore *sem) | ||
345 | { | ||
346 | struct task_struct *owner; | ||
347 | bool taken = false; | ||
348 | |||
349 | preempt_disable(); | ||
350 | |||
351 | /* sem->wait_lock should not be held when doing optimistic spinning */ | ||
352 | if (!rwsem_can_spin_on_owner(sem)) | ||
353 | goto done; | ||
354 | |||
355 | if (!osq_lock(&sem->osq)) | ||
356 | goto done; | ||
357 | |||
358 | while (true) { | ||
359 | owner = ACCESS_ONCE(sem->owner); | ||
360 | if (owner && !rwsem_spin_on_owner(sem, owner)) | ||
361 | break; | ||
362 | |||
363 | /* wait_lock will be acquired if write_lock is obtained */ | ||
364 | if (rwsem_try_write_lock_unqueued(sem)) { | ||
365 | taken = true; | ||
366 | break; | ||
367 | } | ||
368 | |||
369 | /* | ||
370 | * When there's no owner, we might have preempted between the | ||
371 | * owner acquiring the lock and setting the owner field. If | ||
372 | * we're an RT task that will live-lock because we won't let | ||
373 | * the owner complete. | ||
374 | */ | ||
375 | if (!owner && (need_resched() || rt_task(current))) | ||
376 | break; | ||
377 | |||
378 | /* | ||
379 | * The cpu_relax() call is a compiler barrier which forces | ||
380 | * everything in this loop to be re-loaded. We don't need | ||
381 | * memory barriers as we'll eventually observe the right | ||
382 | * values at the cost of a few extra spins. | ||
383 | */ | ||
384 | arch_mutex_cpu_relax(); | ||
385 | } | ||
386 | osq_unlock(&sem->osq); | ||
387 | done: | ||
388 | preempt_enable(); | ||
389 | return taken; | ||
390 | } | ||
391 | |||
392 | #else | ||
393 | static bool rwsem_optimistic_spin(struct rw_semaphore *sem) | ||
394 | { | ||
395 | return false; | ||
396 | } | ||
397 | #endif | ||
398 | |||
399 | /* | ||
400 | * Wait until we successfully acquire the write lock | ||
242 | */ | 401 | */ |
243 | __visible | 402 | __visible |
244 | struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) | 403 | struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) |
245 | { | 404 | { |
246 | long count, adjustment = -RWSEM_ACTIVE_WRITE_BIAS; | 405 | long count; |
406 | bool waiting = true; /* any queued threads before us */ | ||
247 | struct rwsem_waiter waiter; | 407 | struct rwsem_waiter waiter; |
248 | struct task_struct *tsk = current; | ||
249 | 408 | ||
250 | /* set up my own style of waitqueue */ | 409 | /* undo write bias from down_write operation, stop active locking */ |
251 | waiter.task = tsk; | 410 | count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem); |
411 | |||
412 | /* do optimistic spinning and steal lock if possible */ | ||
413 | if (rwsem_optimistic_spin(sem)) | ||
414 | return sem; | ||
415 | |||
416 | /* | ||
417 | * Optimistic spinning failed, proceed to the slowpath | ||
418 | * and block until we can acquire the sem. | ||
419 | */ | ||
420 | waiter.task = current; | ||
252 | waiter.type = RWSEM_WAITING_FOR_WRITE; | 421 | waiter.type = RWSEM_WAITING_FOR_WRITE; |
253 | 422 | ||
254 | raw_spin_lock_irq(&sem->wait_lock); | 423 | raw_spin_lock_irq(&sem->wait_lock); |
424 | |||
425 | /* account for this before adding a new element to the list */ | ||
255 | if (list_empty(&sem->wait_list)) | 426 | if (list_empty(&sem->wait_list)) |
256 | adjustment += RWSEM_WAITING_BIAS; | 427 | waiting = false; |
428 | |||
257 | list_add_tail(&waiter.list, &sem->wait_list); | 429 | list_add_tail(&waiter.list, &sem->wait_list); |
258 | 430 | ||
259 | /* we're now waiting on the lock, but no longer actively locking */ | 431 | /* we're now waiting on the lock, but no longer actively locking */ |
260 | count = rwsem_atomic_update(adjustment, sem); | 432 | if (waiting) { |
433 | count = ACCESS_ONCE(sem->count); | ||
261 | 434 | ||
262 | /* If there were already threads queued before us and there are no | 435 | /* |
263 | * active writers, the lock must be read owned; so we try to wake | 436 | * If there were already threads queued before us and there are |
264 | * any read locks that were queued ahead of us. */ | 437 | * no active writers, the lock must be read owned; so we try to |
265 | if (count > RWSEM_WAITING_BIAS && | 438 | * wake any read locks that were queued ahead of us. |
266 | adjustment == -RWSEM_ACTIVE_WRITE_BIAS) | 439 | */ |
267 | sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS); | 440 | if (count > RWSEM_WAITING_BIAS) |
441 | sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS); | ||
442 | |||
443 | } else | ||
444 | count = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); | ||
268 | 445 | ||
269 | /* wait until we successfully acquire the lock */ | 446 | /* wait until we successfully acquire the lock */ |
270 | set_task_state(tsk, TASK_UNINTERRUPTIBLE); | 447 | set_current_state(TASK_UNINTERRUPTIBLE); |
271 | while (true) { | 448 | while (true) { |
272 | if (!(count & RWSEM_ACTIVE_MASK)) { | 449 | if (rwsem_try_write_lock(count, sem)) |
273 | /* Try acquiring the write lock. */ | 450 | break; |
274 | count = RWSEM_ACTIVE_WRITE_BIAS; | ||
275 | if (!list_is_singular(&sem->wait_list)) | ||
276 | count += RWSEM_WAITING_BIAS; | ||
277 | |||
278 | if (sem->count == RWSEM_WAITING_BIAS && | ||
279 | cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) == | ||
280 | RWSEM_WAITING_BIAS) | ||
281 | break; | ||
282 | } | ||
283 | |||
284 | raw_spin_unlock_irq(&sem->wait_lock); | 451 | raw_spin_unlock_irq(&sem->wait_lock); |
285 | 452 | ||
286 | /* Block until there are no active lockers. */ | 453 | /* Block until there are no active lockers. */ |
287 | do { | 454 | do { |
288 | schedule(); | 455 | schedule(); |
289 | set_task_state(tsk, TASK_UNINTERRUPTIBLE); | 456 | set_current_state(TASK_UNINTERRUPTIBLE); |
290 | } while ((count = sem->count) & RWSEM_ACTIVE_MASK); | 457 | } while ((count = sem->count) & RWSEM_ACTIVE_MASK); |
291 | 458 | ||
292 | raw_spin_lock_irq(&sem->wait_lock); | 459 | raw_spin_lock_irq(&sem->wait_lock); |
293 | } | 460 | } |
461 | __set_current_state(TASK_RUNNING); | ||
294 | 462 | ||
295 | list_del(&waiter.list); | 463 | list_del(&waiter.list); |
296 | raw_spin_unlock_irq(&sem->wait_lock); | 464 | raw_spin_unlock_irq(&sem->wait_lock); |
297 | tsk->state = TASK_RUNNING; | ||
298 | 465 | ||
299 | return sem; | 466 | return sem; |
300 | } | 467 | } |
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index cfff1435bdfb..42f806de49d4 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -12,6 +12,27 @@ | |||
12 | 12 | ||
13 | #include <linux/atomic.h> | 13 | #include <linux/atomic.h> |
14 | 14 | ||
15 | #if defined(CONFIG_SMP) && defined(CONFIG_RWSEM_XCHGADD_ALGORITHM) | ||
16 | static inline void rwsem_set_owner(struct rw_semaphore *sem) | ||
17 | { | ||
18 | sem->owner = current; | ||
19 | } | ||
20 | |||
21 | static inline void rwsem_clear_owner(struct rw_semaphore *sem) | ||
22 | { | ||
23 | sem->owner = NULL; | ||
24 | } | ||
25 | |||
26 | #else | ||
27 | static inline void rwsem_set_owner(struct rw_semaphore *sem) | ||
28 | { | ||
29 | } | ||
30 | |||
31 | static inline void rwsem_clear_owner(struct rw_semaphore *sem) | ||
32 | { | ||
33 | } | ||
34 | #endif | ||
35 | |||
15 | /* | 36 | /* |
16 | * lock for reading | 37 | * lock for reading |
17 | */ | 38 | */ |
@@ -48,6 +69,7 @@ void __sched down_write(struct rw_semaphore *sem) | |||
48 | rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); | 69 | rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); |
49 | 70 | ||
50 | LOCK_CONTENDED(sem, __down_write_trylock, __down_write); | 71 | LOCK_CONTENDED(sem, __down_write_trylock, __down_write); |
72 | rwsem_set_owner(sem); | ||
51 | } | 73 | } |
52 | 74 | ||
53 | EXPORT_SYMBOL(down_write); | 75 | EXPORT_SYMBOL(down_write); |
@@ -59,8 +81,11 @@ int down_write_trylock(struct rw_semaphore *sem) | |||
59 | { | 81 | { |
60 | int ret = __down_write_trylock(sem); | 82 | int ret = __down_write_trylock(sem); |
61 | 83 | ||
62 | if (ret == 1) | 84 | if (ret == 1) { |
63 | rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_); | 85 | rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_); |
86 | rwsem_set_owner(sem); | ||
87 | } | ||
88 | |||
64 | return ret; | 89 | return ret; |
65 | } | 90 | } |
66 | 91 | ||
@@ -85,6 +110,7 @@ void up_write(struct rw_semaphore *sem) | |||
85 | { | 110 | { |
86 | rwsem_release(&sem->dep_map, 1, _RET_IP_); | 111 | rwsem_release(&sem->dep_map, 1, _RET_IP_); |
87 | 112 | ||
113 | rwsem_clear_owner(sem); | ||
88 | __up_write(sem); | 114 | __up_write(sem); |
89 | } | 115 | } |
90 | 116 | ||
@@ -99,6 +125,7 @@ void downgrade_write(struct rw_semaphore *sem) | |||
99 | * lockdep: a downgraded write will live on as a write | 125 | * lockdep: a downgraded write will live on as a write |
100 | * dependency. | 126 | * dependency. |
101 | */ | 127 | */ |
128 | rwsem_clear_owner(sem); | ||
102 | __downgrade_write(sem); | 129 | __downgrade_write(sem); |
103 | } | 130 | } |
104 | 131 | ||
@@ -122,6 +149,7 @@ void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) | |||
122 | rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_); | 149 | rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_); |
123 | 150 | ||
124 | LOCK_CONTENDED(sem, __down_write_trylock, __down_write); | 151 | LOCK_CONTENDED(sem, __down_write_trylock, __down_write); |
152 | rwsem_set_owner(sem); | ||
125 | } | 153 | } |
126 | 154 | ||
127 | EXPORT_SYMBOL(_down_write_nest_lock); | 155 | EXPORT_SYMBOL(_down_write_nest_lock); |
@@ -141,6 +169,7 @@ void down_write_nested(struct rw_semaphore *sem, int subclass) | |||
141 | rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); | 169 | rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); |
142 | 170 | ||
143 | LOCK_CONTENDED(sem, __down_write_trylock, __down_write); | 171 | LOCK_CONTENDED(sem, __down_write_trylock, __down_write); |
172 | rwsem_set_owner(sem); | ||
144 | } | 173 | } |
145 | 174 | ||
146 | EXPORT_SYMBOL(down_write_nested); | 175 | EXPORT_SYMBOL(down_write_nested); |