-rw-r--r--Documentation/atomic_bitops.txt66
-rw-r--r--Documentation/atomic_t.txt200
-rw-r--r--Documentation/locking/crossrelease.txt874
-rw-r--r--Documentation/memory-barriers.txt101
-rw-r--r--Documentation/static-keys.txt20
-rw-r--r--Documentation/translations/ko_KR/memory-barriers.txt5
-rw-r--r--arch/Kconfig12
-rw-r--r--arch/arc/include/asm/atomic.h2
-rw-r--r--arch/arm64/include/asm/spinlock.h11
-rw-r--r--arch/hexagon/include/asm/atomic.h2
-rw-r--r--arch/metag/include/asm/atomic_lock1.h2
-rw-r--r--arch/parisc/include/asm/atomic.h2
-rw-r--r--arch/powerpc/include/asm/barrier.h7
-rw-r--r--arch/powerpc/include/asm/spinlock.h3
-rw-r--r--arch/sparc/include/asm/atomic_32.h2
-rw-r--r--arch/tile/include/asm/atomic_32.h2
-rw-r--r--arch/x86/Kconfig1
-rw-r--r--arch/x86/Kconfig.debug2
-rw-r--r--arch/x86/entry/Makefile1
-rw-r--r--arch/x86/entry/calling.h5
-rw-r--r--arch/x86/entry/entry_64.S170
-rw-r--r--arch/x86/include/asm/asm.h6
-rw-r--r--arch/x86/include/asm/atomic.h69
-rw-r--r--arch/x86/include/asm/atomic64_32.h81
-rw-r--r--arch/x86/include/asm/atomic64_64.h73
-rw-r--r--arch/x86/include/asm/cmpxchg.h2
-rw-r--r--arch/x86/include/asm/io.h98
-rw-r--r--arch/x86/include/asm/orc_types.h107
-rw-r--r--arch/x86/include/asm/processor.h3
-rw-r--r--arch/x86/include/asm/refcount.h109
-rw-r--r--arch/x86/include/asm/rmwcc.h37
-rw-r--r--arch/x86/include/asm/unwind_hints.h103
-rw-r--r--arch/x86/kernel/dumpstack.c12
-rw-r--r--arch/x86/kernel/dumpstack_32.c4
-rw-r--r--arch/x86/kernel/dumpstack_64.c4
-rw-r--r--arch/x86/kernel/process_64.c3
-rw-r--r--arch/x86/mm/extable.c42
-rw-r--r--drivers/clocksource/arm_arch_timer.c6
-rw-r--r--drivers/gpu/drm/i915/i915_debugfs.c5
-rw-r--r--fs/overlayfs/readdir.c4
-rw-r--r--fs/userfaultfd.c25
-rw-r--r--include/asm-generic/atomic64.h2
-rw-r--r--include/asm-generic/io.h27
-rw-r--r--include/linux/atomic.h3
-rw-r--r--include/linux/compiler-gcc.h13
-rw-r--r--include/linux/compiler.h3
-rw-r--r--include/linux/completion.h45
-rw-r--r--include/linux/cpuset.h6
-rw-r--r--include/linux/futex.h7
-rw-r--r--include/linux/irqflags.h24
-rw-r--r--include/linux/jump_label.h33
-rw-r--r--include/linux/kasan-checks.h10
-rw-r--r--include/linux/kernel.h7
-rw-r--r--include/linux/lockdep.h162
-rw-r--r--include/linux/mm_types.h29
-rw-r--r--include/linux/refcount.h4
-rw-r--r--include/linux/rwsem-spinlock.h1
-rw-r--r--include/linux/rwsem.h1
-rw-r--r--include/linux/sched.h12
-rw-r--r--include/linux/sched/mm.h8
-rw-r--r--include/linux/spinlock.h41
-rw-r--r--init/Kconfig7
-rw-r--r--kernel/cgroup/cpuset.c7
-rw-r--r--kernel/exit.c1
-rw-r--r--kernel/fork.c4
-rw-r--r--kernel/futex.c22
-rw-r--r--kernel/jump_label.c104
-rw-r--r--kernel/locking/lockdep.c995
-rw-r--r--kernel/locking/lockdep_internals.h2
-rw-r--r--kernel/locking/lockdep_proc.c4
-rw-r--r--kernel/locking/lockdep_states.h1
-rw-r--r--kernel/locking/osq_lock.c13
-rw-r--r--kernel/locking/rtmutex_common.h29
-rw-r--r--kernel/locking/rwsem-spinlock.c37
-rw-r--r--kernel/locking/rwsem-xadd.c33
-rw-r--r--kernel/panic.c12
-rw-r--r--kernel/sched/completion.c11
-rw-r--r--kernel/sched/core.c4
-rw-r--r--kernel/sched/swait.c6
-rw-r--r--kernel/workqueue.c13
-rw-r--r--lib/Kconfig.debug18
-rw-r--r--mm/huge_memory.c12
-rw-r--r--mm/kasan/kasan.c4
-rw-r--r--mm/page_alloc.c49
-rw-r--r--mm/slab.h6
-rw-r--r--mm/slob.c6
-rw-r--r--mm/vmscan.c13
-rw-r--r--net/ipv4/udp.c3
-rw-r--r--net/ipv6/udp.c3
-rw-r--r--scripts/Makefile.build3
-rw-r--r--tools/objtool/Build3
-rw-r--r--tools/objtool/Documentation/stack-validation.txt56
-rw-r--r--tools/objtool/Makefile3
-rw-r--r--tools/objtool/builtin-check.c7
-rw-r--r--tools/objtool/builtin-orc.c70
-rw-r--r--tools/objtool/builtin.h1
-rw-r--r--tools/objtool/check.c281
-rw-r--r--tools/objtool/check.h19
-rw-r--r--tools/objtool/elf.c212
-rw-r--r--tools/objtool/elf.h15
-rw-r--r--tools/objtool/objtool.c3
-rw-r--r--tools/objtool/orc.h30
-rw-r--r--tools/objtool/orc_dump.c212
-rw-r--r--tools/objtool/orc_gen.c214
-rw-r--r--tools/objtool/orc_types.h107
105 files changed, 4638 insertions, 738 deletions
diff --git a/Documentation/atomic_bitops.txt b/Documentation/atomic_bitops.txt
new file mode 100644
index 000000000000..5550bfdcce5f
--- /dev/null
+++ b/Documentation/atomic_bitops.txt
@@ -0,0 +1,66 @@
1
2On atomic bitops.
3
4
5While our bitmap_{}() functions are non-atomic, we have a number of operations
6operating on single bits in a bitmap that are atomic.
7
8
9API
10---
11
12The single bit operations are:
13
14Non-RMW ops:
15
16 test_bit()
17
18RMW atomic operations without return value:
19
20 {set,clear,change}_bit()
21 clear_bit_unlock()
22
23RMW atomic operations with return value:
24
25 test_and_{set,clear,change}_bit()
26 test_and_set_bit_lock()
27
28Barriers:
29
30 smp_mb__{before,after}_atomic()
31
32
33All RMW atomic operations have a '__' prefixed variant which is non-atomic.
34
35
36SEMANTICS
37---------
38
39Non-atomic ops:
40
41In particular __clear_bit_unlock() suffers the same issue as atomic_set(),
42which is why the generic version maps to clear_bit_unlock(), see atomic_t.txt.
43
44
45RMW ops:
46
47The test_and_{}_bit() operations return the original value of the bit.
48
49
50ORDERING
51--------
52
53Like with atomic_t, the rule of thumb is:
54
55 - non-RMW operations are unordered;
56
57 - RMW operations that have no return value are unordered;
58
59 - RMW operations that have a return value are fully ordered.
60
61Except for test_and_set_bit_lock() which has ACQUIRE semantics and
62clear_bit_unlock() which has RELEASE semantics.
63
64Since a platform only has a single means of achieving atomic operations
65the same barriers as for atomic_t are used, see atomic_t.txt.
66
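As a rough sketch of how the ACQUIRE/RELEASE pair above can be used, a
bit-based lock could be open coded as below. This is illustrative only:
the bit number and the 'my_flags' word are made up for the example, and
real code should use the kernel's ready-made bit spinlocks instead.

  #include <linux/bitops.h>
  #include <asm/processor.h>	/* cpu_relax() */

  #define MY_LOCK_BIT	0	/* hypothetical bit within 'my_flags' */

  static unsigned long my_flags;

  static void my_bit_lock(void)
  {
          /* ACQUIRE: accesses after this cannot move before it */
          while (test_and_set_bit_lock(MY_LOCK_BIT, &my_flags))
                  cpu_relax();
  }

  static void my_bit_unlock(void)
  {
          /* RELEASE: accesses before this cannot move after it */
          clear_bit_unlock(MY_LOCK_BIT, &my_flags);
  }
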
diff --git a/Documentation/atomic_t.txt b/Documentation/atomic_t.txt
new file mode 100644
index 000000000000..eee127115277
--- /dev/null
+++ b/Documentation/atomic_t.txt
@@ -0,0 +1,200 @@
1
2On atomic types (atomic_t atomic64_t and atomic_long_t).
3
4The atomic type provides an interface to the architecture's means of atomic
5RMW operations between CPUs (atomic operations on MMIO are not supported and
6can lead to fatal traps on some platforms).
7
8API
9---
10
11The 'full' API consists of (atomic64_ and atomic_long_ prefixes omitted for
12brevity):
13
14Non-RMW ops:
15
16 atomic_read(), atomic_set()
17 atomic_read_acquire(), atomic_set_release()
18
19
20RMW atomic operations:
21
22Arithmetic:
23
24 atomic_{add,sub,inc,dec}()
25 atomic_{add,sub,inc,dec}_return{,_relaxed,_acquire,_release}()
26 atomic_fetch_{add,sub,inc,dec}{,_relaxed,_acquire,_release}()
27
28
29Bitwise:
30
31 atomic_{and,or,xor,andnot}()
32 atomic_fetch_{and,or,xor,andnot}{,_relaxed,_acquire,_release}()
33
34
35Swap:
36
37 atomic_xchg{,_relaxed,_acquire,_release}()
38 atomic_cmpxchg{,_relaxed,_acquire,_release}()
39 atomic_try_cmpxchg{,_relaxed,_acquire,_release}()
40
41
42Reference count (but please see refcount_t):
43
44 atomic_add_unless(), atomic_inc_not_zero()
45 atomic_sub_and_test(), atomic_dec_and_test()
46
47
48Misc:
49
50 atomic_inc_and_test(), atomic_add_negative()
51 atomic_dec_unless_positive(), atomic_inc_unless_negative()
52
53
54Barriers:
55
56 smp_mb__{before,after}_atomic()
57
58
59
60SEMANTICS
61---------
62
63Non-RMW ops:
64
65The non-RMW ops are (typically) regular LOADs and STOREs and are canonically
66implemented using READ_ONCE(), WRITE_ONCE(), smp_load_acquire() and
67smp_store_release() respectively.
68
69The one detail to this is that atomic_set{}() should be observable to the RMW
70ops. That is:
71
72 C atomic-set
73
74 {
75 atomic_set(v, 1);
76 }
77
78 P1(atomic_t *v)
79 {
80 atomic_add_unless(v, 1, 0);
81 }
82
83 P2(atomic_t *v)
84 {
85 atomic_set(v, 0);
86 }
87
88 exists
89 (v=2)
90
91In this case we would expect the atomic_set() from P2 to either happen
92before the atomic_add_unless(), in which case that latter one would no-op, or
93_after_, in which case we'd overwrite its result. In no case is "2" a valid
94outcome.
95
96This is typically true on 'normal' platforms, where a regular competing STORE
97will invalidate a LL/SC or fail a CMPXCHG.
98
99The obvious case where this is not so is when we need to implement atomic ops
100with a lock:
101
102 CPU0 CPU1
103
104 atomic_add_unless(v, 1, 0);
105 lock();
106 ret = READ_ONCE(v->counter); // == 1
107 atomic_set(v, 0);
108 if (ret != u) WRITE_ONCE(v->counter, 0);
109 WRITE_ONCE(v->counter, ret + 1);
110 unlock();
111
112the typical solution is to then implement atomic_set{}() with atomic_xchg().
113
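A minimal sketch of that solution, assuming a lock-based architecture
(illustrative only, not the code of any particular port): routing
atomic_set() through the same locked RMW path keeps it visible to a
concurrent atomic_add_unless().

  /* Sketch only: atomic_set() implemented on top of atomic_xchg(). */
  static inline void my_atomic_set(atomic_t *v, int i)
  {
          (void)atomic_xchg(v, i);	/* the old value is not needed */
  }
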
114
115RMW ops:
116
117These come in various forms:
118
119 - plain operations without return value: atomic_{}()
120
121 - operations which return the modified value: atomic_{}_return()
122
123 these are limited to the arithmetic operations because those are
124 reversible. Bitops are irreversible and therefore the modified value
125 is of dubious utility.
126
127 - operations which return the original value: atomic_fetch_{}()
128
129 - swap operations: xchg(), cmpxchg() and try_cmpxchg()
130
131 - misc; the special purpose operations that are commonly used and would,
132 given the interface, normally be implemented using (try_)cmpxchg loops but
133 are time critical and can, (typically) on LL/SC architectures, be more
134 efficiently implemented.
135
136All these operations are SMP atomic; that is, the operations (for a single
137atomic variable) can be fully ordered and no intermediate state is lost or
138visible.
139
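The 'misc' operations above would, absent a better architecture-specific
primitive, typically be built from a try_cmpxchg() loop. Below is a sketch
of such a loop for an inc-not-zero style operation; the helper name is
made up and real implementations differ per architecture.

  static inline bool my_inc_not_zero(atomic_t *v)
  {
          int old = atomic_read(v);

          do {
                  if (!old)	/* already zero: fail without storing */
                          return false;
                  /* atomic_try_cmpxchg() reloads 'old' on failure */
          } while (!atomic_try_cmpxchg(v, &old, old + 1));

          return true;
  }
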
140
141ORDERING (go read memory-barriers.txt first)
142--------
143
144The rule of thumb:
145
146 - non-RMW operations are unordered;
147
148 - RMW operations that have no return value are unordered;
149
150 - RMW operations that have a return value are fully ordered;
151
152 - RMW operations that are conditional are unordered on FAILURE,
153 otherwise the above rules apply.
154
155Except of course when an operation has an explicit ordering like:
156
157 {}_relaxed: unordered
158 {}_acquire: the R of the RMW (or atomic_read) is an ACQUIRE
159 {}_release: the W of the RMW (or atomic_set) is a RELEASE
160
161Where 'unordered' is against other memory locations. Address dependencies are
162not defeated.
163
164Fully ordered primitives are ordered against everything prior and everything
165subsequent. Therefore a fully ordered primitive is like having an smp_mb()
166before and an smp_mb() after the primitive.
167
168
169The barriers:
170
171 smp_mb__{before,after}_atomic()
172
173only apply to the RMW ops and can be used to augment/upgrade the ordering
174inherent to the used atomic op. These barriers provide a full smp_mb().
175
176These helper barriers exist because architectures have varying implicit
177ordering on their SMP atomic primitives. For example, our TSO architectures
178provide fully ordered atomics and these barriers are no-ops.
179
180Thus:
181
182 atomic_fetch_add();
183
184is equivalent to:
185
186 smp_mb__before_atomic();
187 atomic_fetch_add_relaxed();
188 smp_mb__after_atomic();
189
190However the atomic_fetch_add() might be implemented more efficiently.
191
192Further, while something like:
193
194 smp_mb__before_atomic();
195 atomic_dec(&X);
196
197is a 'typical' RELEASE pattern, the barrier is strictly stronger than
198a RELEASE. Similarly for something like:
199
200
diff --git a/Documentation/locking/crossrelease.txt b/Documentation/locking/crossrelease.txt
new file mode 100644
index 000000000000..bdf1423d5f99
--- /dev/null
+++ b/Documentation/locking/crossrelease.txt
@@ -0,0 +1,874 @@
1Crossrelease
2============
3
4Started by Byungchul Park <byungchul.park@lge.com>
5
6Contents:
7
8 (*) Background
9
10 - What causes deadlock
11 - How lockdep works
12
13 (*) Limitation
14
15 - Limit lockdep
16 - Pros from the limitation
17 - Cons from the limitation
18 - Relax the limitation
19
20 (*) Crossrelease
21
22 - Introduce crossrelease
23 - Introduce commit
24
25 (*) Implementation
26
27 - Data structures
28 - How crossrelease works
29
30 (*) Optimizations
31
32 - Avoid duplication
33 - Lockless for hot paths
34
35 (*) APPENDIX A: What lockdep does to work aggressively
36
37 (*) APPENDIX B: How to avoid adding false dependencies
38
39
40==========
41Background
42==========
43
44What causes deadlock
45--------------------
46
47A deadlock occurs when a context is waiting for an event to happen,
48which is impossible because another (or the same) context that can trigger the
49event is also waiting for another (or the same) event to happen, which is
50also impossible for the same reason.
51
52For example:
53
54 A context going to trigger event C is waiting for event A to happen.
55 A context going to trigger event A is waiting for event B to happen.
56 A context going to trigger event B is waiting for event C to happen.
57
58A deadlock occurs when these three wait operations run at the same time,
59because event C cannot be triggered if event A does not happen, which in
60turn cannot be triggered if event B does not happen, which in turn
61cannot be triggered if event C does not happen. After all, no event can
62be triggered since none of them ever meets its condition to wake up.
63
64A dependency might exist between two waiters and a deadlock might happen
65due to an incorrect relationship between dependencies. Thus, we must
66define what a dependency is first. A dependency exists between them if:
67
68 1. There are two waiters waiting for each event at a given time.
69 2. The only way to wake up each waiter is to trigger its event.
70 3. Whether one can be woken up depends on whether the other can.
71
72Each wait in the example creates its dependency like:
73
74 Event C depends on event A.
75 Event A depends on event B.
76 Event B depends on event C.
77
78 NOTE: Precisely speaking, a dependency is one between whether a
79 waiter for an event can be woken up and whether another waiter for
80 another event can be woken up. However from now on, we will describe
81 a dependency as if it's one between an event and another event for
82 simplicity.
83
84And they form circular dependencies like:
85
86 -> C -> A -> B -
87 / \
88 \ /
89 ----------------
90
91 where 'A -> B' means that event A depends on event B.
92
93Such circular dependencies lead to a deadlock since no waiter can meet
94its condition to wake up as described.
95
96CONCLUSION
97
98Circular dependencies cause a deadlock.
99
100
101How lockdep works
102-----------------
103
104Lockdep tries to detect a deadlock by checking dependencies created by
105lock operations, acquire and release. Waiting for a lock corresponds to
106waiting for an event, and releasing a lock corresponds to triggering an
107event in the previous section.
108
109In short, lockdep does:
110
111 1. Detect a new dependency.
112 2. Add the dependency into a global graph.
113 3. Check if that makes dependencies circular.
114 4. Report a deadlock or its possibility if so.
115
116For example, consider a graph built by lockdep that looks like:
117
118 A -> B -
119 \
120 -> E
121 /
122 C -> D -
123
124 where A, B,..., E are different lock classes.
125
126Lockdep will add a dependency into the graph on detection of a new
127dependency. For example, it will add a dependency 'E -> C' when a new
128dependency between lock E and lock C is detected. Then the graph will be:
129
130 A -> B -
131 \
132 -> E -
133 / \
134 -> C -> D - \
135 / /
136 \ /
137 ------------------
138
139 where A, B,..., E are different lock classes.
140
141This graph contains a subgraph which demonstrates circular dependencies:
142
143 -> E -
144 / \
145 -> C -> D - \
146 / /
147 \ /
148 ------------------
149
150 where C, D and E are different lock classes.
151
152This is the condition under which a deadlock might occur. Lockdep
153reports it on detection after adding a new dependency. This is how
154lockdep works.
155
156CONCLUSION
157
158Lockdep detects a deadlock or its possibility by checking if circular
159dependencies were created after adding each new dependency.
160
161
162==========
163Limitation
164==========
165
166Limit lockdep
167-------------
168
169By limiting lockdep to work on only typical locks, e.g. spin locks and
170mutexes, which are released within the acquire context, the
171implementation becomes simple but its capacity for detection becomes
172limited. Let's check the pros and cons in the next sections.
173
174
175Pros from the limitation
176------------------------
177
178Given the limitation, when acquiring a lock, the locks in held_locks
179cannot be released while the context cannot acquire the new lock and has
180to wait for it, which means all waiters for the locks in held_locks are
181stuck. This is exactly the case that creates dependencies between each
182lock in held_locks and the lock to acquire.
183
184For example:
185
186 CONTEXT X
187 ---------
188 acquire A
189 acquire B /* Add a dependency 'A -> B' */
190 release B
191 release A
192
193 where A and B are different lock classes.
194
195When acquiring lock A, the held_locks of CONTEXT X is empty thus no
196dependency is added. But when acquiring lock B, lockdep detects and adds
197a new dependency 'A -> B' between lock A in the held_locks and lock B.
198They can be simply added whenever acquiring each lock.
199
200And the data required by lockdep exists in a local structure, held_locks,
201embedded in task_struct. By forcing accesses to the data to happen within
202the owning context, lockdep can avoid races without explicit locks while
203handling the local data.
204
205Lastly, lockdep only needs to keep locks currently being held, to build
206a dependency graph. However, relaxing the limitation, it needs to keep
207even locks already released, because a decision whether they created
208dependencies might be long-deferred.
209
210To sum up, we can expect several advantages from the limitation:
211
212 1. Lockdep can easily identify a dependency when acquiring a lock.
213 2. Races are avoidable while accessing local locks in a held_locks.
214 3. Lockdep only needs to keep locks currently being held.
215
216CONCLUSION
217
218Given the limitation, the implementation becomes simple and efficient.
219
220
221Cons from the limitation
222------------------------
223
224Given the limitation, lockdep is applicable only to typical locks. For
225example, page locks for page access or completions for synchronization
226cannot work with lockdep.
227
228Can we detect deadlocks below, under the limitation?
229
230Example 1:
231
232 CONTEXT X CONTEXT Y CONTEXT Z
233 --------- --------- ----------
234 mutex_lock A
235 lock_page B
236 lock_page B
237 mutex_lock A /* DEADLOCK */
238 unlock_page B held by X
239 unlock_page B
240 mutex_unlock A
241 mutex_unlock A
242
243 where A and B are different lock classes.
244
245No, we cannot.
246
247Example 2:
248
249 CONTEXT X CONTEXT Y
250 --------- ---------
251 mutex_lock A
252 mutex_lock A
253 wait_for_complete B /* DEADLOCK */
254 complete B
255 mutex_unlock A
256 mutex_unlock A
257
258 where A is a lock class and B is a completion variable.
259
260No, we cannot.
261
262CONCLUSION
263
264Given the limitation, lockdep cannot detect a deadlock or its
265possibility caused by page locks or completions.
266
267
268Relax the limitation
269--------------------
270
271Under the limitation, things to create dependencies are limited to
272typical locks. However, synchronization primitives like page locks and
273completions, which are allowed to be released in any context, also
274create dependencies and can cause a deadlock. So lockdep should track
275these locks to do a better job. We have to relax the limitation for
276these locks to work with lockdep.
277
278Detecting dependencies is very important for lockdep to work because
279adding a dependency means adding an opportunity to check whether it
280causes a deadlock. The more dependencies lockdep adds, the more
281thoroughly it works. Thus lockdep has to do its best to detect and add as
282many true dependencies into the graph as possible.
283
284For example, considering only typical locks, lockdep builds a graph like:
285
286 A -> B -
287 \
288 -> E
289 /
290 C -> D -
291
292 where A, B,..., E are different lock classes.
293
294On the other hand, under the relaxation, additional dependencies might
295be created and added. Assuming additional 'FX -> C' and 'E -> GX' are
296added thanks to the relaxation, the graph will be:
297
298 A -> B -
299 \
300 -> E -> GX
301 /
302 FX -> C -> D -
303
304 where A, B,..., E, FX and GX are different lock classes, and a suffix
305 'X' is added on non-typical locks.
306
307The latter graph gives us more chances to check circular dependencies
308than the former. However, it might suffer performance degradation, since
309relaxing the limitation, which is what allows an efficient design and
310implementation of lockdep, inevitably introduces some inefficiency. So
311lockdep should provide two options: strong detection and efficient detection.
312
313Choosing efficient detection:
314
315 Lockdep works with only locks restricted to be released within the
316 acquire context. However, lockdep works efficiently.
317
318Choosing strong detection:
319
320 Lockdep works with all synchronization primitives. However, lockdep
321 suffers performance degradation.
322
323CONCLUSION
324
325Relaxing the limitation, lockdep can add additional dependencies giving
326additional opportunities to check circular dependencies.
327
328
329============
330Crossrelease
331============
332
333Introduce crossrelease
334----------------------
335
336In order to allow lockdep to handle additional dependencies created by
337locks that might be released in any context, namely 'crosslocks', we have
338to be able to identify the dependencies created by crosslocks. The
339proposed 'crossrelease' feature provides a way to do that.
340
341Crossrelease feature has to do:
342
343 1. Identify dependencies created by crosslocks.
344 2. Add the dependencies into a dependency graph.
345
346That's all. Once a meaningful dependency is added into the graph, lockdep
347works with the graph as it always did. The most important thing the
348crossrelease feature has to do is to correctly identify and add true
349dependencies into the global graph.
350
351A dependency, e.g. 'A -> B', can be identified only in A's release
352context because the decision required to identify the dependency, namely
353whether A can be released so that a waiter for A can be woken up, can be
354made only in the release context. It cannot be made anywhere other than
355A's release context.
356
357This is not a problem for typical locks because each acquire context is
358the same as its release context, thus lockdep can decide in the acquire
359context whether a lock can be released. However, for crosslocks, lockdep
360cannot make the decision in the acquire context but has to wait until the
361release context is identified.
362
363Therefore, a deadlock caused by crosslocks cannot be detected exactly
364when it happens, because its dependencies cannot be identified until the
365crosslocks are released. However, deadlock possibilities can still be
366detected, and that is very worthwhile. See the 'APPENDIX A' section to check why.
367
368CONCLUSION
369
370Using the crossrelease feature, lockdep can work with locks that might be
371released in any context, namely crosslocks.
372
373
374Introduce commit
375----------------
376
377Since crossrelease defers the work of adding the true dependencies of
378crosslocks until they are actually released, crossrelease has to queue
379all acquisitions which might create dependencies with the crosslocks.
380Then it identifies dependencies using the queued data in batches at a
381proper time. We call this 'commit'.
382
383There are four types of dependencies:
384
3851. TT type: 'typical lock A -> typical lock B'
386
387 Just when acquiring B, lockdep can see it's in the A's release
388 context. So the dependency between A and B can be identified
389 immediately. Commit is unnecessary.
390
3912. TC type: 'typical lock A -> crosslock BX'
392
393 Just when acquiring BX, lockdep can see it's in the A's release
394 context. So the dependency between A and BX can be identified
395 immediately. Commit is unnecessary, too.
396
3973. CT type: 'crosslock AX -> typical lock B'
398
399 When acquiring B, lockdep cannot identify the dependency because
400 there's no way to know if it's in the AX's release context. It has
401 to wait until the decision can be made. Commit is necessary.
402
4034. CC type: 'crosslock AX -> crosslock BX'
404
405 When acquiring BX, lockdep cannot identify the dependency because
406 there's no way to know if it's in the AX's release context. It has
407 to wait until the decision can be made. Commit is necessary.
408 But, handling CC type is not implemented yet. It's a future work.
409
410Lockdep can work without commit for typical locks, but the commit step is
411necessary once crosslocks are involved. With commit introduced, lockdep
412performs three steps. What lockdep does in each step is:
413
4141. Acquisition: For typical locks, lockdep does what it originally did
415 and queues the lock so that CT type dependencies can be checked using
416 it at the commit step. For crosslocks, it saves data which will be
417 used at the commit step and increases a reference count for it.
418
4192. Commit: No action is required for typical locks. For crosslocks,
420 lockdep adds CT type dependencies using the data saved at the
421 acquisition step.
422
4233. Release: No changes are required for typical locks. When a crosslock
424 is released, it decreases a reference count for it.
425
426CONCLUSION
427
428Crossrelease introduces commit step to handle dependencies of crosslocks
429in batches at a proper time.
430
431
432==============
433Implementation
434==============
435
436Data structures
437---------------
438
439Crossrelease introduces two main data structures.
440
4411. hist_lock
442
443 This is an array embedded in task_struct, keeping the lock history so
444 that dependencies can be added using it at the commit step. Since
445 it's local data, it can be accessed locklessly in the owner context.
446 The array is filled at the acquisition step and consumed at the
447 commit step. And it's managed in a circular manner.
448
4492. cross_lock
450
451 One exists per lockdep_map. This keeps the data of a crosslock and is
452 used at the commit step.
453
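Conceptually, the two structures carry roughly the following information.
The sketch below is illustrative only: the field names are invented for
this description and do not claim to match the actual definitions added
by the patch.

  /* Illustrative sketch, not the real definitions. */
  struct hist_lock {			/* one slot of the per-task ring */
          unsigned int	hist_id;	/* generation number of this entry */
          unsigned long	acquire_ip;	/* where the lock was acquired */
          struct lockdep_map *instance;	/* which lock it was */
  };

  struct cross_lock {			/* one per lockdep_map */
          int		nr_acquire;	/* reference count of acquisitions */
          unsigned int	hist_id_begin;	/* history window to commit against */
  };
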
454
455How crossrelease works
456----------------------
457
458The key to how crossrelease works is to defer the necessary work to an
459appropriate point in time and perform it all at once at the commit step.
460Let's take a look with examples step by step, starting from how lockdep
461works without crossrelease for typical locks.
462
463 acquire A /* Push A onto held_locks */
464 acquire B /* Push B onto held_locks and add 'A -> B' */
465 acquire C /* Push C onto held_locks and add 'B -> C' */
466 release C /* Pop C from held_locks */
467 release B /* Pop B from held_locks */
468 release A /* Pop A from held_locks */
469
470 where A, B and C are different lock classes.
471
472 NOTE: This document assumes that readers already understand how
473 lockdep works without crossrelease and thus omits details. But there's
474 one thing to note. Lockdep pretends to pop a lock from held_locks
475 when releasing it. But it's subtly different from the original pop
476 operation because lockdep allows entries other than the top to be popped.
477
478In this case, lockdep adds a 'top of held_locks -> lock to acquire'
479dependency every time it acquires a lock.
480
481After adding 'A -> B', a dependency graph will be:
482
483 A -> B
484
485 where A and B are different lock classes.
486
487And after adding 'B -> C', the graph will be:
488
489 A -> B -> C
490
491 where A, B and C are different lock classes.
492
493Let's perform the commit step even for typical locks to add dependencies.
494Of course, the commit step is not necessary for them; however, it works
495just as well because it is a more general way.
496
497 acquire A
498 /*
499 * Queue A into hist_locks
500 *
501 * In hist_locks: A
502 * In graph: Empty
503 */
504
505 acquire B
506 /*
507 * Queue B into hist_locks
508 *
509 * In hist_locks: A, B
510 * In graph: Empty
511 */
512
513 acquire C
514 /*
515 * Queue C into hist_locks
516 *
517 * In hist_locks: A, B, C
518 * In graph: Empty
519 */
520
521 commit C
522 /*
523 * Add 'C -> ?'
524 * Answer the following to decide '?'
525 * What has been queued since acquire C: Nothing
526 *
527 * In hist_locks: A, B, C
528 * In graph: Empty
529 */
530
531 release C
532
533 commit B
534 /*
535 * Add 'B -> ?'
536 * Answer the following to decide '?'
537 * What has been queued since acquire B: C
538 *
539 * In hist_locks: A, B, C
540 * In graph: 'B -> C'
541 */
542
543 release B
544
545 commit A
546 /*
547 * Add 'A -> ?'
548 * Answer the following to decide '?'
549 * What has been queued since acquire A: B, C
550 *
551 * In hist_locks: A, B, C
552 * In graph: 'B -> C', 'A -> B', 'A -> C'
553 */
554
555 release A
556
557 where A, B and C are different lock classes.
558
559In this case, dependencies are added at the commit step as described.
560
561After commits for A, B and C, the graph will be:
562
563 A -> B -> C
564
565 where A, B and C are different lock classes.
566
567 NOTE: A dependency 'A -> C' is optimized out.
568
569We can see that the former graph, built without the commit step, is the
570same as the latter graph, built using commit steps. Of course the former
571way finishes building the graph earlier, which means we can detect a
572deadlock or its possibility sooner. So the former way is preferred
573when possible. But we cannot avoid using the latter way for crosslocks.
574
575Let's look at how commit steps work for crosslocks. In this case, the
576commit step is actually performed only on the crosslock BX. And it assumes
577that the BX release context is different from the BX acquire context.
578
579 BX RELEASE CONTEXT BX ACQUIRE CONTEXT
580 ------------------ ------------------
581 acquire A
582 /*
583 * Push A onto held_locks
584 * Queue A into hist_locks
585 *
586 * In held_locks: A
587 * In hist_locks: A
588 * In graph: Empty
589 */
590
591 acquire BX
592 /*
593 * Add 'the top of held_locks -> BX'
594 *
595 * In held_locks: A
596 * In hist_locks: A
597 * In graph: 'A -> BX'
598 */
599
600 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
601 It must be guaranteed that the following operations are seen after
602 acquiring BX globally. It can be done by things like barrier.
603 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
604
605 acquire C
606 /*
607 * Push C onto held_locks
608 * Queue C into hist_locks
609 *
610 * In held_locks: C
611 * In hist_locks: C
612 * In graph: 'A -> BX'
613 */
614
615 release C
616 /*
617 * Pop C from held_locks
618 *
619 * In held_locks: Empty
620 * In hist_locks: C
621 * In graph: 'A -> BX'
622 */
623 acquire D
624 /*
625 * Push D onto held_locks
626 * Queue D into hist_locks
627 * Add 'the top of held_locks -> D'
628 *
629 * In held_locks: A, D
630 * In hist_locks: A, D
631 * In graph: 'A -> BX', 'A -> D'
632 */
633 acquire E
634 /*
635 * Push E onto held_locks
636 * Queue E into hist_locks
637 *
638 * In held_locks: E
639 * In hist_locks: C, E
640 * In graph: 'A -> BX', 'A -> D'
641 */
642
643 release E
644 /*
645 * Pop E from held_locks
646 *
647 * In held_locks: Empty
648 * In hist_locks: D, E
649 * In graph: 'A -> BX', 'A -> D'
650 */
651 release D
652 /*
653 * Pop D from held_locks
654 *
655 * In held_locks: A
656 * In hist_locks: A, D
657 * In graph: 'A -> BX', 'A -> D'
658 */
659 commit BX
660 /*
661 * Add 'BX -> ?'
662 * What has been queued since acquire BX: C, E
663 *
664 * In held_locks: Empty
665 * In hist_locks: D, E
666 * In graph: 'A -> BX', 'A -> D',
667 * 'BX -> C', 'BX -> E'
668 */
669
670 release BX
671 /*
672 * In held_locks: Empty
673 * In hist_locks: D, E
674 * In graph: 'A -> BX', 'A -> D',
675 * 'BX -> C', 'BX -> E'
676 */
677 release A
678 /*
679 * Pop A from held_locks
680 *
681 * In held_locks: Empty
682 * In hist_locks: A, D
683 * In graph: 'A -> BX', 'A -> D',
684 * 'BX -> C', 'BX -> E'
685 */
686
687 where A, BX, C,..., E are different lock classes, and a suffix 'X' is
688 added on crosslocks.
689
690Crossrelease considers all acquisitions after acquiring BX to be
691candidates which might create dependencies with BX. True dependencies
692will be determined when identifying the release context of BX. Meanwhile,
693all typical locks are queued so that they can be used at the commit step.
694And then two dependencies 'BX -> C' and 'BX -> E' are added at the
695commit step when identifying the release context.
696
697The final graph will be, with crossrelease:
698
699 -> C
700 /
701 -> BX -
702 / \
703 A - -> E
704 \
705 -> D
706
707 where A, BX, C,..., E are different lock classes, and a suffix 'X' is
708 added on crosslocks.
709
710However, the final graph will be, without crossrelease:
711
712 A -> D
713
714 where A and D are different lock classes.
715
716The former graph has three more dependencies, 'A -> BX', 'BX -> C' and
717'BX -> E' giving additional opportunities to check if they cause
718deadlocks. This way lockdep can detect a deadlock or its possibility
719caused by crosslocks.
720
721CONCLUSION
722
723We checked how crossrelease works with several examples.
724
725
726=============
727Optimizations
728=============
729
730Avoid duplication
731-----------------
732
733The crossrelease feature uses a cache like the one lockdep already uses
734for dependency chains, but this time for caching CT type dependencies.
735Once a dependency is cached, the same one will never be added again.
736
737
738Lockless for hot paths
739----------------------
740
741To keep all locks for later use at the commit step, crossrelease adopts
742a local array embedded in task_struct, which makes access to the data
743lockless by forcing it to happen only within the owner context. It's
744like how lockdep handles held_locks. A lockless implementation is important
745since typical locks are very frequently acquired and released.
746
747
748==================================================
749APPENDIX A: What lockdep does to work aggressively
750==================================================
751
752A deadlock actually occurs when all wait operations creating circular
753dependencies run at the same time. Even when they don't, a potential
754deadlock exists if the problematic dependencies exist. Thus it's
755meaningful to detect not only an actual deadlock but also its potential
756possibility. The latter is rather more valuable. When a deadlock actually
757occurs, we can identify what happens in the system by some means or
758other even without lockdep. However, there's no way to detect a possibility
759without lockdep unless the whole code is parsed by hand, which is terrible.
760Lockdep does both, and crossrelease focuses only on the latter.
761
762Whether or not a deadlock actually occurs depends on several factors.
763For example, what order contexts are switched in is a factor. Assuming
764circular dependencies exist, a deadlock would occur when contexts are
765switched so that all wait operations creating the dependencies run
766simultaneously. Thus to detect a deadlock possibility even in the case
767that it has not occurred yet, lockdep should consider all possible
768combinations of dependencies, trying to:
769
7701. Use a global dependency graph.
771
772 Lockdep combines all dependencies into one global graph and uses them,
773 regardless of which context generates them or what order contexts are
774 switched in. Only the aggregated dependencies are considered, so they
775 are prone to become circular if a problem exists.
776
7772. Check dependencies between classes instead of instances.
778
779 What actually causes a deadlock are instances of lock. However,
780 lockdep checks dependencies between classes instead of instances.
781 This way lockdep can detect a deadlock which has not happened yet but
782 might happen in the future with other instances of the same class.
783
7843. Assume all acquisitions lead to waiting.
785
786 Although locks might be acquired without waiting, and waiting is what
787 actually creates dependencies, lockdep assumes all acquisitions lead to
788 waiting since that might become true at some time or another.
789
790CONCLUSION
791
792Lockdep detects not only an actual deadlock but also its possibility,
793and the latter is more valuable.
794
795
796==================================================
797APPENDIX B: How to avoid adding false dependencies
798==================================================
799
800Remind what a dependency is. A dependency exists if:
801
802 1. There are two waiters waiting for each event at a given time.
803 2. The only way to wake up each waiter is to trigger its event.
804 3. Whether one can be woken up depends on whether the other can.
805
806For example:
807
808 acquire A
809 acquire B /* A dependency 'A -> B' exists */
810 release B
811 release A
812
813 where A and B are different lock classes.
814
815A dependency 'A -> B' exists since:
816
817 1. A waiter for A and a waiter for B might exist when acquiring B.
818 2. The only way to wake up each waiter is to release what it waits for.
819 3. Whether the waiter for A can be woken up depends on whether the
820 other can. IOW, TASK X cannot release A if it fails to acquire B.
821
822For another example:
823
824 TASK X TASK Y
825 ------ ------
826 acquire AX
827 acquire B /* A dependency 'AX -> B' exists */
828 release B
829 release AX held by Y
830
831 where AX and B are different lock classes, and a suffix 'X' is added
832 on crosslocks.
833
834Even in this case involving crosslocks, the same rule can be applied. A
835dependency 'AX -> B' exists since:
836
837 1. A waiter for AX and a waiter for B might exist when acquiring B.
838 2. The only way to wake up each waiter is to release what it waits for.
839 3. Whether the waiter for AX can be woken up depends on whether the
840 other can. IOW, TASK X cannot release AX if it fails to acquire B.
841
842Let's take a look at a more complicated example:
843
844 TASK X TASK Y
845 ------ ------
846 acquire B
847 release B
848 fork Y
849 acquire AX
850 acquire C /* A dependency 'AX -> C' exists */
851 release C
852 release AX held by Y
853
854 where AX, B and C are different lock classes, and a suffix 'X' is
855 added on crosslocks.
856
857Does a dependency 'AX -> B' exist? Nope.
858
859Two waiters are essential to create a dependency. However, waiters for
860AX and B to create 'AX -> B' cannot exist at the same time in this
861example. Thus the dependency 'AX -> B' cannot be created.
862
863It would be ideal if the full set of true dependencies could be
864considered. But we can be sure of nothing but what actually happened.
865Relying on what actually happens at runtime, we at least add only true
866dependencies, though they might be a subset of all the true ones. It's
867similar to how lockdep works for typical locks. There might be more true
868dependencies than what lockdep has detected at runtime. Lockdep has no
869choice but to rely on what actually happens. Crossrelease also relies on it.
870
871CONCLUSION
872
873Relying on what actually happens, lockdep can avoid adding false
874dependencies.
diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index c4ddfcd5ee32..d1d1716f904b 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -498,11 +498,11 @@ And a couple of implicit varieties:
498 This means that ACQUIRE acts as a minimal "acquire" operation and 498 This means that ACQUIRE acts as a minimal "acquire" operation and
499 RELEASE acts as a minimal "release" operation. 499 RELEASE acts as a minimal "release" operation.
500 500
501A subset of the atomic operations described in core-api/atomic_ops.rst have 501A subset of the atomic operations described in atomic_t.txt have ACQUIRE and
502ACQUIRE and RELEASE variants in addition to fully-ordered and relaxed (no 502RELEASE variants in addition to fully-ordered and relaxed (no barrier
503barrier semantics) definitions. For compound atomics performing both a load 503semantics) definitions. For compound atomics performing both a load and a
504and a store, ACQUIRE semantics apply only to the load and RELEASE semantics 504store, ACQUIRE semantics apply only to the load and RELEASE semantics apply
505apply only to the store portion of the operation. 505only to the store portion of the operation.
506 506
507Memory barriers are only required where there's a possibility of interaction 507Memory barriers are only required where there's a possibility of interaction
508between two CPUs or between a CPU and a device. If it can be guaranteed that 508between two CPUs or between a CPU and a device. If it can be guaranteed that
@@ -1876,8 +1876,7 @@ There are some more advanced barrier functions:
1876 This makes sure that the death mark on the object is perceived to be set 1876 This makes sure that the death mark on the object is perceived to be set
1877 *before* the reference counter is decremented. 1877 *before* the reference counter is decremented.
1878 1878
1879 See Documentation/core-api/atomic_ops.rst for more information. See the 1879 See Documentation/atomic_{t,bitops}.txt for more information.
1880 "Atomic operations" subsection for information on where to use these.
1881 1880
1882 1881
1883 (*) lockless_dereference(); 1882 (*) lockless_dereference();
@@ -1982,10 +1981,7 @@ for each construct. These operations all imply certain barriers:
1982 ACQUIRE operation has completed. 1981 ACQUIRE operation has completed.
1983 1982
1984 Memory operations issued before the ACQUIRE may be completed after 1983 Memory operations issued before the ACQUIRE may be completed after
1985 the ACQUIRE operation has completed. An smp_mb__before_spinlock(), 1984 the ACQUIRE operation has completed.
1986 combined with a following ACQUIRE, orders prior stores against
1987 subsequent loads and stores. Note that this is weaker than smp_mb()!
1988 The smp_mb__before_spinlock() primitive is free on many architectures.
1989 1985
1990 (2) RELEASE operation implication: 1986 (2) RELEASE operation implication:
1991 1987
@@ -2503,88 +2499,7 @@ operations are noted specially as some of them imply full memory barriers and
2503some don't, but they're very heavily relied on as a group throughout the 2499some don't, but they're very heavily relied on as a group throughout the
2504kernel. 2500kernel.
2505 2501
2506Any atomic operation that modifies some state in memory and returns information 2502See Documentation/atomic_t.txt for more information.
2507about the state (old or new) implies an SMP-conditional general memory barrier
2508(smp_mb()) on each side of the actual operation (with the exception of
2509explicit lock operations, described later). These include:
2510
2511 xchg();
2512 atomic_xchg(); atomic_long_xchg();
2513 atomic_inc_return(); atomic_long_inc_return();
2514 atomic_dec_return(); atomic_long_dec_return();
2515 atomic_add_return(); atomic_long_add_return();
2516 atomic_sub_return(); atomic_long_sub_return();
2517 atomic_inc_and_test(); atomic_long_inc_and_test();
2518 atomic_dec_and_test(); atomic_long_dec_and_test();
2519 atomic_sub_and_test(); atomic_long_sub_and_test();
2520 atomic_add_negative(); atomic_long_add_negative();
2521 test_and_set_bit();
2522 test_and_clear_bit();
2523 test_and_change_bit();
2524
2525 /* when succeeds */
2526 cmpxchg();
2527 atomic_cmpxchg(); atomic_long_cmpxchg();
2528 atomic_add_unless(); atomic_long_add_unless();
2529
2530These are used for such things as implementing ACQUIRE-class and RELEASE-class
2531operations and adjusting reference counters towards object destruction, and as
2532such the implicit memory barrier effects are necessary.
2533
2534
2535The following operations are potential problems as they do _not_ imply memory
2536barriers, but might be used for implementing such things as RELEASE-class
2537operations:
2538
2539 atomic_set();
2540 set_bit();
2541 clear_bit();
2542 change_bit();
2543
2544With these the appropriate explicit memory barrier should be used if necessary
2545(smp_mb__before_atomic() for instance).
2546
2547
2548The following also do _not_ imply memory barriers, and so may require explicit
2549memory barriers under some circumstances (smp_mb__before_atomic() for
2550instance):
2551
2552 atomic_add();
2553 atomic_sub();
2554 atomic_inc();
2555 atomic_dec();
2556
2557If they're used for statistics generation, then they probably don't need memory
2558barriers, unless there's a coupling between statistical data.
2559
2560If they're used for reference counting on an object to control its lifetime,
2561they probably don't need memory barriers because either the reference count
2562will be adjusted inside a locked section, or the caller will already hold
2563sufficient references to make the lock, and thus a memory barrier unnecessary.
2564
2565If they're used for constructing a lock of some description, then they probably
2566do need memory barriers as a lock primitive generally has to do things in a
2567specific order.
2568
2569Basically, each usage case has to be carefully considered as to whether memory
2570barriers are needed or not.
2571
2572The following operations are special locking primitives:
2573
2574 test_and_set_bit_lock();
2575 clear_bit_unlock();
2576 __clear_bit_unlock();
2577
2578These implement ACQUIRE-class and RELEASE-class operations. These should be
2579used in preference to other operations when implementing locking primitives,
2580because their implementations can be optimised on many architectures.
2581
2582[!] Note that special memory barrier primitives are available for these
2583situations because on some CPUs the atomic instructions used imply full memory
2584barriers, and so barrier instructions are superfluous in conjunction with them,
2585and in such cases the special barrier primitives will be no-ops.
2586
2587See Documentation/core-api/atomic_ops.rst for more information.
2588 2503
2589 2504
2590ACCESSING DEVICES 2505ACCESSING DEVICES
diff --git a/Documentation/static-keys.txt b/Documentation/static-keys.txt
index b83dfa1c0602..ab16efe0c79d 100644
--- a/Documentation/static-keys.txt
+++ b/Documentation/static-keys.txt
@@ -149,6 +149,26 @@ static_branch_inc(), will change the branch back to true. Likewise, if the
149key is initialized false, a 'static_branch_inc()', will change the branch to 149key is initialized false, a 'static_branch_inc()', will change the branch to
150true. And then a 'static_branch_dec()', will again make the branch false. 150true. And then a 'static_branch_dec()', will again make the branch false.
151 151
152The state and the reference count can be retrieved with 'static_key_enabled()'
153and 'static_key_count()'. In general, if you use these functions, they
154should be protected with the same mutex used around the enable/disable
155or increment/decrement function.
156
157Note that switching branches results in some locks being taken,
158particularly the CPU hotplug lock (in order to avoid races against
159CPUs being brought up while the kernel is getting
160patched). Calling the static key API from within a hotplug notifier is
161thus a sure deadlock recipe. In order to still allow use of the
162functionality, the following functions are provided:
163
164 static_key_enable_cpuslocked()
165 static_key_disable_cpuslocked()
166 static_branch_enable_cpuslocked()
167 static_branch_disable_cpuslocked()
168
169These functions are *not* general purpose, and must only be used when
170you really know that you're in the above context, and no other.
171
152Where an array of keys is required, it can be defined as:: 172Where an array of keys is required, it can be defined as::
153 173
154 DEFINE_STATIC_KEY_ARRAY_TRUE(keys, count); 174 DEFINE_STATIC_KEY_ARRAY_TRUE(keys, count);
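As a sketch of where the *_cpuslocked() variants described above fit (the
key and function names below are hypothetical), they are intended for code
that already holds the CPU hotplug lock, for example via cpus_read_lock():

  #include <linux/cpu.h>
  #include <linux/jump_label.h>

  DEFINE_STATIC_KEY_FALSE(my_feature_key);	/* hypothetical key */

  static void my_enable_feature(void)
  {
          cpus_read_lock();
          /* ... setup that must exclude CPU hotplug ... */
          static_branch_enable_cpuslocked(&my_feature_key);
          cpus_read_unlock();
  }
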
diff --git a/Documentation/translations/ko_KR/memory-barriers.txt b/Documentation/translations/ko_KR/memory-barriers.txt
index 38310dcd6620..bc80fc0e210f 100644
--- a/Documentation/translations/ko_KR/memory-barriers.txt
+++ b/Documentation/translations/ko_KR/memory-barriers.txt
@@ -1956,10 +1956,7 @@ MMIO 쓰기 배리어
1956 뒤에 완료됩니다. 1956 뒤에 완료됩니다.
1957 1957
1958 ACQUIRE 앞에서 요청된 메모리 오퍼레이션은 ACQUIRE 오퍼레이션이 완료된 후에 1958 ACQUIRE 앞에서 요청된 메모리 오퍼레이션은 ACQUIRE 오퍼레이션이 완료된 후에
1959 완료될 수 있습니다. smp_mb__before_spinlock() 뒤에 ACQUIRE 가 실행되는 1959 완료될 수 있습니다.
1960 코드 블록은 블록 앞의 스토어를 블록 뒤의 로드와 스토어에 대해 순서
1961 맞춥니다. 이건 smp_mb() 보다 완화된 것임을 기억하세요! 많은 아키텍쳐에서
1962 smp_mb__before_spinlock() 은 사실 아무일도 하지 않습니다.
1963 1960
1964 (2) RELEASE 오퍼레이션의 영향: 1961 (2) RELEASE 오퍼레이션의 영향:
1965 1962
diff --git a/arch/Kconfig b/arch/Kconfig
index 21d0089117fe..2520ca5b42eb 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -931,6 +931,18 @@ config STRICT_MODULE_RWX
931config ARCH_WANT_RELAX_ORDER 931config ARCH_WANT_RELAX_ORDER
932 bool 932 bool
933 933
934config ARCH_HAS_REFCOUNT
935 bool
936 help
937 An architecture selects this when it has implemented refcount_t
938 using open coded assembly primitives that provide an optimized
939 refcount_t implementation, possibly at the expense of some full
940 refcount state checks of CONFIG_REFCOUNT_FULL=y.
941
942 The refcount overflow check behavior, however, must be retained.
943 Catching overflows is the primary security concern for protecting
944 against bugs in reference counts.
945
934config REFCOUNT_FULL 946config REFCOUNT_FULL
935 bool "Perform full reference count validation at the expense of speed" 947 bool "Perform full reference count validation at the expense of speed"
936 help 948 help
diff --git a/arch/arc/include/asm/atomic.h b/arch/arc/include/asm/atomic.h
index 54b54da6384c..11859287c52a 100644
--- a/arch/arc/include/asm/atomic.h
+++ b/arch/arc/include/asm/atomic.h
@@ -123,6 +123,8 @@ static inline void atomic_set(atomic_t *v, int i)
123 atomic_ops_unlock(flags); 123 atomic_ops_unlock(flags);
124} 124}
125 125
126#define atomic_set_release(v, i) atomic_set((v), (i))
127
126#endif 128#endif
127 129
128/* 130/*
diff --git a/arch/arm64/include/asm/spinlock.h b/arch/arm64/include/asm/spinlock.h
index cae331d553f8..ae4241ab19a8 100644
--- a/arch/arm64/include/asm/spinlock.h
+++ b/arch/arm64/include/asm/spinlock.h
@@ -358,14 +358,7 @@ static inline int arch_read_trylock(arch_rwlock_t *rw)
358#define arch_read_relax(lock) cpu_relax() 358#define arch_read_relax(lock) cpu_relax()
359#define arch_write_relax(lock) cpu_relax() 359#define arch_write_relax(lock) cpu_relax()
360 360
361/* 361/* See include/linux/spinlock.h */
362 * Accesses appearing in program order before a spin_lock() operation 362#define smp_mb__after_spinlock() smp_mb()
363 * can be reordered with accesses inside the critical section, by virtue
364 * of arch_spin_lock being constructed using acquire semantics.
365 *
366 * In cases where this is problematic (e.g. try_to_wake_up), an
367 * smp_mb__before_spinlock() can restore the required ordering.
368 */
369#define smp_mb__before_spinlock() smp_mb()
370 363
371#endif /* __ASM_SPINLOCK_H */ 364#endif /* __ASM_SPINLOCK_H */
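For context, smp_mb__after_spinlock() is used right after taking a spinlock
when the lock's ACQUIRE ordering alone is not enough and stores issued
before the lock must also be ordered against accesses inside the critical
section (the scheduler wakeup path is the canonical user). A sketch with
made-up names:

  #include <linux/spinlock.h>

  static DEFINE_RAW_SPINLOCK(my_lock);	/* hypothetical lock */
  static int my_x, my_y;

  static int my_func(void)
  {
          int r;

          WRITE_ONCE(my_x, 1);		/* store issued before the lock */
          raw_spin_lock(&my_lock);
          /* Full barrier: orders the store above against the load below. */
          smp_mb__after_spinlock();
          r = READ_ONCE(my_y);
          raw_spin_unlock(&my_lock);

          return r;
  }
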
diff --git a/arch/hexagon/include/asm/atomic.h b/arch/hexagon/include/asm/atomic.h
index a62ba368b27d..fb3dfb2a667e 100644
--- a/arch/hexagon/include/asm/atomic.h
+++ b/arch/hexagon/include/asm/atomic.h
@@ -42,6 +42,8 @@ static inline void atomic_set(atomic_t *v, int new)
42 ); 42 );
43} 43}
44 44
45#define atomic_set_release(v, i) atomic_set((v), (i))
46
45/** 47/**
46 * atomic_read - reads a word, atomically 48 * atomic_read - reads a word, atomically
47 * @v: pointer to atomic value 49 * @v: pointer to atomic value
diff --git a/arch/metag/include/asm/atomic_lock1.h b/arch/metag/include/asm/atomic_lock1.h
index 6c1380a8a0d4..eee779f26cc4 100644
--- a/arch/metag/include/asm/atomic_lock1.h
+++ b/arch/metag/include/asm/atomic_lock1.h
@@ -37,6 +37,8 @@ static inline int atomic_set(atomic_t *v, int i)
37 return i; 37 return i;
38} 38}
39 39
40#define atomic_set_release(v, i) atomic_set((v), (i))
41
40#define ATOMIC_OP(op, c_op) \ 42#define ATOMIC_OP(op, c_op) \
41static inline void atomic_##op(int i, atomic_t *v) \ 43static inline void atomic_##op(int i, atomic_t *v) \
42{ \ 44{ \
diff --git a/arch/parisc/include/asm/atomic.h b/arch/parisc/include/asm/atomic.h
index 5394b9c5f914..17b98a87e5e2 100644
--- a/arch/parisc/include/asm/atomic.h
+++ b/arch/parisc/include/asm/atomic.h
@@ -65,6 +65,8 @@ static __inline__ void atomic_set(atomic_t *v, int i)
65 _atomic_spin_unlock_irqrestore(v, flags); 65 _atomic_spin_unlock_irqrestore(v, flags);
66} 66}
67 67
68#define atomic_set_release(v, i) atomic_set((v), (i))
69
68static __inline__ int atomic_read(const atomic_t *v) 70static __inline__ int atomic_read(const atomic_t *v)
69{ 71{
70 return READ_ONCE((v)->counter); 72 return READ_ONCE((v)->counter);
diff --git a/arch/powerpc/include/asm/barrier.h b/arch/powerpc/include/asm/barrier.h
index 25d42bd3f114..9c601adfc500 100644
--- a/arch/powerpc/include/asm/barrier.h
+++ b/arch/powerpc/include/asm/barrier.h
@@ -74,13 +74,6 @@ do { \
74 ___p1; \ 74 ___p1; \
75}) 75})
76 76
77/*
78 * This must resolve to hwsync on SMP for the context switch path.
79 * See _switch, and core scheduler context switch memory ordering
80 * comments.
81 */
82#define smp_mb__before_spinlock() smp_mb()
83
84#include <asm-generic/barrier.h> 77#include <asm-generic/barrier.h>
85 78
86#endif /* _ASM_POWERPC_BARRIER_H */ 79#endif /* _ASM_POWERPC_BARRIER_H */
diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h
index 8c1b913de6d7..c1b1ec94b06c 100644
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@@ -342,5 +342,8 @@ static inline void arch_write_unlock(arch_rwlock_t *rw)
342#define arch_read_relax(lock) __rw_yield(lock) 342#define arch_read_relax(lock) __rw_yield(lock)
343#define arch_write_relax(lock) __rw_yield(lock) 343#define arch_write_relax(lock) __rw_yield(lock)
344 344
345/* See include/linux/spinlock.h */
346#define smp_mb__after_spinlock() smp_mb()
347
345#endif /* __KERNEL__ */ 348#endif /* __KERNEL__ */
346#endif /* __ASM_SPINLOCK_H */ 349#endif /* __ASM_SPINLOCK_H */
diff --git a/arch/sparc/include/asm/atomic_32.h b/arch/sparc/include/asm/atomic_32.h
index ee3f11c43cda..7643e979e333 100644
--- a/arch/sparc/include/asm/atomic_32.h
+++ b/arch/sparc/include/asm/atomic_32.h
@@ -29,6 +29,8 @@ int atomic_xchg(atomic_t *, int);
29int __atomic_add_unless(atomic_t *, int, int); 29int __atomic_add_unless(atomic_t *, int, int);
30void atomic_set(atomic_t *, int); 30void atomic_set(atomic_t *, int);
31 31
32#define atomic_set_release(v, i) atomic_set((v), (i))
33
32#define atomic_read(v) ACCESS_ONCE((v)->counter) 34#define atomic_read(v) ACCESS_ONCE((v)->counter)
33 35
34#define atomic_add(i, v) ((void)atomic_add_return( (int)(i), (v))) 36#define atomic_add(i, v) ((void)atomic_add_return( (int)(i), (v)))
diff --git a/arch/tile/include/asm/atomic_32.h b/arch/tile/include/asm/atomic_32.h
index a93774255136..53a423e7cb92 100644
--- a/arch/tile/include/asm/atomic_32.h
+++ b/arch/tile/include/asm/atomic_32.h
@@ -101,6 +101,8 @@ static inline void atomic_set(atomic_t *v, int n)
101 _atomic_xchg(&v->counter, n); 101 _atomic_xchg(&v->counter, n);
102} 102}
103 103
104#define atomic_set_release(v, i) atomic_set((v), (i))
105
104/* A 64bit atomic type */ 106/* A 64bit atomic type */
105 107
106typedef struct { 108typedef struct {
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 323cb065be5e..6e01f585d57c 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -55,6 +55,7 @@ config X86
55 select ARCH_HAS_KCOV if X86_64 55 select ARCH_HAS_KCOV if X86_64
56 select ARCH_HAS_MMIO_FLUSH 56 select ARCH_HAS_MMIO_FLUSH
57 select ARCH_HAS_PMEM_API if X86_64 57 select ARCH_HAS_PMEM_API if X86_64
58 select ARCH_HAS_REFCOUNT
58 select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64 59 select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64
59 select ARCH_HAS_SET_MEMORY 60 select ARCH_HAS_SET_MEMORY
60 select ARCH_HAS_SG_CHAIN 61 select ARCH_HAS_SG_CHAIN
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index cd20ca0b4043..1fc519f3c49e 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -305,8 +305,6 @@ config DEBUG_ENTRY
305 Some of these sanity checks may slow down kernel entries and 305 Some of these sanity checks may slow down kernel entries and
306 exits or otherwise impact performance. 306 exits or otherwise impact performance.
307 307
308 This is currently used to help test NMI code.
309
310 If unsure, say N. 308 If unsure, say N.
311 309
312config DEBUG_NMI_SELFTEST 310config DEBUG_NMI_SELFTEST
diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile
index 9976fcecd17e..af28a8a24366 100644
--- a/arch/x86/entry/Makefile
+++ b/arch/x86/entry/Makefile
@@ -2,7 +2,6 @@
2# Makefile for the x86 low level entry code 2# Makefile for the x86 low level entry code
3# 3#
4 4
5OBJECT_FILES_NON_STANDARD_entry_$(BITS).o := y
6OBJECT_FILES_NON_STANDARD_entry_64_compat.o := y 5OBJECT_FILES_NON_STANDARD_entry_64_compat.o := y
7 6
8CFLAGS_syscall_64.o += $(call cc-option,-Wno-override-init,) 7CFLAGS_syscall_64.o += $(call cc-option,-Wno-override-init,)
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 05ed3d393da7..640aafebdc00 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -1,4 +1,5 @@
1#include <linux/jump_label.h> 1#include <linux/jump_label.h>
2#include <asm/unwind_hints.h>
2 3
3/* 4/*
4 5
@@ -112,6 +113,7 @@ For 32-bit we have the following conventions - kernel is built with
112 movq %rdx, 12*8+\offset(%rsp) 113 movq %rdx, 12*8+\offset(%rsp)
113 movq %rsi, 13*8+\offset(%rsp) 114 movq %rsi, 13*8+\offset(%rsp)
114 movq %rdi, 14*8+\offset(%rsp) 115 movq %rdi, 14*8+\offset(%rsp)
116 UNWIND_HINT_REGS offset=\offset extra=0
115 .endm 117 .endm
116 .macro SAVE_C_REGS offset=0 118 .macro SAVE_C_REGS offset=0
117 SAVE_C_REGS_HELPER \offset, 1, 1, 1, 1 119 SAVE_C_REGS_HELPER \offset, 1, 1, 1, 1
@@ -136,6 +138,7 @@ For 32-bit we have the following conventions - kernel is built with
136 movq %r12, 3*8+\offset(%rsp) 138 movq %r12, 3*8+\offset(%rsp)
137 movq %rbp, 4*8+\offset(%rsp) 139 movq %rbp, 4*8+\offset(%rsp)
138 movq %rbx, 5*8+\offset(%rsp) 140 movq %rbx, 5*8+\offset(%rsp)
141 UNWIND_HINT_REGS offset=\offset
139 .endm 142 .endm
140 143
141 .macro RESTORE_EXTRA_REGS offset=0 144 .macro RESTORE_EXTRA_REGS offset=0
@@ -145,6 +148,7 @@ For 32-bit we have the following conventions - kernel is built with
145 movq 3*8+\offset(%rsp), %r12 148 movq 3*8+\offset(%rsp), %r12
146 movq 4*8+\offset(%rsp), %rbp 149 movq 4*8+\offset(%rsp), %rbp
147 movq 5*8+\offset(%rsp), %rbx 150 movq 5*8+\offset(%rsp), %rbx
151 UNWIND_HINT_REGS offset=\offset extra=0
148 .endm 152 .endm
149 153
150 .macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1 154 .macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1
@@ -167,6 +171,7 @@ For 32-bit we have the following conventions - kernel is built with
167 .endif 171 .endif
168 movq 13*8(%rsp), %rsi 172 movq 13*8(%rsp), %rsi
169 movq 14*8(%rsp), %rdi 173 movq 14*8(%rsp), %rdi
174 UNWIND_HINT_IRET_REGS offset=16*8
170 .endm 175 .endm
171 .macro RESTORE_C_REGS 176 .macro RESTORE_C_REGS
172 RESTORE_C_REGS_HELPER 1,1,1,1,1 177 RESTORE_C_REGS_HELPER 1,1,1,1,1
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 6d078b89a5e8..64b233ab7cad 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -36,6 +36,7 @@
36#include <asm/smap.h> 36#include <asm/smap.h>
37#include <asm/pgtable_types.h> 37#include <asm/pgtable_types.h>
38#include <asm/export.h> 38#include <asm/export.h>
39#include <asm/frame.h>
39#include <linux/err.h> 40#include <linux/err.h>
40 41
41.code64 42.code64
@@ -43,9 +44,10 @@
43 44
44#ifdef CONFIG_PARAVIRT 45#ifdef CONFIG_PARAVIRT
45ENTRY(native_usergs_sysret64) 46ENTRY(native_usergs_sysret64)
47 UNWIND_HINT_EMPTY
46 swapgs 48 swapgs
47 sysretq 49 sysretq
48ENDPROC(native_usergs_sysret64) 50END(native_usergs_sysret64)
49#endif /* CONFIG_PARAVIRT */ 51#endif /* CONFIG_PARAVIRT */
50 52
51.macro TRACE_IRQS_IRETQ 53.macro TRACE_IRQS_IRETQ
@@ -134,6 +136,7 @@ ENDPROC(native_usergs_sysret64)
134 */ 136 */
135 137
136ENTRY(entry_SYSCALL_64) 138ENTRY(entry_SYSCALL_64)
139 UNWIND_HINT_EMPTY
137 /* 140 /*
138 * Interrupts are off on entry. 141 * Interrupts are off on entry.
139 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, 142 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
@@ -169,6 +172,7 @@ GLOBAL(entry_SYSCALL_64_after_swapgs)
169 pushq %r10 /* pt_regs->r10 */ 172 pushq %r10 /* pt_regs->r10 */
170 pushq %r11 /* pt_regs->r11 */ 173 pushq %r11 /* pt_regs->r11 */
171 sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */ 174 sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */
175 UNWIND_HINT_REGS extra=0
172 176
173 /* 177 /*
174 * If we need to do entry work or if we guess we'll need to do 178 * If we need to do entry work or if we guess we'll need to do
@@ -223,6 +227,7 @@ entry_SYSCALL_64_fastpath:
223 movq EFLAGS(%rsp), %r11 227 movq EFLAGS(%rsp), %r11
224 RESTORE_C_REGS_EXCEPT_RCX_R11 228 RESTORE_C_REGS_EXCEPT_RCX_R11
225 movq RSP(%rsp), %rsp 229 movq RSP(%rsp), %rsp
230 UNWIND_HINT_EMPTY
226 USERGS_SYSRET64 231 USERGS_SYSRET64
227 232
2281: 2331:
@@ -316,6 +321,7 @@ syscall_return_via_sysret:
316 /* rcx and r11 are already restored (see code above) */ 321 /* rcx and r11 are already restored (see code above) */
317 RESTORE_C_REGS_EXCEPT_RCX_R11 322 RESTORE_C_REGS_EXCEPT_RCX_R11
318 movq RSP(%rsp), %rsp 323 movq RSP(%rsp), %rsp
324 UNWIND_HINT_EMPTY
319 USERGS_SYSRET64 325 USERGS_SYSRET64
320 326
321opportunistic_sysret_failed: 327opportunistic_sysret_failed:
@@ -343,6 +349,7 @@ ENTRY(stub_ptregs_64)
343 DISABLE_INTERRUPTS(CLBR_ANY) 349 DISABLE_INTERRUPTS(CLBR_ANY)
344 TRACE_IRQS_OFF 350 TRACE_IRQS_OFF
345 popq %rax 351 popq %rax
352 UNWIND_HINT_REGS extra=0
346 jmp entry_SYSCALL64_slow_path 353 jmp entry_SYSCALL64_slow_path
347 354
3481: 3551:
@@ -351,6 +358,7 @@ END(stub_ptregs_64)
351 358
352.macro ptregs_stub func 359.macro ptregs_stub func
353ENTRY(ptregs_\func) 360ENTRY(ptregs_\func)
361 UNWIND_HINT_FUNC
354 leaq \func(%rip), %rax 362 leaq \func(%rip), %rax
355 jmp stub_ptregs_64 363 jmp stub_ptregs_64
356END(ptregs_\func) 364END(ptregs_\func)
@@ -367,6 +375,7 @@ END(ptregs_\func)
367 * %rsi: next task 375 * %rsi: next task
368 */ 376 */
369ENTRY(__switch_to_asm) 377ENTRY(__switch_to_asm)
378 UNWIND_HINT_FUNC
370 /* 379 /*
371 * Save callee-saved registers 380 * Save callee-saved registers
372 * This must match the order in inactive_task_frame 381 * This must match the order in inactive_task_frame
@@ -406,6 +415,7 @@ END(__switch_to_asm)
406 * r12: kernel thread arg 415 * r12: kernel thread arg
407 */ 416 */
408ENTRY(ret_from_fork) 417ENTRY(ret_from_fork)
418 UNWIND_HINT_EMPTY
409 movq %rax, %rdi 419 movq %rax, %rdi
410 call schedule_tail /* rdi: 'prev' task parameter */ 420 call schedule_tail /* rdi: 'prev' task parameter */
411 421
@@ -413,6 +423,7 @@ ENTRY(ret_from_fork)
413 jnz 1f /* kernel threads are uncommon */ 423 jnz 1f /* kernel threads are uncommon */
414 424
4152: 4252:
426 UNWIND_HINT_REGS
416 movq %rsp, %rdi 427 movq %rsp, %rdi
417 call syscall_return_slowpath /* returns with IRQs disabled */ 428 call syscall_return_slowpath /* returns with IRQs disabled */
418 TRACE_IRQS_ON /* user mode is traced as IRQS on */ 429 TRACE_IRQS_ON /* user mode is traced as IRQS on */
@@ -440,13 +451,102 @@ END(ret_from_fork)
440ENTRY(irq_entries_start) 451ENTRY(irq_entries_start)
441 vector=FIRST_EXTERNAL_VECTOR 452 vector=FIRST_EXTERNAL_VECTOR
442 .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) 453 .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
454 UNWIND_HINT_IRET_REGS
443 pushq $(~vector+0x80) /* Note: always in signed byte range */ 455 pushq $(~vector+0x80) /* Note: always in signed byte range */
444 vector=vector+1
445 jmp common_interrupt 456 jmp common_interrupt
446 .align 8 457 .align 8
458 vector=vector+1
447 .endr 459 .endr
448END(irq_entries_start) 460END(irq_entries_start)
449 461
462.macro DEBUG_ENTRY_ASSERT_IRQS_OFF
463#ifdef CONFIG_DEBUG_ENTRY
464 pushfq
465 testl $X86_EFLAGS_IF, (%rsp)
466 jz .Lokay_\@
467 ud2
468.Lokay_\@:
469 addq $8, %rsp
470#endif
471.endm
472
473/*
474 * Enters the IRQ stack if we're not already using it. NMI-safe. Clobbers
475 * flags and puts old RSP into old_rsp, and leaves all other GPRs alone.
476 * Requires kernel GSBASE.
477 *
478 * The invariant is that, if irq_count != -1, then the IRQ stack is in use.
479 */
480.macro ENTER_IRQ_STACK regs=1 old_rsp
481 DEBUG_ENTRY_ASSERT_IRQS_OFF
482 movq %rsp, \old_rsp
483
484 .if \regs
485 UNWIND_HINT_REGS base=\old_rsp
486 .endif
487
488 incl PER_CPU_VAR(irq_count)
489 jnz .Lirq_stack_push_old_rsp_\@
490
491 /*
492 * Right now, if we just incremented irq_count to zero, we've
493 * claimed the IRQ stack but we haven't switched to it yet.
494 *
495 * If anything is added that can interrupt us here without using IST,
496 * it must be *extremely* careful to limit its stack usage. This
497 * could include kprobes and a hypothetical future IST-less #DB
498 * handler.
499 *
500 * The OOPS unwinder relies on the word at the top of the IRQ
501 * stack linking back to the previous RSP for the entire time we're
502 * on the IRQ stack. For this to work reliably, we need to write
503 * it before we actually move ourselves to the IRQ stack.
504 */
505
506 movq \old_rsp, PER_CPU_VAR(irq_stack_union + IRQ_STACK_SIZE - 8)
507 movq PER_CPU_VAR(irq_stack_ptr), %rsp
508
509#ifdef CONFIG_DEBUG_ENTRY
510 /*
511 * If the first movq above becomes wrong due to IRQ stack layout
512 * changes, the only way we'll notice is if we try to unwind right
513 * here. Assert that we set up the stack right to catch this type
514 * of bug quickly.
515 */
516 cmpq -8(%rsp), \old_rsp
517 je .Lirq_stack_okay\@
518 ud2
519 .Lirq_stack_okay\@:
520#endif
521
522.Lirq_stack_push_old_rsp_\@:
523 pushq \old_rsp
524
525 .if \regs
526 UNWIND_HINT_REGS indirect=1
527 .endif
528.endm
529
530/*
531 * Undoes ENTER_IRQ_STACK.
532 */
533.macro LEAVE_IRQ_STACK regs=1
534 DEBUG_ENTRY_ASSERT_IRQS_OFF
535 /* We need to be off the IRQ stack before decrementing irq_count. */
536 popq %rsp
537
538 .if \regs
539 UNWIND_HINT_REGS
540 .endif
541
542 /*
 543	 * As in ENTER_IRQ_STACK, there is a window here where irq_count
 544	 * still claims the IRQ stack even though we're no longer on it.
545 */
546
547 decl PER_CPU_VAR(irq_count)
548.endm
549
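
The macros above encode a small state machine around irq_count: -1 means the per-CPU IRQ stack is free, 0 means the outermost claim just switched onto it, higher values mean nested use on the same stack, and the word written at the top of the IRQ stack links back to the previous stack for the unwinder. A C model of that bookkeeping (purely illustrative; the real per-CPU layout and offsets are simplified away):

	#include <stdint.h>
	#include <stdio.h>

	#define IRQ_STACK_WORDS	512

	static uintptr_t irq_stack[IRQ_STACK_WORDS];

	struct cpu {
		int		irq_count;	/* -1 when the IRQ stack is unused */
		uintptr_t	rsp;		/* current "stack pointer" */
	};

	static void enter_irq_stack(struct cpu *c, uintptr_t *old_rsp)
	{
		*old_rsp = c->rsp;
		if (++c->irq_count == 0) {
			/*
			 * First entry: write the back-link word at the top
			 * of the IRQ stack *before* switching to it, so an
			 * unwinder can always walk back to the old stack.
			 */
			irq_stack[IRQ_STACK_WORDS - 1] = *old_rsp;
			c->rsp = (uintptr_t)&irq_stack[IRQ_STACK_WORDS - 1];
		}
		/* both paths then push old_rsp (pushq \old_rsp above) */
		c->rsp -= sizeof(uintptr_t);
		*(uintptr_t *)c->rsp = *old_rsp;
	}

	static void leave_irq_stack(struct cpu *c)
	{
		/* popq %rsp: get off the IRQ stack first ... */
		c->rsp = *(uintptr_t *)c->rsp;
		/* ... and only then drop the claim. */
		c->irq_count--;
	}

	int main(void)
	{
		struct cpu c = { .irq_count = -1, .rsp = (uintptr_t)&c };
		uintptr_t saved;

		enter_irq_stack(&c, &saved);
		leave_irq_stack(&c);
		printf("irq_count=%d restored=%d\n", c.irq_count, c.rsp == saved);
		return 0;
	}
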
450/* 550/*
451 * Interrupt entry/exit. 551 * Interrupt entry/exit.
452 * 552 *
@@ -485,17 +585,7 @@ END(irq_entries_start)
485 CALL_enter_from_user_mode 585 CALL_enter_from_user_mode
486 586
4871: 5871:
488 /* 588 ENTER_IRQ_STACK old_rsp=%rdi
489 * Save previous stack pointer, optionally switch to interrupt stack.
490 * irq_count is used to check if a CPU is already on an interrupt stack
491 * or not. While this is essentially redundant with preempt_count it is
492 * a little cheaper to use a separate counter in the PDA (short of
493 * moving irq_enter into assembly, which would be too much work)
494 */
495 movq %rsp, %rdi
496 incl PER_CPU_VAR(irq_count)
497 cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp
498 pushq %rdi
499 /* We entered an interrupt context - irqs are off: */ 589 /* We entered an interrupt context - irqs are off: */
500 TRACE_IRQS_OFF 590 TRACE_IRQS_OFF
501 591
@@ -515,10 +605,8 @@ common_interrupt:
515ret_from_intr: 605ret_from_intr:
516 DISABLE_INTERRUPTS(CLBR_ANY) 606 DISABLE_INTERRUPTS(CLBR_ANY)
517 TRACE_IRQS_OFF 607 TRACE_IRQS_OFF
518 decl PER_CPU_VAR(irq_count)
519 608
520 /* Restore saved previous stack */ 609 LEAVE_IRQ_STACK
521 popq %rsp
522 610
523 testb $3, CS(%rsp) 611 testb $3, CS(%rsp)
524 jz retint_kernel 612 jz retint_kernel
@@ -561,6 +649,7 @@ restore_c_regs_and_iret:
561 INTERRUPT_RETURN 649 INTERRUPT_RETURN
562 650
563ENTRY(native_iret) 651ENTRY(native_iret)
652 UNWIND_HINT_IRET_REGS
564 /* 653 /*
565 * Are we returning to a stack segment from the LDT? Note: in 654 * Are we returning to a stack segment from the LDT? Note: in
566 * 64-bit mode SS:RSP on the exception stack is always valid. 655 * 64-bit mode SS:RSP on the exception stack is always valid.
@@ -633,6 +722,7 @@ native_irq_return_ldt:
633 orq PER_CPU_VAR(espfix_stack), %rax 722 orq PER_CPU_VAR(espfix_stack), %rax
634 SWAPGS 723 SWAPGS
635 movq %rax, %rsp 724 movq %rax, %rsp
725 UNWIND_HINT_IRET_REGS offset=8
636 726
637 /* 727 /*
638 * At this point, we cannot write to the stack any more, but we can 728 * At this point, we cannot write to the stack any more, but we can
@@ -654,6 +744,7 @@ END(common_interrupt)
654 */ 744 */
655.macro apicinterrupt3 num sym do_sym 745.macro apicinterrupt3 num sym do_sym
656ENTRY(\sym) 746ENTRY(\sym)
747 UNWIND_HINT_IRET_REGS
657 ASM_CLAC 748 ASM_CLAC
658 pushq $~(\num) 749 pushq $~(\num)
659.Lcommon_\sym: 750.Lcommon_\sym:
@@ -740,6 +831,8 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt
740 831
741.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 832.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
742ENTRY(\sym) 833ENTRY(\sym)
834 UNWIND_HINT_IRET_REGS offset=8
835
743 /* Sanity check */ 836 /* Sanity check */
744 .if \shift_ist != -1 && \paranoid == 0 837 .if \shift_ist != -1 && \paranoid == 0
745 .error "using shift_ist requires paranoid=1" 838 .error "using shift_ist requires paranoid=1"
@@ -763,6 +856,7 @@ ENTRY(\sym)
763 .else 856 .else
764 call error_entry 857 call error_entry
765 .endif 858 .endif
859 UNWIND_HINT_REGS
766 /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */ 860 /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */
767 861
768 .if \paranoid 862 .if \paranoid
@@ -860,6 +954,7 @@ idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0
860 * edi: new selector 954 * edi: new selector
861 */ 955 */
862ENTRY(native_load_gs_index) 956ENTRY(native_load_gs_index)
957 FRAME_BEGIN
863 pushfq 958 pushfq
864 DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI) 959 DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)
865 SWAPGS 960 SWAPGS
@@ -868,8 +963,9 @@ ENTRY(native_load_gs_index)
8682: ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE 9632: ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE
869 SWAPGS 964 SWAPGS
870 popfq 965 popfq
966 FRAME_END
871 ret 967 ret
872END(native_load_gs_index) 968ENDPROC(native_load_gs_index)
873EXPORT_SYMBOL(native_load_gs_index) 969EXPORT_SYMBOL(native_load_gs_index)
874 970
875 _ASM_EXTABLE(.Lgs_change, bad_gs) 971 _ASM_EXTABLE(.Lgs_change, bad_gs)
@@ -892,14 +988,12 @@ bad_gs:
892ENTRY(do_softirq_own_stack) 988ENTRY(do_softirq_own_stack)
893 pushq %rbp 989 pushq %rbp
894 mov %rsp, %rbp 990 mov %rsp, %rbp
895 incl PER_CPU_VAR(irq_count) 991 ENTER_IRQ_STACK regs=0 old_rsp=%r11
896 cmove PER_CPU_VAR(irq_stack_ptr), %rsp
897 push %rbp /* frame pointer backlink */
898 call __do_softirq 992 call __do_softirq
993 LEAVE_IRQ_STACK regs=0
899 leaveq 994 leaveq
900 decl PER_CPU_VAR(irq_count)
901 ret 995 ret
902END(do_softirq_own_stack) 996ENDPROC(do_softirq_own_stack)
903 997
904#ifdef CONFIG_XEN 998#ifdef CONFIG_XEN
905idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0 999idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0
@@ -923,14 +1017,14 @@ ENTRY(xen_do_hypervisor_callback) /* do_hypervisor_callback(struct *pt_regs) */
923 * Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will 1017 * Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will
924 * see the correct pointer to the pt_regs 1018 * see the correct pointer to the pt_regs
925 */ 1019 */
1020 UNWIND_HINT_FUNC
926 movq %rdi, %rsp /* we don't return, adjust the stack frame */ 1021 movq %rdi, %rsp /* we don't return, adjust the stack frame */
92711: incl PER_CPU_VAR(irq_count) 1022 UNWIND_HINT_REGS
928 movq %rsp, %rbp 1023
929 cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp 1024 ENTER_IRQ_STACK old_rsp=%r10
930 pushq %rbp /* frame pointer backlink */
931 call xen_evtchn_do_upcall 1025 call xen_evtchn_do_upcall
932 popq %rsp 1026 LEAVE_IRQ_STACK
933 decl PER_CPU_VAR(irq_count) 1027
934#ifndef CONFIG_PREEMPT 1028#ifndef CONFIG_PREEMPT
935 call xen_maybe_preempt_hcall 1029 call xen_maybe_preempt_hcall
936#endif 1030#endif
@@ -951,6 +1045,7 @@ END(xen_do_hypervisor_callback)
951 * with its current contents: any discrepancy means we in category 1. 1045 * with its current contents: any discrepancy means we in category 1.
952 */ 1046 */
953ENTRY(xen_failsafe_callback) 1047ENTRY(xen_failsafe_callback)
1048 UNWIND_HINT_EMPTY
954 movl %ds, %ecx 1049 movl %ds, %ecx
955 cmpw %cx, 0x10(%rsp) 1050 cmpw %cx, 0x10(%rsp)
956 jne 1f 1051 jne 1f
@@ -970,11 +1065,13 @@ ENTRY(xen_failsafe_callback)
970 pushq $0 /* RIP */ 1065 pushq $0 /* RIP */
971 pushq %r11 1066 pushq %r11
972 pushq %rcx 1067 pushq %rcx
1068 UNWIND_HINT_IRET_REGS offset=8
973 jmp general_protection 1069 jmp general_protection
9741: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */ 10701: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
975 movq (%rsp), %rcx 1071 movq (%rsp), %rcx
976 movq 8(%rsp), %r11 1072 movq 8(%rsp), %r11
977 addq $0x30, %rsp 1073 addq $0x30, %rsp
1074 UNWIND_HINT_IRET_REGS
978 pushq $-1 /* orig_ax = -1 => not a system call */ 1075 pushq $-1 /* orig_ax = -1 => not a system call */
979 ALLOC_PT_GPREGS_ON_STACK 1076 ALLOC_PT_GPREGS_ON_STACK
980 SAVE_C_REGS 1077 SAVE_C_REGS
@@ -1020,6 +1117,7 @@ idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vec
1020 * Return: ebx=0: need swapgs on exit, ebx=1: otherwise 1117 * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
1021 */ 1118 */
1022ENTRY(paranoid_entry) 1119ENTRY(paranoid_entry)
1120 UNWIND_HINT_FUNC
1023 cld 1121 cld
1024 SAVE_C_REGS 8 1122 SAVE_C_REGS 8
1025 SAVE_EXTRA_REGS 8 1123 SAVE_EXTRA_REGS 8
@@ -1047,6 +1145,7 @@ END(paranoid_entry)
1047 * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) 1145 * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it)
1048 */ 1146 */
1049ENTRY(paranoid_exit) 1147ENTRY(paranoid_exit)
1148 UNWIND_HINT_REGS
1050 DISABLE_INTERRUPTS(CLBR_ANY) 1149 DISABLE_INTERRUPTS(CLBR_ANY)
1051 TRACE_IRQS_OFF_DEBUG 1150 TRACE_IRQS_OFF_DEBUG
1052 testl %ebx, %ebx /* swapgs needed? */ 1151 testl %ebx, %ebx /* swapgs needed? */
@@ -1068,6 +1167,7 @@ END(paranoid_exit)
1068 * Return: EBX=0: came from user mode; EBX=1: otherwise 1167 * Return: EBX=0: came from user mode; EBX=1: otherwise
1069 */ 1168 */
1070ENTRY(error_entry) 1169ENTRY(error_entry)
1170 UNWIND_HINT_FUNC
1071 cld 1171 cld
1072 SAVE_C_REGS 8 1172 SAVE_C_REGS 8
1073 SAVE_EXTRA_REGS 8 1173 SAVE_EXTRA_REGS 8
@@ -1152,6 +1252,7 @@ END(error_entry)
1152 * 0: user gsbase is loaded, we need SWAPGS and standard preparation for return to usermode 1252 * 0: user gsbase is loaded, we need SWAPGS and standard preparation for return to usermode
1153 */ 1253 */
1154ENTRY(error_exit) 1254ENTRY(error_exit)
1255 UNWIND_HINT_REGS
1155 DISABLE_INTERRUPTS(CLBR_ANY) 1256 DISABLE_INTERRUPTS(CLBR_ANY)
1156 TRACE_IRQS_OFF 1257 TRACE_IRQS_OFF
1157 testl %ebx, %ebx 1258 testl %ebx, %ebx
@@ -1161,6 +1262,7 @@ END(error_exit)
1161 1262
1162/* Runs on exception stack */ 1263/* Runs on exception stack */
1163ENTRY(nmi) 1264ENTRY(nmi)
1265 UNWIND_HINT_IRET_REGS
1164 /* 1266 /*
1165 * Fix up the exception frame if we're on Xen. 1267 * Fix up the exception frame if we're on Xen.
1166 * PARAVIRT_ADJUST_EXCEPTION_FRAME is guaranteed to push at most 1268 * PARAVIRT_ADJUST_EXCEPTION_FRAME is guaranteed to push at most
@@ -1234,11 +1336,13 @@ ENTRY(nmi)
1234 cld 1336 cld
1235 movq %rsp, %rdx 1337 movq %rsp, %rdx
1236 movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp 1338 movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
1339 UNWIND_HINT_IRET_REGS base=%rdx offset=8
1237 pushq 5*8(%rdx) /* pt_regs->ss */ 1340 pushq 5*8(%rdx) /* pt_regs->ss */
1238 pushq 4*8(%rdx) /* pt_regs->rsp */ 1341 pushq 4*8(%rdx) /* pt_regs->rsp */
1239 pushq 3*8(%rdx) /* pt_regs->flags */ 1342 pushq 3*8(%rdx) /* pt_regs->flags */
1240 pushq 2*8(%rdx) /* pt_regs->cs */ 1343 pushq 2*8(%rdx) /* pt_regs->cs */
1241 pushq 1*8(%rdx) /* pt_regs->rip */ 1344 pushq 1*8(%rdx) /* pt_regs->rip */
1345 UNWIND_HINT_IRET_REGS
1242 pushq $-1 /* pt_regs->orig_ax */ 1346 pushq $-1 /* pt_regs->orig_ax */
1243 pushq %rdi /* pt_regs->di */ 1347 pushq %rdi /* pt_regs->di */
1244 pushq %rsi /* pt_regs->si */ 1348 pushq %rsi /* pt_regs->si */
@@ -1255,6 +1359,7 @@ ENTRY(nmi)
1255 pushq %r13 /* pt_regs->r13 */ 1359 pushq %r13 /* pt_regs->r13 */
1256 pushq %r14 /* pt_regs->r14 */ 1360 pushq %r14 /* pt_regs->r14 */
1257 pushq %r15 /* pt_regs->r15 */ 1361 pushq %r15 /* pt_regs->r15 */
1362 UNWIND_HINT_REGS
1258 ENCODE_FRAME_POINTER 1363 ENCODE_FRAME_POINTER
1259 1364
1260 /* 1365 /*
@@ -1409,6 +1514,7 @@ first_nmi:
1409 .rept 5 1514 .rept 5
1410 pushq 11*8(%rsp) 1515 pushq 11*8(%rsp)
1411 .endr 1516 .endr
1517 UNWIND_HINT_IRET_REGS
1412 1518
1413 /* Everything up to here is safe from nested NMIs */ 1519 /* Everything up to here is safe from nested NMIs */
1414 1520
@@ -1424,6 +1530,7 @@ first_nmi:
1424 pushq $__KERNEL_CS /* CS */ 1530 pushq $__KERNEL_CS /* CS */
1425 pushq $1f /* RIP */ 1531 pushq $1f /* RIP */
1426 INTERRUPT_RETURN /* continues at repeat_nmi below */ 1532 INTERRUPT_RETURN /* continues at repeat_nmi below */
1533 UNWIND_HINT_IRET_REGS
14271: 15341:
1428#endif 1535#endif
1429 1536
@@ -1473,6 +1580,7 @@ end_repeat_nmi:
1473 * exceptions might do. 1580 * exceptions might do.
1474 */ 1581 */
1475 call paranoid_entry 1582 call paranoid_entry
1583 UNWIND_HINT_REGS
1476 1584
1477 /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ 1585 /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
1478 movq %rsp, %rdi 1586 movq %rsp, %rdi
@@ -1510,17 +1618,19 @@ nmi_restore:
1510END(nmi) 1618END(nmi)
1511 1619
1512ENTRY(ignore_sysret) 1620ENTRY(ignore_sysret)
1621 UNWIND_HINT_EMPTY
1513 mov $-ENOSYS, %eax 1622 mov $-ENOSYS, %eax
1514 sysret 1623 sysret
1515END(ignore_sysret) 1624END(ignore_sysret)
1516 1625
1517ENTRY(rewind_stack_do_exit) 1626ENTRY(rewind_stack_do_exit)
1627 UNWIND_HINT_FUNC
1518 /* Prevent any naive code from trying to unwind to our caller. */ 1628 /* Prevent any naive code from trying to unwind to our caller. */
1519 xorl %ebp, %ebp 1629 xorl %ebp, %ebp
1520 1630
1521 movq PER_CPU_VAR(cpu_current_top_of_stack), %rax 1631 movq PER_CPU_VAR(cpu_current_top_of_stack), %rax
1522 leaq -TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE(%rax), %rsp 1632 leaq -PTREGS_SIZE(%rax), %rsp
1633 UNWIND_HINT_FUNC sp_offset=PTREGS_SIZE
1523 1634
1524 call do_exit 1635 call do_exit
15251: jmp 1b
1526END(rewind_stack_do_exit) 1636END(rewind_stack_do_exit)
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index 7a9df3beb89b..676ee5807d86 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -74,6 +74,9 @@
74# define _ASM_EXTABLE_EX(from, to) \ 74# define _ASM_EXTABLE_EX(from, to) \
75 _ASM_EXTABLE_HANDLE(from, to, ex_handler_ext) 75 _ASM_EXTABLE_HANDLE(from, to, ex_handler_ext)
76 76
77# define _ASM_EXTABLE_REFCOUNT(from, to) \
78 _ASM_EXTABLE_HANDLE(from, to, ex_handler_refcount)
79
77# define _ASM_NOKPROBE(entry) \ 80# define _ASM_NOKPROBE(entry) \
78 .pushsection "_kprobe_blacklist","aw" ; \ 81 .pushsection "_kprobe_blacklist","aw" ; \
79 _ASM_ALIGN ; \ 82 _ASM_ALIGN ; \
@@ -123,6 +126,9 @@
123# define _ASM_EXTABLE_EX(from, to) \ 126# define _ASM_EXTABLE_EX(from, to) \
124 _ASM_EXTABLE_HANDLE(from, to, ex_handler_ext) 127 _ASM_EXTABLE_HANDLE(from, to, ex_handler_ext)
125 128
129# define _ASM_EXTABLE_REFCOUNT(from, to) \
130 _ASM_EXTABLE_HANDLE(from, to, ex_handler_refcount)
131
126/* For C file, we already have NOKPROBE_SYMBOL macro */ 132/* For C file, we already have NOKPROBE_SYMBOL macro */
127#endif 133#endif
128 134
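
_ASM_EXTABLE_REFCOUNT() ties the trapping instruction emitted by the refcount macros to a dedicated exception-table handler (ex_handler_refcount, added to arch/x86/mm/extable.c in this series). Conceptually, an exception-table entry maps a faulting instruction address to a resume address plus a handler that decides how to recover; the sketch below uses invented structures to show that lookup and is not the kernel's actual layout:

	#include <stdbool.h>
	#include <stddef.h>
	#include <stdint.h>
	#include <stdio.h>

	struct fake_regs { uintptr_t ip; };

	struct extable_entry {
		uintptr_t insn;		/* address of the faulting instruction */
		uintptr_t fixup;	/* where execution should resume */
		bool (*handler)(const struct extable_entry *, struct fake_regs *);
	};

	static bool handler_refcount(const struct extable_entry *e,
				     struct fake_regs *regs)
	{
		/*
		 * A real handler would saturate the refcount and warn; here
		 * we only redirect execution past the trapping instruction.
		 */
		regs->ip = e->fixup;
		return true;
	}

	static bool fixup_exception(const struct extable_entry *table, size_t n,
				    struct fake_regs *regs)
	{
		for (size_t i = 0; i < n; i++)
			if (table[i].insn == regs->ip)
				return table[i].handler(&table[i], regs);
		return false;
	}

	int main(void)
	{
		struct fake_regs regs = { .ip = 0x1112 };	/* pretend fault address */
		const struct extable_entry table[] = {
			{ .insn = 0x1112, .fixup = 0x1113, .handler = handler_refcount },
		};

		if (fixup_exception(table, 1, &regs))
			printf("resumed at %#lx\n", (unsigned long)regs.ip);
		return 0;
	}
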
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index 33380b871463..0874ebda3069 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -197,35 +197,56 @@ static inline int atomic_xchg(atomic_t *v, int new)
197 return xchg(&v->counter, new); 197 return xchg(&v->counter, new);
198} 198}
199 199
200#define ATOMIC_OP(op) \ 200static inline void atomic_and(int i, atomic_t *v)
201static inline void atomic_##op(int i, atomic_t *v) \ 201{
202{ \ 202 asm volatile(LOCK_PREFIX "andl %1,%0"
203 asm volatile(LOCK_PREFIX #op"l %1,%0" \ 203 : "+m" (v->counter)
204 : "+m" (v->counter) \ 204 : "ir" (i)
205 : "ir" (i) \ 205 : "memory");
206 : "memory"); \ 206}
207
208static inline int atomic_fetch_and(int i, atomic_t *v)
209{
210 int val = atomic_read(v);
211
212 do { } while (!atomic_try_cmpxchg(v, &val, val & i));
213
214 return val;
207} 215}
208 216
209#define ATOMIC_FETCH_OP(op, c_op) \ 217static inline void atomic_or(int i, atomic_t *v)
210static inline int atomic_fetch_##op(int i, atomic_t *v) \ 218{
211{ \ 219 asm volatile(LOCK_PREFIX "orl %1,%0"
212 int val = atomic_read(v); \ 220 : "+m" (v->counter)
213 do { \ 221 : "ir" (i)
214 } while (!atomic_try_cmpxchg(v, &val, val c_op i)); \ 222 : "memory");
215 return val; \
216} 223}
217 224
218#define ATOMIC_OPS(op, c_op) \ 225static inline int atomic_fetch_or(int i, atomic_t *v)
219 ATOMIC_OP(op) \ 226{
220 ATOMIC_FETCH_OP(op, c_op) 227 int val = atomic_read(v);
221 228
222ATOMIC_OPS(and, &) 229 do { } while (!atomic_try_cmpxchg(v, &val, val | i));
223ATOMIC_OPS(or , |)
224ATOMIC_OPS(xor, ^)
225 230
226#undef ATOMIC_OPS 231 return val;
227#undef ATOMIC_FETCH_OP 232}
228#undef ATOMIC_OP 233
234static inline void atomic_xor(int i, atomic_t *v)
235{
236 asm volatile(LOCK_PREFIX "xorl %1,%0"
237 : "+m" (v->counter)
238 : "ir" (i)
239 : "memory");
240}
241
242static inline int atomic_fetch_xor(int i, atomic_t *v)
243{
244 int val = atomic_read(v);
245
246 do { } while (!atomic_try_cmpxchg(v, &val, val ^ i));
247
248 return val;
249}
229 250
230/** 251/**
231 * __atomic_add_unless - add unless the number is already a given value 252 * __atomic_add_unless - add unless the number is already a given value
@@ -239,10 +260,12 @@ ATOMIC_OPS(xor, ^)
239static __always_inline int __atomic_add_unless(atomic_t *v, int a, int u) 260static __always_inline int __atomic_add_unless(atomic_t *v, int a, int u)
240{ 261{
241 int c = atomic_read(v); 262 int c = atomic_read(v);
263
242 do { 264 do {
243 if (unlikely(c == u)) 265 if (unlikely(c == u))
244 break; 266 break;
245 } while (!atomic_try_cmpxchg(v, &c, c + a)); 267 } while (!atomic_try_cmpxchg(v, &c, c + a));
268
246 return c; 269 return c;
247} 270}
248 271
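
The empty do/while bodies above are not a mistake: atomic_try_cmpxchg() rewrites the caller's expected value with the freshly observed counter whenever the exchange fails, so every retry automatically uses the latest value. The same shape in standalone C, with GCC's __atomic_compare_exchange_n standing in for atomic_try_cmpxchg():

	#include <stdio.h>

	static int counter = 0xf0;

	static int fetch_and(int i)
	{
		int val = __atomic_load_n(&counter, __ATOMIC_RELAXED);

		do {
			/* on failure, 'val' is reloaded with *counter */
		} while (!__atomic_compare_exchange_n(&counter, &val, val & i,
						      1 /* weak */,
						      __ATOMIC_SEQ_CST,
						      __ATOMIC_RELAXED));
		return val;
	}

	int main(void)
	{
		int old = fetch_and(0x0f);

		printf("old=%#x new=%#x\n", old, counter);	/* old=0xf0 new=0 */
		return 0;
	}
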
diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
index 71d7705fb303..9e206f31ce2a 100644
--- a/arch/x86/include/asm/atomic64_32.h
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -312,37 +312,70 @@ static inline long long atomic64_dec_if_positive(atomic64_t *v)
312#undef alternative_atomic64 312#undef alternative_atomic64
313#undef __alternative_atomic64 313#undef __alternative_atomic64
314 314
315#define ATOMIC64_OP(op, c_op) \ 315static inline void atomic64_and(long long i, atomic64_t *v)
316static inline void atomic64_##op(long long i, atomic64_t *v) \ 316{
317{ \ 317 long long old, c = 0;
318 long long old, c = 0; \ 318
319 while ((old = atomic64_cmpxchg(v, c, c c_op i)) != c) \ 319 while ((old = atomic64_cmpxchg(v, c, c & i)) != c)
320 c = old; \ 320 c = old;
321} 321}
322 322
323#define ATOMIC64_FETCH_OP(op, c_op) \ 323static inline long long atomic64_fetch_and(long long i, atomic64_t *v)
324static inline long long atomic64_fetch_##op(long long i, atomic64_t *v) \ 324{
325{ \ 325 long long old, c = 0;
326 long long old, c = 0; \ 326
327 while ((old = atomic64_cmpxchg(v, c, c c_op i)) != c) \ 327 while ((old = atomic64_cmpxchg(v, c, c & i)) != c)
328 c = old; \ 328 c = old;
329 return old; \ 329
330 return old;
330} 331}
331 332
332ATOMIC64_FETCH_OP(add, +) 333static inline void atomic64_or(long long i, atomic64_t *v)
334{
335 long long old, c = 0;
333 336
334#define atomic64_fetch_sub(i, v) atomic64_fetch_add(-(i), (v)) 337 while ((old = atomic64_cmpxchg(v, c, c | i)) != c)
338 c = old;
339}
340
341static inline long long atomic64_fetch_or(long long i, atomic64_t *v)
342{
343 long long old, c = 0;
344
345 while ((old = atomic64_cmpxchg(v, c, c | i)) != c)
346 c = old;
347
348 return old;
349}
335 350
336#define ATOMIC64_OPS(op, c_op) \ 351static inline void atomic64_xor(long long i, atomic64_t *v)
337 ATOMIC64_OP(op, c_op) \ 352{
338 ATOMIC64_FETCH_OP(op, c_op) 353 long long old, c = 0;
354
355 while ((old = atomic64_cmpxchg(v, c, c ^ i)) != c)
356 c = old;
357}
339 358
340ATOMIC64_OPS(and, &) 359static inline long long atomic64_fetch_xor(long long i, atomic64_t *v)
341ATOMIC64_OPS(or, |) 360{
342ATOMIC64_OPS(xor, ^) 361 long long old, c = 0;
362
363 while ((old = atomic64_cmpxchg(v, c, c ^ i)) != c)
364 c = old;
365
366 return old;
367}
343 368
344#undef ATOMIC64_OPS 369static inline long long atomic64_fetch_add(long long i, atomic64_t *v)
345#undef ATOMIC64_FETCH_OP 370{
346#undef ATOMIC64_OP 371 long long old, c = 0;
372
373 while ((old = atomic64_cmpxchg(v, c, c + i)) != c)
374 c = old;
375
376 return old;
377}
378
379#define atomic64_fetch_sub(i, v) atomic64_fetch_add(-(i), (v))
347 380
348#endif /* _ASM_X86_ATOMIC64_32_H */ 381#endif /* _ASM_X86_ATOMIC64_32_H */
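
Without a native 64-bit LOCK-able AND/OR/XOR on 32-bit x86, the helpers above are built from atomic64_cmpxchg() alone: start from any guess (0 here) and keep retrying with whatever value the cmpxchg reported until the guess matches. A standalone sketch of that loop, with __sync_val_compare_and_swap() in place of atomic64_cmpxchg():

	#include <stdio.h>

	static long long v64 = 0x1234;

	static long long fetch_or64(long long i)
	{
		long long old, c = 0;	/* any initial guess converges */

		while ((old = __sync_val_compare_and_swap(&v64, c, c | i)) != c)
			c = old;	/* retry with the value we actually saw */

		return old;
	}

	int main(void)
	{
		long long old = fetch_or64(0xff00);

		printf("old=%#llx new=%#llx\n", old, v64);	/* 0x1234 / 0xff34 */
		return 0;
	}
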
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
index 6189a433c9a9..5d9de36a2f04 100644
--- a/arch/x86/include/asm/atomic64_64.h
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -177,7 +177,7 @@ static inline long atomic64_cmpxchg(atomic64_t *v, long old, long new)
177} 177}
178 178
179#define atomic64_try_cmpxchg atomic64_try_cmpxchg 179#define atomic64_try_cmpxchg atomic64_try_cmpxchg
180static __always_inline bool atomic64_try_cmpxchg(atomic64_t *v, long *old, long new) 180static __always_inline bool atomic64_try_cmpxchg(atomic64_t *v, s64 *old, long new)
181{ 181{
182 return try_cmpxchg(&v->counter, old, new); 182 return try_cmpxchg(&v->counter, old, new);
183} 183}
@@ -198,7 +198,7 @@ static inline long atomic64_xchg(atomic64_t *v, long new)
198 */ 198 */
199static inline bool atomic64_add_unless(atomic64_t *v, long a, long u) 199static inline bool atomic64_add_unless(atomic64_t *v, long a, long u)
200{ 200{
201 long c = atomic64_read(v); 201 s64 c = atomic64_read(v);
202 do { 202 do {
203 if (unlikely(c == u)) 203 if (unlikely(c == u))
204 return false; 204 return false;
@@ -217,7 +217,7 @@ static inline bool atomic64_add_unless(atomic64_t *v, long a, long u)
217 */ 217 */
218static inline long atomic64_dec_if_positive(atomic64_t *v) 218static inline long atomic64_dec_if_positive(atomic64_t *v)
219{ 219{
220 long dec, c = atomic64_read(v); 220 s64 dec, c = atomic64_read(v);
221 do { 221 do {
222 dec = c - 1; 222 dec = c - 1;
223 if (unlikely(dec < 0)) 223 if (unlikely(dec < 0))
@@ -226,34 +226,55 @@ static inline long atomic64_dec_if_positive(atomic64_t *v)
226 return dec; 226 return dec;
227} 227}
228 228
229#define ATOMIC64_OP(op) \ 229static inline void atomic64_and(long i, atomic64_t *v)
230static inline void atomic64_##op(long i, atomic64_t *v) \ 230{
231{ \ 231 asm volatile(LOCK_PREFIX "andq %1,%0"
232 asm volatile(LOCK_PREFIX #op"q %1,%0" \ 232 : "+m" (v->counter)
233 : "+m" (v->counter) \ 233 : "er" (i)
234 : "er" (i) \ 234 : "memory");
235 : "memory"); \
236} 235}
237 236
238#define ATOMIC64_FETCH_OP(op, c_op) \ 237static inline long atomic64_fetch_and(long i, atomic64_t *v)
239static inline long atomic64_fetch_##op(long i, atomic64_t *v) \ 238{
240{ \ 239 s64 val = atomic64_read(v);
241 long val = atomic64_read(v); \ 240
242 do { \ 241 do {
243 } while (!atomic64_try_cmpxchg(v, &val, val c_op i)); \ 242 } while (!atomic64_try_cmpxchg(v, &val, val & i));
244 return val; \ 243 return val;
245} 244}
246 245
247#define ATOMIC64_OPS(op, c_op) \ 246static inline void atomic64_or(long i, atomic64_t *v)
248 ATOMIC64_OP(op) \ 247{
249 ATOMIC64_FETCH_OP(op, c_op) 248 asm volatile(LOCK_PREFIX "orq %1,%0"
249 : "+m" (v->counter)
250 : "er" (i)
251 : "memory");
252}
250 253
251ATOMIC64_OPS(and, &) 254static inline long atomic64_fetch_or(long i, atomic64_t *v)
252ATOMIC64_OPS(or, |) 255{
253ATOMIC64_OPS(xor, ^) 256 s64 val = atomic64_read(v);
254 257
255#undef ATOMIC64_OPS 258 do {
256#undef ATOMIC64_FETCH_OP 259 } while (!atomic64_try_cmpxchg(v, &val, val | i));
257#undef ATOMIC64_OP 260 return val;
261}
262
263static inline void atomic64_xor(long i, atomic64_t *v)
264{
265 asm volatile(LOCK_PREFIX "xorq %1,%0"
266 : "+m" (v->counter)
267 : "er" (i)
268 : "memory");
269}
270
271static inline long atomic64_fetch_xor(long i, atomic64_t *v)
272{
273 s64 val = atomic64_read(v);
274
275 do {
276 } while (!atomic64_try_cmpxchg(v, &val, val ^ i));
277 return val;
278}
258 279
259#endif /* _ASM_X86_ATOMIC64_64_H */ 280#endif /* _ASM_X86_ATOMIC64_64_H */
diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h
index d90296d061e8..b5069e802d5c 100644
--- a/arch/x86/include/asm/cmpxchg.h
+++ b/arch/x86/include/asm/cmpxchg.h
@@ -157,7 +157,7 @@ extern void __add_wrong_size(void)
157#define __raw_try_cmpxchg(_ptr, _pold, _new, size, lock) \ 157#define __raw_try_cmpxchg(_ptr, _pold, _new, size, lock) \
158({ \ 158({ \
159 bool success; \ 159 bool success; \
160 __typeof__(_ptr) _old = (_pold); \ 160 __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \
161 __typeof__(*(_ptr)) __old = *_old; \ 161 __typeof__(*(_ptr)) __old = *_old; \
162 __typeof__(*(_ptr)) __new = (_new); \ 162 __typeof__(*(_ptr)) __new = (_new); \
163 switch (size) { \ 163 switch (size) { \
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 48febf07e828..1310e1f1cd65 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -69,6 +69,9 @@ build_mmio_write(__writeb, "b", unsigned char, "q", )
69build_mmio_write(__writew, "w", unsigned short, "r", ) 69build_mmio_write(__writew, "w", unsigned short, "r", )
70build_mmio_write(__writel, "l", unsigned int, "r", ) 70build_mmio_write(__writel, "l", unsigned int, "r", )
71 71
72#define readb readb
73#define readw readw
74#define readl readl
72#define readb_relaxed(a) __readb(a) 75#define readb_relaxed(a) __readb(a)
73#define readw_relaxed(a) __readw(a) 76#define readw_relaxed(a) __readw(a)
74#define readl_relaxed(a) __readl(a) 77#define readl_relaxed(a) __readl(a)
@@ -76,6 +79,9 @@ build_mmio_write(__writel, "l", unsigned int, "r", )
76#define __raw_readw __readw 79#define __raw_readw __readw
77#define __raw_readl __readl 80#define __raw_readl __readl
78 81
82#define writeb writeb
83#define writew writew
84#define writel writel
79#define writeb_relaxed(v, a) __writeb(v, a) 85#define writeb_relaxed(v, a) __writeb(v, a)
80#define writew_relaxed(v, a) __writew(v, a) 86#define writew_relaxed(v, a) __writew(v, a)
81#define writel_relaxed(v, a) __writel(v, a) 87#define writel_relaxed(v, a) __writel(v, a)
@@ -88,13 +94,15 @@ build_mmio_write(__writel, "l", unsigned int, "r", )
88#ifdef CONFIG_X86_64 94#ifdef CONFIG_X86_64
89 95
90build_mmio_read(readq, "q", unsigned long, "=r", :"memory") 96build_mmio_read(readq, "q", unsigned long, "=r", :"memory")
97build_mmio_read(__readq, "q", unsigned long, "=r", )
91build_mmio_write(writeq, "q", unsigned long, "r", :"memory") 98build_mmio_write(writeq, "q", unsigned long, "r", :"memory")
99build_mmio_write(__writeq, "q", unsigned long, "r", )
92 100
93#define readq_relaxed(a) readq(a) 101#define readq_relaxed(a) __readq(a)
94#define writeq_relaxed(v, a) writeq(v, a) 102#define writeq_relaxed(v, a) __writeq(v, a)
95 103
96#define __raw_readq(a) readq(a) 104#define __raw_readq __readq
97#define __raw_writeq(val, addr) writeq(val, addr) 105#define __raw_writeq __writeq
98 106
99/* Let people know that we have them */ 107/* Let people know that we have them */
100#define readq readq 108#define readq readq
@@ -119,6 +127,7 @@ static inline phys_addr_t virt_to_phys(volatile void *address)
119{ 127{
120 return __pa(address); 128 return __pa(address);
121} 129}
130#define virt_to_phys virt_to_phys
122 131
123/** 132/**
124 * phys_to_virt - map physical address to virtual 133 * phys_to_virt - map physical address to virtual
@@ -137,6 +146,7 @@ static inline void *phys_to_virt(phys_addr_t address)
137{ 146{
138 return __va(address); 147 return __va(address);
139} 148}
149#define phys_to_virt phys_to_virt
140 150
141/* 151/*
142 * Change "struct page" to physical address. 152 * Change "struct page" to physical address.
@@ -169,11 +179,14 @@ static inline unsigned int isa_virt_to_bus(volatile void *address)
169 * else, you probably want one of the following. 179 * else, you probably want one of the following.
170 */ 180 */
171extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size); 181extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size);
182#define ioremap_nocache ioremap_nocache
172extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size); 183extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size);
173#define ioremap_uc ioremap_uc 184#define ioremap_uc ioremap_uc
174 185
175extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size); 186extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
187#define ioremap_cache ioremap_cache
176extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, unsigned long prot_val); 188extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, unsigned long prot_val);
189#define ioremap_prot ioremap_prot
177 190
178/** 191/**
179 * ioremap - map bus memory into CPU space 192 * ioremap - map bus memory into CPU space
@@ -193,8 +206,10 @@ static inline void __iomem *ioremap(resource_size_t offset, unsigned long size)
193{ 206{
194 return ioremap_nocache(offset, size); 207 return ioremap_nocache(offset, size);
195} 208}
209#define ioremap ioremap
196 210
197extern void iounmap(volatile void __iomem *addr); 211extern void iounmap(volatile void __iomem *addr);
212#define iounmap iounmap
198 213
199extern void set_iounmap_nonlazy(void); 214extern void set_iounmap_nonlazy(void);
200 215
@@ -203,53 +218,6 @@ extern void set_iounmap_nonlazy(void);
203#include <asm-generic/iomap.h> 218#include <asm-generic/iomap.h>
204 219
205/* 220/*
206 * Convert a virtual cached pointer to an uncached pointer
207 */
208#define xlate_dev_kmem_ptr(p) p
209
210/**
211 * memset_io Set a range of I/O memory to a constant value
212 * @addr: The beginning of the I/O-memory range to set
213 * @val: The value to set the memory to
214 * @count: The number of bytes to set
215 *
216 * Set a range of I/O memory to a given value.
217 */
218static inline void
219memset_io(volatile void __iomem *addr, unsigned char val, size_t count)
220{
221 memset((void __force *)addr, val, count);
222}
223
224/**
225 * memcpy_fromio Copy a block of data from I/O memory
226 * @dst: The (RAM) destination for the copy
227 * @src: The (I/O memory) source for the data
228 * @count: The number of bytes to copy
229 *
230 * Copy a block of data from I/O memory.
231 */
232static inline void
233memcpy_fromio(void *dst, const volatile void __iomem *src, size_t count)
234{
235 memcpy(dst, (const void __force *)src, count);
236}
237
238/**
239 * memcpy_toio Copy a block of data into I/O memory
240 * @dst: The (I/O memory) destination for the copy
241 * @src: The (RAM) source for the data
242 * @count: The number of bytes to copy
243 *
244 * Copy a block of data to I/O memory.
245 */
246static inline void
247memcpy_toio(volatile void __iomem *dst, const void *src, size_t count)
248{
249 memcpy((void __force *)dst, src, count);
250}
251
252/*
253 * ISA space is 'always mapped' on a typical x86 system, no need to 221 * ISA space is 'always mapped' on a typical x86 system, no need to
254 * explicitly ioremap() it. The fact that the ISA IO space is mapped 222 * explicitly ioremap() it. The fact that the ISA IO space is mapped
255 * to PAGE_OFFSET is pure coincidence - it does not mean ISA values 223 * to PAGE_OFFSET is pure coincidence - it does not mean ISA values
@@ -341,13 +309,38 @@ BUILDIO(b, b, char)
341BUILDIO(w, w, short) 309BUILDIO(w, w, short)
342BUILDIO(l, , int) 310BUILDIO(l, , int)
343 311
312#define inb inb
313#define inw inw
314#define inl inl
315#define inb_p inb_p
316#define inw_p inw_p
317#define inl_p inl_p
318#define insb insb
319#define insw insw
320#define insl insl
321
322#define outb outb
323#define outw outw
324#define outl outl
325#define outb_p outb_p
326#define outw_p outw_p
327#define outl_p outl_p
328#define outsb outsb
329#define outsw outsw
330#define outsl outsl
331
344extern void *xlate_dev_mem_ptr(phys_addr_t phys); 332extern void *xlate_dev_mem_ptr(phys_addr_t phys);
345extern void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr); 333extern void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr);
346 334
335#define xlate_dev_mem_ptr xlate_dev_mem_ptr
336#define unxlate_dev_mem_ptr unxlate_dev_mem_ptr
337
347extern int ioremap_change_attr(unsigned long vaddr, unsigned long size, 338extern int ioremap_change_attr(unsigned long vaddr, unsigned long size,
348 enum page_cache_mode pcm); 339 enum page_cache_mode pcm);
349extern void __iomem *ioremap_wc(resource_size_t offset, unsigned long size); 340extern void __iomem *ioremap_wc(resource_size_t offset, unsigned long size);
341#define ioremap_wc ioremap_wc
350extern void __iomem *ioremap_wt(resource_size_t offset, unsigned long size); 342extern void __iomem *ioremap_wt(resource_size_t offset, unsigned long size);
343#define ioremap_wt ioremap_wt
351 344
352extern bool is_early_ioremap_ptep(pte_t *ptep); 345extern bool is_early_ioremap_ptep(pte_t *ptep);
353 346
@@ -365,6 +358,9 @@ extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
365 358
366#define IO_SPACE_LIMIT 0xffff 359#define IO_SPACE_LIMIT 0xffff
367 360
361#include <asm-generic/io.h>
362#undef PCI_IOBASE
363
368#ifdef CONFIG_MTRR 364#ifdef CONFIG_MTRR
369extern int __must_check arch_phys_wc_index(int handle); 365extern int __must_check arch_phys_wc_index(int handle);
370#define arch_phys_wc_index arch_phys_wc_index 366#define arch_phys_wc_index arch_phys_wc_index
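
The block of "#define readb readb" style markers exists because the header now includes <asm-generic/io.h>, and that generic header wraps each fallback accessor in "#ifndef name"; defining the macro to its own name is how an architecture says "already provided, skip the fallback". A compressed illustration of the pattern (the function bodies here are placeholders, not the x86 implementations):

	#include <stdio.h>

	/* --- arch header (sketch) --- */
	static inline unsigned char readb(const volatile void *addr)
	{
		return *(const volatile unsigned char *)addr;
	}
	#define readb readb		/* mark readb as provided by the arch */

	/* --- generic header (sketch of the asm-generic/io.h pattern) --- */
	#ifndef readb
	#define readb readb
	static inline unsigned char readb(const volatile void *addr)
	{
		/* generic fallback would live here; skipped this time */
		return *(const volatile unsigned char *)addr;
	}
	#endif

	int main(void)
	{
		unsigned char buf[1] = { 0x5a };

		printf("%#x\n", readb(buf));
		return 0;
	}
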
diff --git a/arch/x86/include/asm/orc_types.h b/arch/x86/include/asm/orc_types.h
new file mode 100644
index 000000000000..7dc777a6cb40
--- /dev/null
+++ b/arch/x86/include/asm/orc_types.h
@@ -0,0 +1,107 @@
1/*
2 * Copyright (C) 2017 Josh Poimboeuf <jpoimboe@redhat.com>
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version 2
7 * of the License, or (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, see <http://www.gnu.org/licenses/>.
16 */
17
18#ifndef _ORC_TYPES_H
19#define _ORC_TYPES_H
20
21#include <linux/types.h>
22#include <linux/compiler.h>
23
24/*
25 * The ORC_REG_* registers are base registers which are used to find other
26 * registers on the stack.
27 *
28 * ORC_REG_PREV_SP, also known as DWARF Call Frame Address (CFA), is the
29 * address of the previous frame: the caller's SP before it called the current
30 * function.
31 *
32 * ORC_REG_UNDEFINED means the corresponding register's value didn't change in
33 * the current frame.
34 *
35 * The most commonly used base registers are SP and BP -- which the previous SP
36 * is usually based on -- and PREV_SP and UNDEFINED -- which the previous BP is
37 * usually based on.
38 *
39 * The rest of the base registers are needed for special cases like entry code
40 * and GCC realigned stacks.
41 */
42#define ORC_REG_UNDEFINED 0
43#define ORC_REG_PREV_SP 1
44#define ORC_REG_DX 2
45#define ORC_REG_DI 3
46#define ORC_REG_BP 4
47#define ORC_REG_SP 5
48#define ORC_REG_R10 6
49#define ORC_REG_R13 7
50#define ORC_REG_BP_INDIRECT 8
51#define ORC_REG_SP_INDIRECT 9
52#define ORC_REG_MAX 15
53
54/*
55 * ORC_TYPE_CALL: Indicates that sp_reg+sp_offset resolves to PREV_SP (the
56 * caller's SP right before it made the call). Used for all callable
57 * functions, i.e. all C code and all callable asm functions.
58 *
59 * ORC_TYPE_REGS: Used in entry code to indicate that sp_reg+sp_offset points
60 * to a fully populated pt_regs from a syscall, interrupt, or exception.
61 *
62 * ORC_TYPE_REGS_IRET: Used in entry code to indicate that sp_reg+sp_offset
63 * points to the iret return frame.
64 *
65 * The UNWIND_HINT macros are used only for the unwind_hint struct. They
66 * aren't used in struct orc_entry due to size and complexity constraints.
67 * Objtool converts them to real types when it converts the hints to orc
68 * entries.
69 */
70#define ORC_TYPE_CALL 0
71#define ORC_TYPE_REGS 1
72#define ORC_TYPE_REGS_IRET 2
73#define UNWIND_HINT_TYPE_SAVE 3
74#define UNWIND_HINT_TYPE_RESTORE 4
75
76#ifndef __ASSEMBLY__
77/*
78 * This struct is more or less a vastly simplified version of the DWARF Call
79 * Frame Information standard. It contains only the necessary parts of DWARF
80 * CFI, simplified for ease of access by the in-kernel unwinder. It tells the
81 * unwinder how to find the previous SP and BP (and sometimes entry regs) on
82 * the stack for a given code address. Each instance of the struct corresponds
83 * to one or more code locations.
84 */
85struct orc_entry {
86 s16 sp_offset;
87 s16 bp_offset;
88 unsigned sp_reg:4;
89 unsigned bp_reg:4;
90 unsigned type:2;
91};
92
93/*
94 * This struct is used by asm and inline asm code to manually annotate the
95 * location of registers on the stack for the ORC unwinder.
96 *
97 * Type can be either ORC_TYPE_* or UNWIND_HINT_TYPE_*.
98 */
99struct unwind_hint {
100 u32 ip;
101 s16 sp_offset;
102 u8 sp_reg;
103 u8 type;
104};
105#endif /* __ASSEMBLY__ */
106
107#endif /* _ORC_TYPES_H */
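
At unwind time, each orc_entry answers one question for a given instruction address: starting from the current register state, where is the caller's stack pointer (the CFA)? A minimal sketch of that computation, with an invented register-state struct and only the two common base registers handled:

	#include <stdint.h>
	#include <stdio.h>

	#define ORC_REG_BP	4
	#define ORC_REG_SP	5

	struct orc_entry_sketch {
		int16_t	sp_offset;
		int16_t	bp_offset;
		uint8_t	sp_reg;
		uint8_t	bp_reg;
	};

	struct unwind_state_sketch { uintptr_t sp, bp; };

	/* prev_sp = <base register named by sp_reg> + sp_offset */
	static uintptr_t orc_prev_sp(const struct orc_entry_sketch *orc,
				     const struct unwind_state_sketch *st)
	{
		switch (orc->sp_reg) {
		case ORC_REG_SP: return st->sp + orc->sp_offset;
		case ORC_REG_BP: return st->bp + orc->sp_offset;
		default:	 return 0;	/* other bases not handled here */
		}
	}

	int main(void)
	{
		struct orc_entry_sketch orc = { .sp_offset = 16, .sp_reg = ORC_REG_SP };
		struct unwind_state_sketch st = { .sp = 0x7fff0000 };

		printf("prev_sp=%#lx\n", (unsigned long)orc_prev_sp(&orc, &st));
		return 0;
	}
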
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 028245e1c42b..0b03d655db7c 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -22,6 +22,7 @@ struct vm86;
22#include <asm/nops.h> 22#include <asm/nops.h>
23#include <asm/special_insns.h> 23#include <asm/special_insns.h>
24#include <asm/fpu/types.h> 24#include <asm/fpu/types.h>
25#include <asm/unwind_hints.h>
25 26
26#include <linux/personality.h> 27#include <linux/personality.h>
27#include <linux/cache.h> 28#include <linux/cache.h>
@@ -684,6 +685,7 @@ static inline void sync_core(void)
684 unsigned int tmp; 685 unsigned int tmp;
685 686
686 asm volatile ( 687 asm volatile (
688 UNWIND_HINT_SAVE
687 "mov %%ss, %0\n\t" 689 "mov %%ss, %0\n\t"
688 "pushq %q0\n\t" 690 "pushq %q0\n\t"
689 "pushq %%rsp\n\t" 691 "pushq %%rsp\n\t"
@@ -693,6 +695,7 @@ static inline void sync_core(void)
693 "pushq %q0\n\t" 695 "pushq %q0\n\t"
694 "pushq $1f\n\t" 696 "pushq $1f\n\t"
695 "iretq\n\t" 697 "iretq\n\t"
698 UNWIND_HINT_RESTORE
696 "1:" 699 "1:"
697 : "=&r" (tmp), "+r" (__sp) : : "cc", "memory"); 700 : "=&r" (tmp), "+r" (__sp) : : "cc", "memory");
698#endif 701#endif
diff --git a/arch/x86/include/asm/refcount.h b/arch/x86/include/asm/refcount.h
new file mode 100644
index 000000000000..ff871210b9f2
--- /dev/null
+++ b/arch/x86/include/asm/refcount.h
@@ -0,0 +1,109 @@
1#ifndef __ASM_X86_REFCOUNT_H
2#define __ASM_X86_REFCOUNT_H
3/*
4 * x86-specific implementation of refcount_t. Based on PAX_REFCOUNT from
5 * PaX/grsecurity.
6 */
7#include <linux/refcount.h>
8
9/*
10 * This is the first portion of the refcount error handling, which lives in
11 * .text.unlikely, and is jumped to from the CPU flag check (in the
12 * following macros). This saves the refcount value location into CX for
13 * the exception handler to use (in mm/extable.c), and then triggers the
14 * central refcount exception. The fixup address for the exception points
15 * back to the regular execution flow in .text.
16 */
17#define _REFCOUNT_EXCEPTION \
18 ".pushsection .text.unlikely\n" \
19 "111:\tlea %[counter], %%" _ASM_CX "\n" \
20 "112:\t" ASM_UD0 "\n" \
21 ASM_UNREACHABLE \
22 ".popsection\n" \
23 "113:\n" \
24 _ASM_EXTABLE_REFCOUNT(112b, 113b)
25
26/* Trigger refcount exception if refcount result is negative. */
27#define REFCOUNT_CHECK_LT_ZERO \
28 "js 111f\n\t" \
29 _REFCOUNT_EXCEPTION
30
31/* Trigger refcount exception if refcount result is zero or negative. */
32#define REFCOUNT_CHECK_LE_ZERO \
33 "jz 111f\n\t" \
34 REFCOUNT_CHECK_LT_ZERO
35
36/* Trigger refcount exception unconditionally. */
37#define REFCOUNT_ERROR \
38 "jmp 111f\n\t" \
39 _REFCOUNT_EXCEPTION
40
41static __always_inline void refcount_add(unsigned int i, refcount_t *r)
42{
43 asm volatile(LOCK_PREFIX "addl %1,%0\n\t"
44 REFCOUNT_CHECK_LT_ZERO
45 : [counter] "+m" (r->refs.counter)
46 : "ir" (i)
47 : "cc", "cx");
48}
49
50static __always_inline void refcount_inc(refcount_t *r)
51{
52 asm volatile(LOCK_PREFIX "incl %0\n\t"
53 REFCOUNT_CHECK_LT_ZERO
54 : [counter] "+m" (r->refs.counter)
55 : : "cc", "cx");
56}
57
58static __always_inline void refcount_dec(refcount_t *r)
59{
60 asm volatile(LOCK_PREFIX "decl %0\n\t"
61 REFCOUNT_CHECK_LE_ZERO
62 : [counter] "+m" (r->refs.counter)
63 : : "cc", "cx");
64}
65
66static __always_inline __must_check
67bool refcount_sub_and_test(unsigned int i, refcount_t *r)
68{
69 GEN_BINARY_SUFFIXED_RMWcc(LOCK_PREFIX "subl", REFCOUNT_CHECK_LT_ZERO,
70 r->refs.counter, "er", i, "%0", e);
71}
72
73static __always_inline __must_check bool refcount_dec_and_test(refcount_t *r)
74{
75 GEN_UNARY_SUFFIXED_RMWcc(LOCK_PREFIX "decl", REFCOUNT_CHECK_LT_ZERO,
76 r->refs.counter, "%0", e);
77}
78
79static __always_inline __must_check
80bool refcount_add_not_zero(unsigned int i, refcount_t *r)
81{
82 int c, result;
83
84 c = atomic_read(&(r->refs));
85 do {
86 if (unlikely(c == 0))
87 return false;
88
89 result = c + i;
90
91 /* Did we try to increment from/to an undesirable state? */
92 if (unlikely(c < 0 || c == INT_MAX || result < c)) {
93 asm volatile(REFCOUNT_ERROR
94 : : [counter] "m" (r->refs.counter)
95 : "cc", "cx");
96 break;
97 }
98
99 } while (!atomic_try_cmpxchg(&(r->refs), &c, result));
100
101 return c != 0;
102}
103
104static __always_inline __must_check bool refcount_inc_not_zero(refcount_t *r)
105{
106 return refcount_add_not_zero(1, r);
107}
108
109#endif
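
The interesting part of refcount_add_not_zero() above is the "undesirable state" test that guards the increment: a negative count (already saturated or corrupted), a count at INT_MAX, or an addition that wrapped all take the error path. A standalone model of just that predicate for the usual case of a positive increment; the arithmetic is widened here only so the illustration itself has no signed overflow:

	#include <limits.h>
	#include <stdbool.h>
	#include <stdio.h>

	static bool add_would_saturate(int c, int i)
	{
		long long result = (long long)c + i;

		/* mirrors: c < 0 || c == INT_MAX || result < c */
		return c < 0 || c == INT_MAX || result > INT_MAX;
	}

	int main(void)
	{
		printf("%d\n", add_would_saturate(5, 1));		/* 0: fine */
		printf("%d\n", add_would_saturate(INT_MAX, 1));		/* 1: at max */
		printf("%d\n", add_would_saturate(-1, 1));		/* 1: corrupt */
		return 0;
	}
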
diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h
index 661dd305694a..045f99211a99 100644
--- a/arch/x86/include/asm/rmwcc.h
+++ b/arch/x86/include/asm/rmwcc.h
@@ -1,45 +1,56 @@
1#ifndef _ASM_X86_RMWcc 1#ifndef _ASM_X86_RMWcc
2#define _ASM_X86_RMWcc 2#define _ASM_X86_RMWcc
3 3
4#define __CLOBBERS_MEM "memory"
5#define __CLOBBERS_MEM_CC_CX "memory", "cc", "cx"
6
4#if !defined(__GCC_ASM_FLAG_OUTPUTS__) && defined(CC_HAVE_ASM_GOTO) 7#if !defined(__GCC_ASM_FLAG_OUTPUTS__) && defined(CC_HAVE_ASM_GOTO)
5 8
6/* Use asm goto */ 9/* Use asm goto */
7 10
8#define __GEN_RMWcc(fullop, var, cc, ...) \ 11#define __GEN_RMWcc(fullop, var, cc, clobbers, ...) \
9do { \ 12do { \
10 asm_volatile_goto (fullop "; j" #cc " %l[cc_label]" \ 13 asm_volatile_goto (fullop "; j" #cc " %l[cc_label]" \
11 : : "m" (var), ## __VA_ARGS__ \ 14 : : [counter] "m" (var), ## __VA_ARGS__ \
12 : "memory" : cc_label); \ 15 : clobbers : cc_label); \
13 return 0; \ 16 return 0; \
14cc_label: \ 17cc_label: \
15 return 1; \ 18 return 1; \
16} while (0) 19} while (0)
17 20
18#define GEN_UNARY_RMWcc(op, var, arg0, cc) \ 21#define __BINARY_RMWcc_ARG " %1, "
19 __GEN_RMWcc(op " " arg0, var, cc)
20 22
21#define GEN_BINARY_RMWcc(op, var, vcon, val, arg0, cc) \
22 __GEN_RMWcc(op " %1, " arg0, var, cc, vcon (val))
23 23
24#else /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */ 24#else /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */
25 25
26/* Use flags output or a set instruction */ 26/* Use flags output or a set instruction */
27 27
28#define __GEN_RMWcc(fullop, var, cc, ...) \ 28#define __GEN_RMWcc(fullop, var, cc, clobbers, ...) \
29do { \ 29do { \
30 bool c; \ 30 bool c; \
31 asm volatile (fullop ";" CC_SET(cc) \ 31 asm volatile (fullop ";" CC_SET(cc) \
32 : "+m" (var), CC_OUT(cc) (c) \ 32 : [counter] "+m" (var), CC_OUT(cc) (c) \
33 : __VA_ARGS__ : "memory"); \ 33 : __VA_ARGS__ : clobbers); \
34 return c; \ 34 return c; \
35} while (0) 35} while (0)
36 36
37#define __BINARY_RMWcc_ARG " %2, "
38
39#endif /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */
40
37#define GEN_UNARY_RMWcc(op, var, arg0, cc) \ 41#define GEN_UNARY_RMWcc(op, var, arg0, cc) \
38 __GEN_RMWcc(op " " arg0, var, cc) 42 __GEN_RMWcc(op " " arg0, var, cc, __CLOBBERS_MEM)
43
44#define GEN_UNARY_SUFFIXED_RMWcc(op, suffix, var, arg0, cc) \
45 __GEN_RMWcc(op " " arg0 "\n\t" suffix, var, cc, \
46 __CLOBBERS_MEM_CC_CX)
39 47
40#define GEN_BINARY_RMWcc(op, var, vcon, val, arg0, cc) \ 48#define GEN_BINARY_RMWcc(op, var, vcon, val, arg0, cc) \
41 __GEN_RMWcc(op " %2, " arg0, var, cc, vcon (val)) 49 __GEN_RMWcc(op __BINARY_RMWcc_ARG arg0, var, cc, \
50 __CLOBBERS_MEM, vcon (val))
42 51
43#endif /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */ 52#define GEN_BINARY_SUFFIXED_RMWcc(op, suffix, var, vcon, val, arg0, cc) \
53 __GEN_RMWcc(op __BINARY_RMWcc_ARG arg0 "\n\t" suffix, var, cc, \
54 __CLOBBERS_MEM_CC_CX, vcon (val))
44 55
45#endif /* _ASM_X86_RMWcc */ 56#endif /* _ASM_X86_RMWcc */
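
GEN_UNARY_SUFFIXED_RMWcc() and friends ultimately hand the caller a bool derived from the CPU flag set by the locked read-modify-write ("e" selects ZF here), either via asm goto or via the flag-output constraint. Ignoring that plumbing, the observable behaviour of the "decl ... e" users is equivalent to this sketch (a GCC builtin replaces the inline asm):

	#include <stdbool.h>
	#include <stdio.h>

	static int refs = 2;

	static bool dec_and_test(void)
	{
		/*
		 * LOCK DECL sets ZF when the result hits zero; the RMWcc
		 * macros turn that flag into the boolean return value.
		 */
		return __atomic_sub_fetch(&refs, 1, __ATOMIC_SEQ_CST) == 0;
	}

	int main(void)
	{
		bool a = dec_and_test();	/* 2 -> 1: false */
		bool b = dec_and_test();	/* 1 -> 0: true */

		printf("%d %d\n", a, b);
		return 0;
	}
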
diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h
new file mode 100644
index 000000000000..5e02b11c9b86
--- /dev/null
+++ b/arch/x86/include/asm/unwind_hints.h
@@ -0,0 +1,103 @@
1#ifndef _ASM_X86_UNWIND_HINTS_H
2#define _ASM_X86_UNWIND_HINTS_H
3
4#include "orc_types.h"
5
6#ifdef __ASSEMBLY__
7
8/*
9 * In asm, there are two kinds of code: normal C-type callable functions and
10 * the rest. The normal callable functions can be called by other code, and
11 * don't do anything unusual with the stack. Such normal callable functions
12 * are annotated with the ENTRY/ENDPROC macros. Most asm code falls in this
13 * category. In this case, no special debugging annotations are needed because
14 * objtool can automatically generate the ORC data for the ORC unwinder to read
15 * at runtime.
16 *
17 * Anything which doesn't fall into the above category, such as syscall and
18 * interrupt handlers, tends to not be called directly by other functions, and
19 * often does unusual non-C-function-type things with the stack pointer. Such
20 * code needs to be annotated such that objtool can understand it. The
21 * following CFI hint macros are for this type of code.
22 *
23 * These macros provide hints to objtool about the state of the stack at each
24 * instruction. Objtool starts from the hints and follows the code flow,
25 * making automatic CFI adjustments when it sees pushes and pops, filling out
26 * the debuginfo as necessary. It will also warn if it sees any
27 * inconsistencies.
28 */
29.macro UNWIND_HINT sp_reg=ORC_REG_SP sp_offset=0 type=ORC_TYPE_CALL
30#ifdef CONFIG_STACK_VALIDATION
31.Lunwind_hint_ip_\@:
32 .pushsection .discard.unwind_hints
33 /* struct unwind_hint */
34 .long .Lunwind_hint_ip_\@ - .
35 .short \sp_offset
36 .byte \sp_reg
37 .byte \type
38 .popsection
39#endif
40.endm
41
42.macro UNWIND_HINT_EMPTY
43 UNWIND_HINT sp_reg=ORC_REG_UNDEFINED
44.endm
45
46.macro UNWIND_HINT_REGS base=%rsp offset=0 indirect=0 extra=1 iret=0
47 .if \base == %rsp && \indirect
48 .set sp_reg, ORC_REG_SP_INDIRECT
49 .elseif \base == %rsp
50 .set sp_reg, ORC_REG_SP
51 .elseif \base == %rbp
52 .set sp_reg, ORC_REG_BP
53 .elseif \base == %rdi
54 .set sp_reg, ORC_REG_DI
55 .elseif \base == %rdx
56 .set sp_reg, ORC_REG_DX
57 .elseif \base == %r10
58 .set sp_reg, ORC_REG_R10
59 .else
60 .error "UNWIND_HINT_REGS: bad base register"
61 .endif
62
63 .set sp_offset, \offset
64
65 .if \iret
66 .set type, ORC_TYPE_REGS_IRET
67 .elseif \extra == 0
68 .set type, ORC_TYPE_REGS_IRET
69 .set sp_offset, \offset + (16*8)
70 .else
71 .set type, ORC_TYPE_REGS
72 .endif
73
74 UNWIND_HINT sp_reg=sp_reg sp_offset=sp_offset type=type
75.endm
76
77.macro UNWIND_HINT_IRET_REGS base=%rsp offset=0
78 UNWIND_HINT_REGS base=\base offset=\offset iret=1
79.endm
80
81.macro UNWIND_HINT_FUNC sp_offset=8
82 UNWIND_HINT sp_offset=\sp_offset
83.endm
84
85#else /* !__ASSEMBLY__ */
86
87#define UNWIND_HINT(sp_reg, sp_offset, type) \
88 "987: \n\t" \
89 ".pushsection .discard.unwind_hints\n\t" \
90 /* struct unwind_hint */ \
91 ".long 987b - .\n\t" \
92 ".short " __stringify(sp_offset) "\n\t" \
93 ".byte " __stringify(sp_reg) "\n\t" \
94 ".byte " __stringify(type) "\n\t" \
95 ".popsection\n\t"
96
97#define UNWIND_HINT_SAVE UNWIND_HINT(0, 0, UNWIND_HINT_TYPE_SAVE)
98
99#define UNWIND_HINT_RESTORE UNWIND_HINT(0, 0, UNWIND_HINT_TYPE_RESTORE)
100
101#endif /* __ASSEMBLY__ */
102
103#endif /* _ASM_X86_UNWIND_HINTS_H */
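
Each UNWIND_HINT expands to a small record appended to .discard.unwind_hints, which objtool consumes at build time and which is discarded from the final image; the ip field is stored PC-relative (".long 987b - ." / ".long .Lunwind_hint_ip_\@ - ."). A sketch of decoding such a record, using an invented struct that mirrors struct unwind_hint:

	#include <stdint.h>
	#include <stdio.h>

	struct unwind_hint_sketch {
		uint32_t ip;		/* PC-relative: target - record address */
		int16_t	 sp_offset;
		uint8_t	 sp_reg;
		uint8_t	 type;
	};

	/* recover the annotated instruction address from the record */
	static uint64_t hint_target(uint64_t record_addr,
				    const struct unwind_hint_sketch *h)
	{
		return record_addr + (int32_t)h->ip;
	}

	int main(void)
	{
		/* a hint 0x40 bytes before its record, e.g. earlier in .text */
		struct unwind_hint_sketch h = { .ip = (uint32_t)-0x40, .sp_offset = 8 };

		printf("target=%#llx sp_offset=%d\n",
		       (unsigned long long)hint_target(0x1040, &h), h.sp_offset);
		return 0;
	}
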
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index dbce3cca94cb..bd265a4cf108 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -94,6 +94,9 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
94 if (stack_name) 94 if (stack_name)
95 printk("%s <%s>\n", log_lvl, stack_name); 95 printk("%s <%s>\n", log_lvl, stack_name);
96 96
97 if (regs && on_stack(&stack_info, regs, sizeof(*regs)))
98 __show_regs(regs, 0);
99
97 /* 100 /*
98 * Scan the stack, printing any text addresses we find. At the 101 * Scan the stack, printing any text addresses we find. At the
99 * same time, follow proper stack frames with the unwinder. 102 * same time, follow proper stack frames with the unwinder.
@@ -118,10 +121,8 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
118 * Don't print regs->ip again if it was already printed 121 * Don't print regs->ip again if it was already printed
119 * by __show_regs() below. 122 * by __show_regs() below.
120 */ 123 */
121 if (regs && stack == &regs->ip) { 124 if (regs && stack == &regs->ip)
122 unwind_next_frame(&state); 125 goto next;
123 continue;
124 }
125 126
126 if (stack == ret_addr_p) 127 if (stack == ret_addr_p)
127 reliable = 1; 128 reliable = 1;
@@ -144,6 +145,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
144 if (!reliable) 145 if (!reliable)
145 continue; 146 continue;
146 147
148next:
147 /* 149 /*
148 * Get the next frame from the unwinder. No need to 150 * Get the next frame from the unwinder. No need to
149 * check for an error: if anything goes wrong, the rest 151 * check for an error: if anything goes wrong, the rest
@@ -153,7 +155,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
153 155
154 /* if the frame has entry regs, print them */ 156 /* if the frame has entry regs, print them */
155 regs = unwind_get_entry_regs(&state); 157 regs = unwind_get_entry_regs(&state);
156 if (regs) 158 if (regs && on_stack(&stack_info, regs, sizeof(*regs)))
157 __show_regs(regs, 0); 159 __show_regs(regs, 0);
158 } 160 }
159 161
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index e5f0b40e66d2..4f0481474903 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -37,7 +37,7 @@ static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info)
37 * This is a software stack, so 'end' can be a valid stack pointer. 37 * This is a software stack, so 'end' can be a valid stack pointer.
38 * It just means the stack is empty. 38 * It just means the stack is empty.
39 */ 39 */
40 if (stack < begin || stack > end) 40 if (stack <= begin || stack > end)
41 return false; 41 return false;
42 42
43 info->type = STACK_TYPE_IRQ; 43 info->type = STACK_TYPE_IRQ;
@@ -62,7 +62,7 @@ static bool in_softirq_stack(unsigned long *stack, struct stack_info *info)
62 * This is a software stack, so 'end' can be a valid stack pointer. 62 * This is a software stack, so 'end' can be a valid stack pointer.
63 * It just means the stack is empty. 63 * It just means the stack is empty.
64 */ 64 */
65 if (stack < begin || stack > end) 65 if (stack <= begin || stack > end)
66 return false; 66 return false;
67 67
68 info->type = STACK_TYPE_SOFTIRQ; 68 info->type = STACK_TYPE_SOFTIRQ;
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 3e1471d57487..225af4184f06 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -55,7 +55,7 @@ static bool in_exception_stack(unsigned long *stack, struct stack_info *info)
55 begin = end - (exception_stack_sizes[k] / sizeof(long)); 55 begin = end - (exception_stack_sizes[k] / sizeof(long));
56 regs = (struct pt_regs *)end - 1; 56 regs = (struct pt_regs *)end - 1;
57 57
58 if (stack < begin || stack >= end) 58 if (stack <= begin || stack >= end)
59 continue; 59 continue;
60 60
61 info->type = STACK_TYPE_EXCEPTION + k; 61 info->type = STACK_TYPE_EXCEPTION + k;
@@ -78,7 +78,7 @@ static bool in_irq_stack(unsigned long *stack, struct stack_info *info)
78 * This is a software stack, so 'end' can be a valid stack pointer. 78 * This is a software stack, so 'end' can be a valid stack pointer.
79 * It just means the stack is empty. 79 * It just means the stack is empty.
80 */ 80 */
81 if (stack < begin || stack > end) 81 if (stack <= begin || stack > end)
82 return false; 82 return false;
83 83
84 info->type = STACK_TYPE_IRQ; 84 info->type = STACK_TYPE_IRQ;
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index c3169be4c596..2987e3991c2b 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -279,6 +279,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
279 struct tss_struct *tss = &per_cpu(cpu_tss, cpu); 279 struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
280 unsigned prev_fsindex, prev_gsindex; 280 unsigned prev_fsindex, prev_gsindex;
281 281
282 WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
283 this_cpu_read(irq_count) != -1);
284
282 switch_fpu_prepare(prev_fpu, cpu); 285 switch_fpu_prepare(prev_fpu, cpu);
283 286
284 /* We must save %fs and %gs before load_TLS() because 287 /* We must save %fs and %gs before load_TLS() because
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 0ea8afcb929c..761fc88cd820 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -36,6 +36,48 @@ bool ex_handler_fault(const struct exception_table_entry *fixup,
36} 36}
37EXPORT_SYMBOL_GPL(ex_handler_fault); 37EXPORT_SYMBOL_GPL(ex_handler_fault);
38 38
39/*
40 * Handler for UD0 exception following a failed test against the
41 * result of a refcount inc/dec/add/sub.
42 */
43bool ex_handler_refcount(const struct exception_table_entry *fixup,
44 struct pt_regs *regs, int trapnr)
45{
46 /* First unconditionally saturate the refcount. */
47 *(int *)regs->cx = INT_MIN / 2;
48
49 /*
50 * Strictly speaking, this reports the fixup destination, not
51 * the fault location, and not the actually overflowing
52 * instruction, which is the instruction before the "js", but
53 * since that instruction could be a variety of lengths, just
54 * report the location after the overflow, which should be close
55 * enough for finding the overflow, as it's at least back in
56 * the function, having returned from .text.unlikely.
57 */
58 regs->ip = ex_fixup_addr(fixup);
59
60 /*
61 * This function has been called because either a negative refcount
62 * value was seen by any of the refcount functions, or a zero
63 * refcount value was seen by refcount_dec().
64 *
65 * If we crossed from INT_MAX to INT_MIN, OF (Overflow Flag: result
66 * wrapped around) will be set. Additionally, seeing the refcount
67 * reach 0 will set ZF (Zero Flag: result was zero). In each of
68 * these cases we want a report, since it's a boundary condition.
69 *
70 */
71 if (regs->flags & (X86_EFLAGS_OF | X86_EFLAGS_ZF)) {
72 bool zero = regs->flags & X86_EFLAGS_ZF;
73
74 refcount_error_report(regs, zero ? "hit zero" : "overflow");
75 }
76
77 return true;
78}
79EXPORT_SYMBOL_GPL(ex_handler_refcount);
80
39bool ex_handler_ext(const struct exception_table_entry *fixup, 81bool ex_handler_ext(const struct exception_table_entry *fixup,
40 struct pt_regs *regs, int trapnr) 82 struct pt_regs *regs, int trapnr)
41{ 83{
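
The policy implemented by ex_handler_refcount() can be modelled in plain C: unconditionally saturate the counter to a value far from zero, then report only when the trapping operation crossed a boundary (signed overflow, or a decrement reaching zero). The standalone model below is an illustration of that logic, not the kernel code; 'overflow' and 'zero' stand in for the OF and ZF bits sampled from regs->flags:

    #include <limits.h>
    #include <stdbool.h>
    #include <stdio.h>

    #define REFCOUNT_SATURATED   (INT_MIN / 2)

    static void refcount_exception(int *counter, bool overflow, bool zero)
    {
            /*
             * Saturate first, so the counter can neither reach zero
             * (use-after-free) nor wrap back into plausible values.
             */
            *counter = REFCOUNT_SATURATED;

            if (overflow || zero)
                    fprintf(stderr, "refcount error: %s\n",
                            zero ? "hit zero" : "overflow");
    }
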
diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c
index 72bbfccef113..fd4b7f684bd0 100644
--- a/drivers/clocksource/arm_arch_timer.c
+++ b/drivers/clocksource/arm_arch_timer.c
@@ -455,7 +455,11 @@ void arch_timer_enable_workaround(const struct arch_timer_erratum_workaround *wa
455 per_cpu(timer_unstable_counter_workaround, i) = wa; 455 per_cpu(timer_unstable_counter_workaround, i) = wa;
456 } 456 }
457 457
458 static_branch_enable(&arch_timer_read_ool_enabled); 458 /*
459 * Use the locked version, as we're called from the CPU
 460 * hotplug framework. Otherwise, we end up in deadlock-land.
 460 * hotplug framework. Otherwise, we end up in deadlock-land.
461 */
462 static_branch_enable_cpuslocked(&arch_timer_read_ool_enabled);
459 463
460 /* 464 /*
461 * Don't use the vdso fastpath if errata require using the 465 * Don't use the vdso fastpath if errata require using the
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index d1bd53b73738..e3a81ed66bc2 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -28,6 +28,7 @@
28 28
29#include <linux/debugfs.h> 29#include <linux/debugfs.h>
30#include <linux/sort.h> 30#include <linux/sort.h>
31#include <linux/sched/mm.h>
31#include "intel_drv.h" 32#include "intel_drv.h"
32 33
33static inline struct drm_i915_private *node_to_i915(struct drm_info_node *node) 34static inline struct drm_i915_private *node_to_i915(struct drm_info_node *node)
@@ -4331,7 +4332,7 @@ i915_drop_caches_set(void *data, u64 val)
4331 mutex_unlock(&dev->struct_mutex); 4332 mutex_unlock(&dev->struct_mutex);
4332 } 4333 }
4333 4334
4334 lockdep_set_current_reclaim_state(GFP_KERNEL); 4335 fs_reclaim_acquire(GFP_KERNEL);
4335 if (val & DROP_BOUND) 4336 if (val & DROP_BOUND)
4336 i915_gem_shrink(dev_priv, LONG_MAX, I915_SHRINK_BOUND); 4337 i915_gem_shrink(dev_priv, LONG_MAX, I915_SHRINK_BOUND);
4337 4338
@@ -4340,7 +4341,7 @@ i915_drop_caches_set(void *data, u64 val)
4340 4341
4341 if (val & DROP_SHRINK_ALL) 4342 if (val & DROP_SHRINK_ALL)
4342 i915_gem_shrink_all(dev_priv); 4343 i915_gem_shrink_all(dev_priv);
4343 lockdep_clear_current_reclaim_state(); 4344 fs_reclaim_release(GFP_KERNEL);
4344 4345
4345 if (val & DROP_FREED) { 4346 if (val & DROP_FREED) {
4346 synchronize_rcu(); 4347 synchronize_rcu();
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index 3d424a51cabb..f0fd3adb1693 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -446,14 +446,14 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
446 446
447 ovl_path_upper(dentry, &upperpath); 447 ovl_path_upper(dentry, &upperpath);
448 realfile = ovl_path_open(&upperpath, O_RDONLY); 448 realfile = ovl_path_open(&upperpath, O_RDONLY);
449 smp_mb__before_spinlock(); 449
450 inode_lock(inode); 450 inode_lock(inode);
451 if (!od->upperfile) { 451 if (!od->upperfile) {
452 if (IS_ERR(realfile)) { 452 if (IS_ERR(realfile)) {
453 inode_unlock(inode); 453 inode_unlock(inode);
454 return PTR_ERR(realfile); 454 return PTR_ERR(realfile);
455 } 455 }
456 od->upperfile = realfile; 456 smp_store_release(&od->upperfile, realfile);
457 } else { 457 } else {
458 /* somebody has beaten us to it */ 458 /* somebody has beaten us to it */
459 if (!IS_ERR(realfile)) 459 if (!IS_ERR(realfile))
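
The smp_store_release() replaces the old smp_mb__before_spinlock() trick: it publishes the fully constructed file pointer so that a lockless reader which observes it non-NULL also observes its initialization (the matching lockless read sits earlier in ovl_dir_fsync(), outside this hunk). A generic kernel-style sketch of the pattern with hypothetical names:

    #include <linux/fs.h>
    #include <linux/mutex.h>

    struct thing {
            struct mutex  lock;
            struct file  *upperfile;
    };

    static struct file *thing_get_upper(struct thing *t, struct file *newfile)
    {
            /* Lockless fast path: the acquire pairs with the release below. */
            struct file *f = smp_load_acquire(&t->upperfile);

            if (f)
                    return f;

            mutex_lock(&t->lock);
            if (!t->upperfile) {
                    /* Publish only after 'newfile' is fully set up. */
                    smp_store_release(&t->upperfile, newfile);
                    f = newfile;
            } else {
                    f = t->upperfile;       /* somebody beat us to it */
            }
            mutex_unlock(&t->lock);
            return f;
    }
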
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index b0d5897bc4e6..886085b47c75 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -109,27 +109,24 @@ static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
109 goto out; 109 goto out;
110 WRITE_ONCE(uwq->waken, true); 110 WRITE_ONCE(uwq->waken, true);
111 /* 111 /*
112 * The implicit smp_mb__before_spinlock in try_to_wake_up() 112 * The Program-Order guarantees provided by the scheduler
113 * renders uwq->waken visible to other CPUs before the task is 113 * ensure uwq->waken is visible before the task is woken.
114 * waken.
115 */ 114 */
116 ret = wake_up_state(wq->private, mode); 115 ret = wake_up_state(wq->private, mode);
117 if (ret) 116 if (ret) {
118 /* 117 /*
119 * Wake only once, autoremove behavior. 118 * Wake only once, autoremove behavior.
120 * 119 *
121 * After the effect of list_del_init is visible to the 120 * After the effect of list_del_init is visible to the other
122 * other CPUs, the waitqueue may disappear from under 121 * CPUs, the waitqueue may disappear from under us, see the
123 * us, see the !list_empty_careful() in 122 * !list_empty_careful() in handle_userfault().
124 * handle_userfault(). try_to_wake_up() has an 123 *
125 * implicit smp_mb__before_spinlock, and the 124 * try_to_wake_up() has an implicit smp_mb(), and the
126 * wq->private is read before calling the extern 125 * wq->private is read before calling the extern function
 127 * function "wake_up_state" (which in turn calls 126 * "wake_up_state" (which in turn calls try_to_wake_up).
128 * try_to_wake_up). While the spin_lock;spin_unlock;
129 * wouldn't be enough, the smp_mb__before_spinlock is
130 * enough to avoid an explicit smp_mb() here.
131 */ 127 */
132 list_del_init(&wq->entry); 128 list_del_init(&wq->entry);
129 }
133out: 130out:
134 return ret; 131 return ret;
135} 132}
diff --git a/include/asm-generic/atomic64.h b/include/asm-generic/atomic64.h
index dad68bf46c77..8d28eb010d0d 100644
--- a/include/asm-generic/atomic64.h
+++ b/include/asm-generic/atomic64.h
@@ -21,6 +21,8 @@ typedef struct {
21extern long long atomic64_read(const atomic64_t *v); 21extern long long atomic64_read(const atomic64_t *v);
22extern void atomic64_set(atomic64_t *v, long long i); 22extern void atomic64_set(atomic64_t *v, long long i);
23 23
24#define atomic64_set_release(v, i) atomic64_set((v), (i))
25
24#define ATOMIC64_OP(op) \ 26#define ATOMIC64_OP(op) \
25extern void atomic64_##op(long long a, atomic64_t *v); 27extern void atomic64_##op(long long a, atomic64_t *v);
26 28
diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h
index 7ef015eb3403..b4531e3b2120 100644
--- a/include/asm-generic/io.h
+++ b/include/asm-generic/io.h
@@ -915,6 +915,9 @@ extern void ioport_unmap(void __iomem *p);
915#endif /* CONFIG_GENERIC_IOMAP */ 915#endif /* CONFIG_GENERIC_IOMAP */
916#endif /* CONFIG_HAS_IOPORT_MAP */ 916#endif /* CONFIG_HAS_IOPORT_MAP */
917 917
918/*
919 * Convert a virtual cached pointer to an uncached pointer
920 */
918#ifndef xlate_dev_kmem_ptr 921#ifndef xlate_dev_kmem_ptr
919#define xlate_dev_kmem_ptr xlate_dev_kmem_ptr 922#define xlate_dev_kmem_ptr xlate_dev_kmem_ptr
920static inline void *xlate_dev_kmem_ptr(void *addr) 923static inline void *xlate_dev_kmem_ptr(void *addr)
@@ -954,6 +957,14 @@ static inline void *bus_to_virt(unsigned long address)
954 957
955#ifndef memset_io 958#ifndef memset_io
956#define memset_io memset_io 959#define memset_io memset_io
960/**
961 * memset_io Set a range of I/O memory to a constant value
962 * @addr: The beginning of the I/O-memory range to set
963 * @val: The value to set the memory to
964 * @count: The number of bytes to set
965 *
966 * Set a range of I/O memory to a given value.
967 */
957static inline void memset_io(volatile void __iomem *addr, int value, 968static inline void memset_io(volatile void __iomem *addr, int value,
958 size_t size) 969 size_t size)
959{ 970{
@@ -963,6 +974,14 @@ static inline void memset_io(volatile void __iomem *addr, int value,
963 974
964#ifndef memcpy_fromio 975#ifndef memcpy_fromio
965#define memcpy_fromio memcpy_fromio 976#define memcpy_fromio memcpy_fromio
977/**
978 * memcpy_fromio Copy a block of data from I/O memory
979 * @dst: The (RAM) destination for the copy
980 * @src: The (I/O memory) source for the data
981 * @count: The number of bytes to copy
982 *
983 * Copy a block of data from I/O memory.
984 */
966static inline void memcpy_fromio(void *buffer, 985static inline void memcpy_fromio(void *buffer,
967 const volatile void __iomem *addr, 986 const volatile void __iomem *addr,
968 size_t size) 987 size_t size)
@@ -973,6 +992,14 @@ static inline void memcpy_fromio(void *buffer,
973 992
974#ifndef memcpy_toio 993#ifndef memcpy_toio
975#define memcpy_toio memcpy_toio 994#define memcpy_toio memcpy_toio
995/**
996 * memcpy_toio Copy a block of data into I/O memory
997 * @dst: The (I/O memory) destination for the copy
998 * @src: The (RAM) source for the data
999 * @count: The number of bytes to copy
1000 *
1001 * Copy a block of data to I/O memory.
1002 */
976static inline void memcpy_toio(volatile void __iomem *addr, const void *buffer, 1003static inline void memcpy_toio(volatile void __iomem *addr, const void *buffer,
977 size_t size) 1004 size_t size)
978{ 1005{
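
A small, hypothetical usage example for the helpers documented above (the driver context, window and length are made up):

    #include <linux/io.h>

    /* Copy a firmware image into a device window previously ioremap()ed. */
    static void load_fw_window(void __iomem *win, const void *fw, size_t len)
    {
            memset_io(win, 0, len);         /* clear the whole window     */
            memcpy_toio(win, fw, len);      /* RAM -> I/O memory          */
    }

    /* And read a status block back out of I/O memory. */
    static void read_status(void *buf, const void __iomem *win, size_t len)
    {
            memcpy_fromio(buf, win, len);   /* I/O memory -> RAM          */
    }
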
diff --git a/include/linux/atomic.h b/include/linux/atomic.h
index c56be7410130..40d6bfec0e0d 100644
--- a/include/linux/atomic.h
+++ b/include/linux/atomic.h
@@ -38,6 +38,9 @@
38 * Besides, if an arch has a special barrier for acquire/release, it could 38 * Besides, if an arch has a special barrier for acquire/release, it could
39 * implement its own __atomic_op_* and use the same framework for building 39 * implement its own __atomic_op_* and use the same framework for building
40 * variants 40 * variants
41 *
42 * If an architecture overrides __atomic_op_acquire() it will probably want
43 * to define smp_mb__after_spinlock().
41 */ 44 */
42#ifndef __atomic_op_acquire 45#ifndef __atomic_op_acquire
43#define __atomic_op_acquire(op, args...) \ 46#define __atomic_op_acquire(op, args...) \
diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index bdb80c4aef6e..10825052b03f 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -203,11 +203,16 @@
203 203
204#ifdef CONFIG_STACK_VALIDATION 204#ifdef CONFIG_STACK_VALIDATION
205#define annotate_unreachable() ({ \ 205#define annotate_unreachable() ({ \
206 asm("%c0:\t\n" \ 206 asm("%c0:\n\t" \
207 ".pushsection .discard.unreachable\t\n" \ 207 ".pushsection .discard.unreachable\n\t" \
208 ".long %c0b - .\t\n" \ 208 ".long %c0b - .\n\t" \
209 ".popsection\t\n" : : "i" (__LINE__)); \ 209 ".popsection\n\t" : : "i" (__LINE__)); \
210}) 210})
211#define ASM_UNREACHABLE \
212 "999:\n\t" \
213 ".pushsection .discard.unreachable\n\t" \
214 ".long 999b - .\n\t" \
215 ".popsection\n\t"
211#else 216#else
212#define annotate_unreachable() 217#define annotate_unreachable()
213#endif 218#endif
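
ASM_UNREACHABLE is a string meant to be concatenated into inline-asm templates whose last instruction never falls through; it records that address in .discard.unreachable so objtool does not warn about the (non-existent) control flow after it. A hedged sketch, where the trapping function is hypothetical and only the macro is the one added above:

    #include <linux/compiler.h>

    static inline void __noreturn my_fatal_trap(void)
    {
            /* ud2 never returns; tell objtool so via the discard section. */
            asm volatile("ud2\n\t" ASM_UNREACHABLE ::: "memory");
            unreachable();
    }
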
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index eca8ad75e28b..e25746d88697 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -185,6 +185,9 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val,
185#endif 185#endif
186 186
187/* Unreachable code */ 187/* Unreachable code */
188#ifndef ASM_UNREACHABLE
189# define ASM_UNREACHABLE
190#endif
188#ifndef unreachable 191#ifndef unreachable
189# define unreachable() do { } while (1) 192# define unreachable() do { } while (1)
190#endif 193#endif
diff --git a/include/linux/completion.h b/include/linux/completion.h
index 5d5aaae3af43..791f053f28b7 100644
--- a/include/linux/completion.h
+++ b/include/linux/completion.h
@@ -9,6 +9,9 @@
9 */ 9 */
10 10
11#include <linux/wait.h> 11#include <linux/wait.h>
12#ifdef CONFIG_LOCKDEP_COMPLETIONS
13#include <linux/lockdep.h>
14#endif
12 15
13/* 16/*
14 * struct completion - structure used to maintain state for a "completion" 17 * struct completion - structure used to maintain state for a "completion"
@@ -25,10 +28,50 @@
25struct completion { 28struct completion {
26 unsigned int done; 29 unsigned int done;
27 wait_queue_head_t wait; 30 wait_queue_head_t wait;
31#ifdef CONFIG_LOCKDEP_COMPLETIONS
32 struct lockdep_map_cross map;
33#endif
28}; 34};
29 35
36#ifdef CONFIG_LOCKDEP_COMPLETIONS
37static inline void complete_acquire(struct completion *x)
38{
39 lock_acquire_exclusive((struct lockdep_map *)&x->map, 0, 0, NULL, _RET_IP_);
40}
41
42static inline void complete_release(struct completion *x)
43{
44 lock_release((struct lockdep_map *)&x->map, 0, _RET_IP_);
45}
46
47static inline void complete_release_commit(struct completion *x)
48{
49 lock_commit_crosslock((struct lockdep_map *)&x->map);
50}
51
52#define init_completion(x) \
53do { \
54 static struct lock_class_key __key; \
55 lockdep_init_map_crosslock((struct lockdep_map *)&(x)->map, \
56 "(complete)" #x, \
57 &__key, 0); \
58 __init_completion(x); \
59} while (0)
60#else
61#define init_completion(x) __init_completion(x)
62static inline void complete_acquire(struct completion *x) {}
63static inline void complete_release(struct completion *x) {}
64static inline void complete_release_commit(struct completion *x) {}
65#endif
66
67#ifdef CONFIG_LOCKDEP_COMPLETIONS
68#define COMPLETION_INITIALIZER(work) \
69 { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait), \
70 STATIC_CROSS_LOCKDEP_MAP_INIT("(complete)" #work, &(work)) }
71#else
30#define COMPLETION_INITIALIZER(work) \ 72#define COMPLETION_INITIALIZER(work) \
31 { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait) } 73 { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait) }
74#endif
32 75
33#define COMPLETION_INITIALIZER_ONSTACK(work) \ 76#define COMPLETION_INITIALIZER_ONSTACK(work) \
34 ({ init_completion(&work); work; }) 77 ({ init_completion(&work); work; })
@@ -70,7 +113,7 @@ struct completion {
70 * This inline function will initialize a dynamically created completion 113 * This inline function will initialize a dynamically created completion
71 * structure. 114 * structure.
72 */ 115 */
73static inline void init_completion(struct completion *x) 116static inline void __init_completion(struct completion *x)
74{ 117{
75 x->done = 0; 118 x->done = 0;
76 init_waitqueue_head(&x->wait); 119 init_waitqueue_head(&x->wait);
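
Completion usage itself is unchanged by the lockdep plumbing above; the crossrelease map is set up transparently by init_completion()/COMPLETION_INITIALIZER and exercised from the wait and complete paths elsewhere in this series. A hypothetical sketch:

    #include <linux/completion.h>

    static DECLARE_COMPLETION(probe_done);  /* expands to COMPLETION_INITIALIZER */

    static void consumer(void)
    {
            /*
             * With CONFIG_LOCKDEP_COMPLETIONS, the wait path in this series
             * brackets the wait with complete_acquire()/complete_release().
             */
            wait_for_completion(&probe_done);
    }

    static void producer(void)
    {
            /* complete() is where the crosslock dependency gets committed. */
            complete(&probe_done);
    }
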
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 898cfe2eeb42..e74655d941b7 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -37,12 +37,6 @@ static inline bool cpusets_enabled(void)
37 return static_branch_unlikely(&cpusets_enabled_key); 37 return static_branch_unlikely(&cpusets_enabled_key);
38} 38}
39 39
40static inline int nr_cpusets(void)
41{
42 /* jump label reference count + the top-level cpuset */
43 return static_key_count(&cpusets_enabled_key.key) + 1;
44}
45
46static inline void cpuset_inc(void) 40static inline void cpuset_inc(void)
47{ 41{
48 static_branch_inc(&cpusets_pre_enable_key); 42 static_branch_inc(&cpusets_pre_enable_key);
diff --git a/include/linux/futex.h b/include/linux/futex.h
index 7c5b694864cd..f36bfd26f998 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -54,7 +54,6 @@ union futex_key {
54 54
55#ifdef CONFIG_FUTEX 55#ifdef CONFIG_FUTEX
56extern void exit_robust_list(struct task_struct *curr); 56extern void exit_robust_list(struct task_struct *curr);
57extern void exit_pi_state_list(struct task_struct *curr);
58#ifdef CONFIG_HAVE_FUTEX_CMPXCHG 57#ifdef CONFIG_HAVE_FUTEX_CMPXCHG
59#define futex_cmpxchg_enabled 1 58#define futex_cmpxchg_enabled 1
60#else 59#else
@@ -64,8 +63,14 @@ extern int futex_cmpxchg_enabled;
64static inline void exit_robust_list(struct task_struct *curr) 63static inline void exit_robust_list(struct task_struct *curr)
65{ 64{
66} 65}
66#endif
67
68#ifdef CONFIG_FUTEX_PI
69extern void exit_pi_state_list(struct task_struct *curr);
70#else
67static inline void exit_pi_state_list(struct task_struct *curr) 71static inline void exit_pi_state_list(struct task_struct *curr)
68{ 72{
69} 73}
70#endif 74#endif
75
71#endif 76#endif
diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h
index 5dd1272d1ab2..5fdd93bb9300 100644
--- a/include/linux/irqflags.h
+++ b/include/linux/irqflags.h
@@ -23,10 +23,26 @@
23# define trace_softirq_context(p) ((p)->softirq_context) 23# define trace_softirq_context(p) ((p)->softirq_context)
24# define trace_hardirqs_enabled(p) ((p)->hardirqs_enabled) 24# define trace_hardirqs_enabled(p) ((p)->hardirqs_enabled)
25# define trace_softirqs_enabled(p) ((p)->softirqs_enabled) 25# define trace_softirqs_enabled(p) ((p)->softirqs_enabled)
26# define trace_hardirq_enter() do { current->hardirq_context++; } while (0) 26# define trace_hardirq_enter() \
27# define trace_hardirq_exit() do { current->hardirq_context--; } while (0) 27do { \
28# define lockdep_softirq_enter() do { current->softirq_context++; } while (0) 28 current->hardirq_context++; \
29# define lockdep_softirq_exit() do { current->softirq_context--; } while (0) 29 crossrelease_hist_start(XHLOCK_HARD); \
30} while (0)
31# define trace_hardirq_exit() \
32do { \
33 current->hardirq_context--; \
34 crossrelease_hist_end(XHLOCK_HARD); \
35} while (0)
36# define lockdep_softirq_enter() \
37do { \
38 current->softirq_context++; \
39 crossrelease_hist_start(XHLOCK_SOFT); \
40} while (0)
41# define lockdep_softirq_exit() \
42do { \
43 current->softirq_context--; \
44 crossrelease_hist_end(XHLOCK_SOFT); \
45} while (0)
30# define INIT_TRACE_IRQFLAGS .softirqs_enabled = 1, 46# define INIT_TRACE_IRQFLAGS .softirqs_enabled = 1,
31#else 47#else
32# define trace_hardirqs_on() do { } while (0) 48# define trace_hardirqs_on() do { } while (0)
diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index 2afd74b9d844..cd5861651b17 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -163,6 +163,8 @@ extern void jump_label_apply_nops(struct module *mod);
163extern int static_key_count(struct static_key *key); 163extern int static_key_count(struct static_key *key);
164extern void static_key_enable(struct static_key *key); 164extern void static_key_enable(struct static_key *key);
165extern void static_key_disable(struct static_key *key); 165extern void static_key_disable(struct static_key *key);
166extern void static_key_enable_cpuslocked(struct static_key *key);
167extern void static_key_disable_cpuslocked(struct static_key *key);
166 168
167/* 169/*
168 * We should be using ATOMIC_INIT() for initializing .enabled, but 170 * We should be using ATOMIC_INIT() for initializing .enabled, but
@@ -234,24 +236,29 @@ static inline int jump_label_apply_nops(struct module *mod)
234 236
235static inline void static_key_enable(struct static_key *key) 237static inline void static_key_enable(struct static_key *key)
236{ 238{
237 int count = static_key_count(key); 239 STATIC_KEY_CHECK_USE();
238
239 WARN_ON_ONCE(count < 0 || count > 1);
240 240
241 if (!count) 241 if (atomic_read(&key->enabled) != 0) {
242 static_key_slow_inc(key); 242 WARN_ON_ONCE(atomic_read(&key->enabled) != 1);
243 return;
244 }
245 atomic_set(&key->enabled, 1);
243} 246}
244 247
245static inline void static_key_disable(struct static_key *key) 248static inline void static_key_disable(struct static_key *key)
246{ 249{
247 int count = static_key_count(key); 250 STATIC_KEY_CHECK_USE();
248
249 WARN_ON_ONCE(count < 0 || count > 1);
250 251
251 if (count) 252 if (atomic_read(&key->enabled) != 1) {
252 static_key_slow_dec(key); 253 WARN_ON_ONCE(atomic_read(&key->enabled) != 0);
254 return;
255 }
256 atomic_set(&key->enabled, 0);
253} 257}
254 258
259#define static_key_enable_cpuslocked(k) static_key_enable((k))
260#define static_key_disable_cpuslocked(k) static_key_disable((k))
261
255#define STATIC_KEY_INIT_TRUE { .enabled = ATOMIC_INIT(1) } 262#define STATIC_KEY_INIT_TRUE { .enabled = ATOMIC_INIT(1) }
256#define STATIC_KEY_INIT_FALSE { .enabled = ATOMIC_INIT(0) } 263#define STATIC_KEY_INIT_FALSE { .enabled = ATOMIC_INIT(0) }
257 264
@@ -413,8 +420,10 @@ extern bool ____wrong_branch_error(void);
413 * Normal usage; boolean enable/disable. 420 * Normal usage; boolean enable/disable.
414 */ 421 */
415 422
416#define static_branch_enable(x) static_key_enable(&(x)->key) 423#define static_branch_enable(x) static_key_enable(&(x)->key)
417#define static_branch_disable(x) static_key_disable(&(x)->key) 424#define static_branch_disable(x) static_key_disable(&(x)->key)
425#define static_branch_enable_cpuslocked(x) static_key_enable_cpuslocked(&(x)->key)
426#define static_branch_disable_cpuslocked(x) static_key_disable_cpuslocked(&(x)->key)
418 427
419#endif /* __ASSEMBLY__ */ 428#endif /* __ASSEMBLY__ */
420 429
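
The rewritten fallbacks above make the contract explicit: static_key_enable()/disable() are idempotent boolean operations, valid only on keys whose count is 0 or 1, as opposed to the reference-counted static_key_slow_inc()/dec(). A hypothetical usage sketch:

    #include <linux/jump_label.h>

    static DEFINE_STATIC_KEY_FALSE(my_key);

    static void my_feature_set(bool on)
    {
            if (on)
                    static_branch_enable(&my_key);   /* no-op if already on  */
            else
                    static_branch_disable(&my_key);  /* no-op if already off */
    }

    static bool my_feature_fast_path(void)
    {
            return static_branch_unlikely(&my_key);
    }
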
diff --git a/include/linux/kasan-checks.h b/include/linux/kasan-checks.h
index b7f8aced7870..41960fecf783 100644
--- a/include/linux/kasan-checks.h
+++ b/include/linux/kasan-checks.h
@@ -2,11 +2,13 @@
2#define _LINUX_KASAN_CHECKS_H 2#define _LINUX_KASAN_CHECKS_H
3 3
4#ifdef CONFIG_KASAN 4#ifdef CONFIG_KASAN
5void kasan_check_read(const void *p, unsigned int size); 5void kasan_check_read(const volatile void *p, unsigned int size);
6void kasan_check_write(const void *p, unsigned int size); 6void kasan_check_write(const volatile void *p, unsigned int size);
7#else 7#else
8static inline void kasan_check_read(const void *p, unsigned int size) { } 8static inline void kasan_check_read(const volatile void *p, unsigned int size)
9static inline void kasan_check_write(const void *p, unsigned int size) { } 9{ }
10static inline void kasan_check_write(const volatile void *p, unsigned int size)
11{ }
10#endif 12#endif
11 13
12#endif 14#endif
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index bd6d96cf80b1..6607225d0ea4 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -277,6 +277,13 @@ extern int oops_may_print(void);
277void do_exit(long error_code) __noreturn; 277void do_exit(long error_code) __noreturn;
278void complete_and_exit(struct completion *, long) __noreturn; 278void complete_and_exit(struct completion *, long) __noreturn;
279 279
280#ifdef CONFIG_ARCH_HAS_REFCOUNT
281void refcount_error_report(struct pt_regs *regs, const char *err);
282#else
283static inline void refcount_error_report(struct pt_regs *regs, const char *err)
284{ }
285#endif
286
280/* Internal, do not use. */ 287/* Internal, do not use. */
281int __must_check _kstrtoul(const char *s, unsigned int base, unsigned long *res); 288int __must_check _kstrtoul(const char *s, unsigned int base, unsigned long *res);
282int __must_check _kstrtol(const char *s, unsigned int base, long *res); 289int __must_check _kstrtol(const char *s, unsigned int base, long *res);
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index fffe49f188e6..fc827cab6d6e 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -29,7 +29,7 @@ extern int lock_stat;
29 * We'd rather not expose kernel/lockdep_states.h this wide, but we do need 29 * We'd rather not expose kernel/lockdep_states.h this wide, but we do need
30 * the total number of states... :-( 30 * the total number of states... :-(
31 */ 31 */
32#define XXX_LOCK_USAGE_STATES (1+3*4) 32#define XXX_LOCK_USAGE_STATES (1+2*4)
33 33
34/* 34/*
35 * NR_LOCKDEP_CACHING_CLASSES ... Number of classes 35 * NR_LOCKDEP_CACHING_CLASSES ... Number of classes
@@ -155,6 +155,12 @@ struct lockdep_map {
155 int cpu; 155 int cpu;
156 unsigned long ip; 156 unsigned long ip;
157#endif 157#endif
158#ifdef CONFIG_LOCKDEP_CROSSRELEASE
159 /*
160 * Whether it's a crosslock.
161 */
162 int cross;
163#endif
158}; 164};
159 165
160static inline void lockdep_copy_map(struct lockdep_map *to, 166static inline void lockdep_copy_map(struct lockdep_map *to,
@@ -258,8 +264,95 @@ struct held_lock {
258 unsigned int hardirqs_off:1; 264 unsigned int hardirqs_off:1;
259 unsigned int references:12; /* 32 bits */ 265 unsigned int references:12; /* 32 bits */
260 unsigned int pin_count; 266 unsigned int pin_count;
267#ifdef CONFIG_LOCKDEP_CROSSRELEASE
268 /*
269 * Generation id.
270 *
271 * A value of cross_gen_id will be stored when holding this,
272 * which is globally increased whenever each crosslock is held.
273 */
274 unsigned int gen_id;
275#endif
276};
277
278#ifdef CONFIG_LOCKDEP_CROSSRELEASE
279#define MAX_XHLOCK_TRACE_ENTRIES 5
280
281/*
282 * This is for keeping locks waiting for commit so that true dependencies
283 * can be added at commit step.
284 */
285struct hist_lock {
286 /*
287 * Id for each entry in the ring buffer. This is used to
288 * decide whether the ring buffer was overwritten or not.
289 *
290 * For example,
291 *
292 * |<----------- hist_lock ring buffer size ------->|
293 * pppppppppppppppppppppiiiiiiiiiiiiiiiiiiiiiiiiiiiii
294 * wrapped > iiiiiiiiiiiiiiiiiiiiiiiiiii.......................
295 *
296 * where 'p' represents an acquisition in process
297 * context, 'i' represents an acquisition in irq
298 * context.
299 *
300 * In this example, the ring buffer was overwritten by
301 * acquisitions in irq context, that should be detected on
302 * rollback or commit.
303 */
304 unsigned int hist_id;
305
306 /*
 307 * Separate stack_trace data. This will be used at commit step.
308 */
309 struct stack_trace trace;
310 unsigned long trace_entries[MAX_XHLOCK_TRACE_ENTRIES];
311
312 /*
 313 * Separate hlock instance. This will be used at commit step.
314 *
315 * TODO: Use a smaller data structure containing only necessary
316 * data. However, we should make lockdep code able to handle the
317 * smaller one first.
318 */
319 struct held_lock hlock;
320};
321
322/*
323 * To initialize a lock as crosslock, lockdep_init_map_crosslock() should
324 * be called instead of lockdep_init_map().
325 */
326struct cross_lock {
327 /*
 328 * When multiple acquisitions of crosslocks overlap,
329 * we have to perform commit for them based on cross_gen_id of
330 * the first acquisition, which allows us to add more true
331 * dependencies.
332 *
333 * Moreover, when no acquisition of a crosslock is in progress,
334 * we should not perform commit because the lock might not exist
335 * any more, which might cause incorrect memory access. So we
336 * have to track the number of acquisitions of a crosslock.
337 */
338 int nr_acquire;
339
340 /*
 341 * Separate hlock instance. This will be used at commit step.
342 *
343 * TODO: Use a smaller data structure containing only necessary
344 * data. However, we should make lockdep code able to handle the
345 * smaller one first.
346 */
347 struct held_lock hlock;
261}; 348};
262 349
350struct lockdep_map_cross {
351 struct lockdep_map map;
352 struct cross_lock xlock;
353};
354#endif
355
263/* 356/*
264 * Initialization, self-test and debugging-output methods: 357 * Initialization, self-test and debugging-output methods:
265 */ 358 */
@@ -282,13 +375,6 @@ extern void lockdep_init_map(struct lockdep_map *lock, const char *name,
282 struct lock_class_key *key, int subclass); 375 struct lock_class_key *key, int subclass);
283 376
284/* 377/*
285 * To initialize a lockdep_map statically use this macro.
286 * Note that _name must not be NULL.
287 */
288#define STATIC_LOCKDEP_MAP_INIT(_name, _key) \
289 { .name = (_name), .key = (void *)(_key), }
290
291/*
292 * Reinitialize a lock key - for cases where there is special locking or 378 * Reinitialize a lock key - for cases where there is special locking or
293 * special initialization of locks so that the validator gets the scope 379 * special initialization of locks so that the validator gets the scope
294 * of dependencies wrong: they are either too broad (they need a class-split) 380 * of dependencies wrong: they are either too broad (they need a class-split)
@@ -363,10 +449,6 @@ static inline void lock_set_subclass(struct lockdep_map *lock,
363 449
364extern void lock_downgrade(struct lockdep_map *lock, unsigned long ip); 450extern void lock_downgrade(struct lockdep_map *lock, unsigned long ip);
365 451
366extern void lockdep_set_current_reclaim_state(gfp_t gfp_mask);
367extern void lockdep_clear_current_reclaim_state(void);
368extern void lockdep_trace_alloc(gfp_t mask);
369
370struct pin_cookie { unsigned int val; }; 452struct pin_cookie { unsigned int val; };
371 453
372#define NIL_COOKIE (struct pin_cookie){ .val = 0U, } 454#define NIL_COOKIE (struct pin_cookie){ .val = 0U, }
@@ -375,7 +457,7 @@ extern struct pin_cookie lock_pin_lock(struct lockdep_map *lock);
375extern void lock_repin_lock(struct lockdep_map *lock, struct pin_cookie); 457extern void lock_repin_lock(struct lockdep_map *lock, struct pin_cookie);
376extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie); 458extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie);
377 459
378# define INIT_LOCKDEP .lockdep_recursion = 0, .lockdep_reclaim_gfp = 0, 460# define INIT_LOCKDEP .lockdep_recursion = 0,
379 461
380#define lockdep_depth(tsk) (debug_locks ? (tsk)->lockdep_depth : 0) 462#define lockdep_depth(tsk) (debug_locks ? (tsk)->lockdep_depth : 0)
381 463
@@ -416,9 +498,6 @@ static inline void lockdep_on(void)
416# define lock_downgrade(l, i) do { } while (0) 498# define lock_downgrade(l, i) do { } while (0)
417# define lock_set_class(l, n, k, s, i) do { } while (0) 499# define lock_set_class(l, n, k, s, i) do { } while (0)
418# define lock_set_subclass(l, s, i) do { } while (0) 500# define lock_set_subclass(l, s, i) do { } while (0)
419# define lockdep_set_current_reclaim_state(g) do { } while (0)
420# define lockdep_clear_current_reclaim_state() do { } while (0)
421# define lockdep_trace_alloc(g) do { } while (0)
422# define lockdep_info() do { } while (0) 501# define lockdep_info() do { } while (0)
423# define lockdep_init_map(lock, name, key, sub) \ 502# define lockdep_init_map(lock, name, key, sub) \
424 do { (void)(name); (void)(key); } while (0) 503 do { (void)(name); (void)(key); } while (0)
@@ -467,6 +546,57 @@ struct pin_cookie { };
467 546
468#endif /* !LOCKDEP */ 547#endif /* !LOCKDEP */
469 548
549enum xhlock_context_t {
550 XHLOCK_HARD,
551 XHLOCK_SOFT,
552 XHLOCK_PROC,
553 XHLOCK_CTX_NR,
554};
555
556#ifdef CONFIG_LOCKDEP_CROSSRELEASE
557extern void lockdep_init_map_crosslock(struct lockdep_map *lock,
558 const char *name,
559 struct lock_class_key *key,
560 int subclass);
561extern void lock_commit_crosslock(struct lockdep_map *lock);
562
563/*
 564 * What we essentially have to initialize is 'nr_acquire'. Other members
565 * will be initialized in add_xlock().
566 */
567#define STATIC_CROSS_LOCK_INIT() \
568 { .nr_acquire = 0,}
569
570#define STATIC_CROSS_LOCKDEP_MAP_INIT(_name, _key) \
571 { .map.name = (_name), .map.key = (void *)(_key), \
572 .map.cross = 1, .xlock = STATIC_CROSS_LOCK_INIT(), }
573
574/*
575 * To initialize a lockdep_map statically use this macro.
576 * Note that _name must not be NULL.
577 */
578#define STATIC_LOCKDEP_MAP_INIT(_name, _key) \
579 { .name = (_name), .key = (void *)(_key), .cross = 0, }
580
581extern void crossrelease_hist_start(enum xhlock_context_t c);
582extern void crossrelease_hist_end(enum xhlock_context_t c);
583extern void lockdep_init_task(struct task_struct *task);
584extern void lockdep_free_task(struct task_struct *task);
585#else
586#define lockdep_init_map_crosslock(m, n, k, s) do {} while (0)
587/*
588 * To initialize a lockdep_map statically use this macro.
589 * Note that _name must not be NULL.
590 */
591#define STATIC_LOCKDEP_MAP_INIT(_name, _key) \
592 { .name = (_name), .key = (void *)(_key), }
593
594static inline void crossrelease_hist_start(enum xhlock_context_t c) {}
595static inline void crossrelease_hist_end(enum xhlock_context_t c) {}
596static inline void lockdep_init_task(struct task_struct *task) {}
597static inline void lockdep_free_task(struct task_struct *task) {}
598#endif
599
470#ifdef CONFIG_LOCK_STAT 600#ifdef CONFIG_LOCK_STAT
471 601
472extern void lock_contended(struct lockdep_map *lock, unsigned long ip); 602extern void lock_contended(struct lockdep_map *lock, unsigned long ip);
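
The hist_id scheme described in the struct hist_lock comment can be modelled compactly: every entry pushed into the per-task ring buffer gets a monotonically increasing id, and a saved (index, id) pair taken at a context boundary lets the rollback path detect whether that slot was overwritten in the meantime. Illustrative model only, with made-up names, not the lockdep code:

    #define XHIST_SIZE 64U

    struct xhist {
            unsigned int next_id;                   /* monotonically increasing */
            unsigned int idx;                       /* next slot to fill        */
            unsigned int slot_id[XHIST_SIZE];       /* id stored in each slot   */
    };

    static void xhist_push(struct xhist *h)
    {
            h->slot_id[h->idx++ % XHIST_SIZE] = h->next_id++;
    }

    struct xhist_mark {
            unsigned int idx;
            unsigned int id;
    };

    /* Taken when entering a context (hardirq/softirq/process boundary). */
    static struct xhist_mark xhist_mark(const struct xhist *h)
    {
            unsigned int idx = h->idx % XHIST_SIZE;

            return (struct xhist_mark){ .idx = idx, .id = h->slot_id[idx] };
    }

    /* Checked when rolling back to that boundary. */
    static int xhist_overwritten(const struct xhist *h, struct xhist_mark m)
    {
            /*
             * If the slot we want to roll back to now carries a different id,
             * the ring buffer wrapped past it while the context was active.
             */
            return h->slot_id[m.idx] != m.id;
    }
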
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 3cadee0a3508..dc1edec05a3f 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -535,6 +535,10 @@ extern void tlb_finish_mmu(struct mmu_gather *tlb,
535 */ 535 */
536static inline bool mm_tlb_flush_pending(struct mm_struct *mm) 536static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
537{ 537{
538 /*
539 * Must be called with PTL held; such that our PTL acquire will have
540 * observed the store from set_tlb_flush_pending().
541 */
538 return atomic_read(&mm->tlb_flush_pending) > 0; 542 return atomic_read(&mm->tlb_flush_pending) > 0;
539} 543}
540 544
@@ -556,10 +560,29 @@ static inline void inc_tlb_flush_pending(struct mm_struct *mm)
556 atomic_inc(&mm->tlb_flush_pending); 560 atomic_inc(&mm->tlb_flush_pending);
557 561
558 /* 562 /*
559 * Guarantee that the tlb_flush_pending increase does not leak into the 563 * The only time this value is relevant is when there are indeed pages
560 * critical section updating the page tables 564 * to flush. And we'll only flush pages after changing them, which
565 * requires the PTL.
566 *
567 * So the ordering here is:
568 *
569 * atomic_inc(&mm->tlb_flush_pending);
570 * spin_lock(&ptl);
571 * ...
572 * set_pte_at();
573 * spin_unlock(&ptl);
574 *
575 * spin_lock(&ptl)
576 * mm_tlb_flush_pending();
577 * ....
578 * spin_unlock(&ptl);
579 *
580 * flush_tlb_range();
581 * atomic_dec(&mm->tlb_flush_pending);
582 *
583 * So the =true store is constrained by the PTL unlock, and the =false
584 * store is constrained by the TLB invalidate.
561 */ 585 */
562 smp_mb__before_spinlock();
563} 586}
564 587
565/* Clearing is done after a TLB flush, which also provides a barrier. */ 588/* Clearing is done after a TLB flush, which also provides a barrier. */
diff --git a/include/linux/refcount.h b/include/linux/refcount.h
index 591792c8e5b0..48b7c9c68c4d 100644
--- a/include/linux/refcount.h
+++ b/include/linux/refcount.h
@@ -53,6 +53,9 @@ extern __must_check bool refcount_sub_and_test(unsigned int i, refcount_t *r);
53extern __must_check bool refcount_dec_and_test(refcount_t *r); 53extern __must_check bool refcount_dec_and_test(refcount_t *r);
54extern void refcount_dec(refcount_t *r); 54extern void refcount_dec(refcount_t *r);
55#else 55#else
56# ifdef CONFIG_ARCH_HAS_REFCOUNT
57# include <asm/refcount.h>
58# else
56static inline __must_check bool refcount_add_not_zero(unsigned int i, refcount_t *r) 59static inline __must_check bool refcount_add_not_zero(unsigned int i, refcount_t *r)
57{ 60{
58 return atomic_add_unless(&r->refs, i, 0); 61 return atomic_add_unless(&r->refs, i, 0);
@@ -87,6 +90,7 @@ static inline void refcount_dec(refcount_t *r)
87{ 90{
88 atomic_dec(&r->refs); 91 atomic_dec(&r->refs);
89} 92}
93# endif /* !CONFIG_ARCH_HAS_REFCOUNT */
90#endif /* CONFIG_REFCOUNT_FULL */ 94#endif /* CONFIG_REFCOUNT_FULL */
91 95
92extern __must_check bool refcount_dec_if_one(refcount_t *r); 96extern __must_check bool refcount_dec_if_one(refcount_t *r);
diff --git a/include/linux/rwsem-spinlock.h b/include/linux/rwsem-spinlock.h
index ae0528b834cd..e784761a4443 100644
--- a/include/linux/rwsem-spinlock.h
+++ b/include/linux/rwsem-spinlock.h
@@ -32,6 +32,7 @@ struct rw_semaphore {
32#define RWSEM_UNLOCKED_VALUE 0x00000000 32#define RWSEM_UNLOCKED_VALUE 0x00000000
33 33
34extern void __down_read(struct rw_semaphore *sem); 34extern void __down_read(struct rw_semaphore *sem);
35extern int __must_check __down_read_killable(struct rw_semaphore *sem);
35extern int __down_read_trylock(struct rw_semaphore *sem); 36extern int __down_read_trylock(struct rw_semaphore *sem);
36extern void __down_write(struct rw_semaphore *sem); 37extern void __down_write(struct rw_semaphore *sem);
37extern int __must_check __down_write_killable(struct rw_semaphore *sem); 38extern int __must_check __down_write_killable(struct rw_semaphore *sem);
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index dd1d14250340..0ad7318ff299 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -44,6 +44,7 @@ struct rw_semaphore {
44}; 44};
45 45
46extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); 46extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
47extern struct rw_semaphore *rwsem_down_read_failed_killable(struct rw_semaphore *sem);
47extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); 48extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
48extern struct rw_semaphore *rwsem_down_write_failed_killable(struct rw_semaphore *sem); 49extern struct rw_semaphore *rwsem_down_write_failed_killable(struct rw_semaphore *sem);
49extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *); 50extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c05ac5f5aa03..93be319e0cbf 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -846,7 +846,17 @@ struct task_struct {
846 int lockdep_depth; 846 int lockdep_depth;
847 unsigned int lockdep_recursion; 847 unsigned int lockdep_recursion;
848 struct held_lock held_locks[MAX_LOCK_DEPTH]; 848 struct held_lock held_locks[MAX_LOCK_DEPTH];
849 gfp_t lockdep_reclaim_gfp; 849#endif
850
851#ifdef CONFIG_LOCKDEP_CROSSRELEASE
852#define MAX_XHLOCKS_NR 64UL
853 struct hist_lock *xhlocks; /* Crossrelease history locks */
854 unsigned int xhlock_idx;
855 /* For restoring at history boundaries */
856 unsigned int xhlock_idx_hist[XHLOCK_CTX_NR];
857 unsigned int hist_id;
858 /* For overwrite check at each context exit */
859 unsigned int hist_id_save[XHLOCK_CTX_NR];
850#endif 860#endif
851 861
852#ifdef CONFIG_UBSAN 862#ifdef CONFIG_UBSAN
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 2b24a6974847..2b0a281f9d26 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -167,6 +167,14 @@ static inline gfp_t current_gfp_context(gfp_t flags)
167 return flags; 167 return flags;
168} 168}
169 169
170#ifdef CONFIG_LOCKDEP
171extern void fs_reclaim_acquire(gfp_t gfp_mask);
172extern void fs_reclaim_release(gfp_t gfp_mask);
173#else
174static inline void fs_reclaim_acquire(gfp_t gfp_mask) { }
175static inline void fs_reclaim_release(gfp_t gfp_mask) { }
176#endif
177
170static inline unsigned int memalloc_noio_save(void) 178static inline unsigned int memalloc_noio_save(void)
171{ 179{
172 unsigned int flags = current->flags & PF_MEMALLOC_NOIO; 180 unsigned int flags = current->flags & PF_MEMALLOC_NOIO;
diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index d9510e8522d4..4e8cce19b507 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -118,16 +118,39 @@ do { \
118#endif 118#endif
119 119
120/* 120/*
121 * Despite its name it doesn't necessarily has to be a full barrier. 121 * This barrier must provide two things:
122 * It should only guarantee that a STORE before the critical section 122 *
123 * can not be reordered with LOADs and STOREs inside this section. 123 * - it must guarantee a STORE before the spin_lock() is ordered against a
124 * spin_lock() is the one-way barrier, this LOAD can not escape out 124 * LOAD after it, see the comments at its two usage sites.
125 * of the region. So the default implementation simply ensures that 125 *
126 * a STORE can not move into the critical section, smp_wmb() should 126 * - it must ensure the critical section is RCsc.
127 * serialize it with another STORE done by spin_lock(). 127 *
128 * The latter is important for cases where we observe values written by other
129 * CPUs in spin-loops, without barriers, while being subject to scheduling.
130 *
131 * CPU0 CPU1 CPU2
132 *
133 * for (;;) {
134 * if (READ_ONCE(X))
135 * break;
136 * }
137 * X=1
138 * <sched-out>
139 * <sched-in>
140 * r = X;
141 *
 142 * without transitivity it could be that CPU1 observes X!=0 and breaks the loop,
143 * we get migrated and CPU2 sees X==0.
144 *
145 * Since most load-store architectures implement ACQUIRE with an smp_mb() after
146 * the LL/SC loop, they need no further barriers. Similarly all our TSO
147 * architectures imply an smp_mb() for each atomic instruction and equally don't
148 * need more.
149 *
150 * Architectures that can implement ACQUIRE better need to take care.
128 */ 151 */
129#ifndef smp_mb__before_spinlock 152#ifndef smp_mb__after_spinlock
130#define smp_mb__before_spinlock() smp_wmb() 153#define smp_mb__after_spinlock() do { } while (0)
131#endif 154#endif
132 155
133/** 156/**
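
A minimal sketch of the pattern smp_mb__after_spinlock() exists for, loosely modelled on the scheduler's wakeup path; the struct and the queueing helper are hypothetical:

    #include <linux/spinlock.h>

    struct waiter {
            int             cond;
            int             on_rq;
            raw_spinlock_t  lock;
    };

    static void queue_wakeup(struct waiter *w) { /* hypothetical */ }

    static void wake(struct waiter *w)
    {
            WRITE_ONCE(w->cond, 1);         /* STORE before the lock ...        */

            raw_spin_lock(&w->lock);
            smp_mb__after_spinlock();       /* ... ordered against LOADs below  */

            if (!READ_ONCE(w->on_rq))       /* LOAD inside the critical section */
                    queue_wakeup(w);

            raw_spin_unlock(&w->lock);
    }
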
diff --git a/init/Kconfig b/init/Kconfig
index 8514b25db21c..5f0ef850e808 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1275,12 +1275,17 @@ config BASE_FULL
1275config FUTEX 1275config FUTEX
1276 bool "Enable futex support" if EXPERT 1276 bool "Enable futex support" if EXPERT
1277 default y 1277 default y
1278 select RT_MUTEXES 1278 imply RT_MUTEXES
1279 help 1279 help
1280 Disabling this option will cause the kernel to be built without 1280 Disabling this option will cause the kernel to be built without
1281 support for "fast userspace mutexes". The resulting kernel may not 1281 support for "fast userspace mutexes". The resulting kernel may not
1282 run glibc-based applications correctly. 1282 run glibc-based applications correctly.
1283 1283
1284config FUTEX_PI
1285 bool
1286 depends on FUTEX && RT_MUTEXES
1287 default y
1288
1284config HAVE_FUTEX_CMPXCHG 1289config HAVE_FUTEX_CMPXCHG
1285 bool 1290 bool
1286 depends on FUTEX 1291 depends on FUTEX
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 8d5151688504..9ed6a051a1b9 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -577,6 +577,13 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr,
577 rcu_read_unlock(); 577 rcu_read_unlock();
578} 578}
579 579
580/* Must be called with cpuset_mutex held. */
581static inline int nr_cpusets(void)
582{
583 /* jump label reference count + the top-level cpuset */
584 return static_key_count(&cpusets_enabled_key.key) + 1;
585}
586
580/* 587/*
581 * generate_sched_domains() 588 * generate_sched_domains()
582 * 589 *
diff --git a/kernel/exit.c b/kernel/exit.c
index c5548faa9f37..fa72d57db747 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -920,6 +920,7 @@ void __noreturn do_exit(long code)
920 exit_rcu(); 920 exit_rcu();
921 TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i)); 921 TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i));
922 922
923 lockdep_free_task(tsk);
923 do_task_dead(); 924 do_task_dead();
924} 925}
925EXPORT_SYMBOL_GPL(do_exit); 926EXPORT_SYMBOL_GPL(do_exit);
diff --git a/kernel/fork.c b/kernel/fork.c
index e075b7780421..5fc09911fbb9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -484,6 +484,8 @@ void __init fork_init(void)
484 cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache", 484 cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
485 NULL, free_vm_stack_cache); 485 NULL, free_vm_stack_cache);
486#endif 486#endif
487
488 lockdep_init_task(&init_task);
487} 489}
488 490
489int __weak arch_dup_task_struct(struct task_struct *dst, 491int __weak arch_dup_task_struct(struct task_struct *dst,
@@ -1691,6 +1693,7 @@ static __latent_entropy struct task_struct *copy_process(
1691 p->lockdep_depth = 0; /* no locks held yet */ 1693 p->lockdep_depth = 0; /* no locks held yet */
1692 p->curr_chain_key = 0; 1694 p->curr_chain_key = 0;
1693 p->lockdep_recursion = 0; 1695 p->lockdep_recursion = 0;
1696 lockdep_init_task(p);
1694#endif 1697#endif
1695 1698
1696#ifdef CONFIG_DEBUG_MUTEXES 1699#ifdef CONFIG_DEBUG_MUTEXES
@@ -1949,6 +1952,7 @@ bad_fork_cleanup_audit:
1949bad_fork_cleanup_perf: 1952bad_fork_cleanup_perf:
1950 perf_event_free_task(p); 1953 perf_event_free_task(p);
1951bad_fork_cleanup_policy: 1954bad_fork_cleanup_policy:
1955 lockdep_free_task(p);
1952#ifdef CONFIG_NUMA 1956#ifdef CONFIG_NUMA
1953 mpol_put(p->mempolicy); 1957 mpol_put(p->mempolicy);
1954bad_fork_cleanup_threadgroup_lock: 1958bad_fork_cleanup_threadgroup_lock:
diff --git a/kernel/futex.c b/kernel/futex.c
index f50b434756c1..0939255fc750 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -876,6 +876,8 @@ static struct task_struct *futex_find_get_task(pid_t pid)
876 return p; 876 return p;
877} 877}
878 878
879#ifdef CONFIG_FUTEX_PI
880
879/* 881/*
880 * This task is holding PI mutexes at exit time => bad. 882 * This task is holding PI mutexes at exit time => bad.
881 * Kernel cleans up PI-state, but userspace is likely hosed. 883 * Kernel cleans up PI-state, but userspace is likely hosed.
@@ -933,6 +935,8 @@ void exit_pi_state_list(struct task_struct *curr)
933 raw_spin_unlock_irq(&curr->pi_lock); 935 raw_spin_unlock_irq(&curr->pi_lock);
934} 936}
935 937
938#endif
939
936/* 940/*
937 * We need to check the following states: 941 * We need to check the following states:
938 * 942 *
@@ -1800,6 +1804,15 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
1800 struct futex_q *this, *next; 1804 struct futex_q *this, *next;
1801 DEFINE_WAKE_Q(wake_q); 1805 DEFINE_WAKE_Q(wake_q);
1802 1806
1807 /*
 1808 * When PI is not supported: return -ENOSYS if requeue_pi is true,
1809 * consequently the compiler knows requeue_pi is always false past
1810 * this point which will optimize away all the conditional code
1811 * further down.
1812 */
1813 if (!IS_ENABLED(CONFIG_FUTEX_PI) && requeue_pi)
1814 return -ENOSYS;
1815
1803 if (requeue_pi) { 1816 if (requeue_pi) {
1804 /* 1817 /*
1805 * Requeue PI only works on two distinct uaddrs. This 1818 * Requeue PI only works on two distinct uaddrs. This
@@ -2595,6 +2608,9 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
2595 struct futex_q q = futex_q_init; 2608 struct futex_q q = futex_q_init;
2596 int res, ret; 2609 int res, ret;
2597 2610
2611 if (!IS_ENABLED(CONFIG_FUTEX_PI))
2612 return -ENOSYS;
2613
2598 if (refill_pi_state_cache()) 2614 if (refill_pi_state_cache())
2599 return -ENOMEM; 2615 return -ENOMEM;
2600 2616
@@ -2774,6 +2790,9 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
2774 struct futex_q *top_waiter; 2790 struct futex_q *top_waiter;
2775 int ret; 2791 int ret;
2776 2792
2793 if (!IS_ENABLED(CONFIG_FUTEX_PI))
2794 return -ENOSYS;
2795
2777retry: 2796retry:
2778 if (get_user(uval, uaddr)) 2797 if (get_user(uval, uaddr))
2779 return -EFAULT; 2798 return -EFAULT;
@@ -2984,6 +3003,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2984 struct futex_q q = futex_q_init; 3003 struct futex_q q = futex_q_init;
2985 int res, ret; 3004 int res, ret;
2986 3005
3006 if (!IS_ENABLED(CONFIG_FUTEX_PI))
3007 return -ENOSYS;
3008
2987 if (uaddr == uaddr2) 3009 if (uaddr == uaddr2)
2988 return -EINVAL; 3010 return -EINVAL;
2989 3011
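
The same "fail early and let the compiler prune" shape used in futex_requeue() above, shown in isolation with a hypothetical config option and helpers:

    #include <linux/kernel.h>
    #include <linux/errno.h>

    static int plain_path(void) { return 0; }
    static int fancy_path(void) { return 0; }  /* can be dropped when unreachable */

    static int do_op(bool fancy)
    {
            /*
             * With CONFIG_MY_FANCY_OP disabled, this early return lets the
             * compiler prove 'fancy' is false for the rest of the function,
             * so the branch below becomes dead code.
             */
            if (!IS_ENABLED(CONFIG_MY_FANCY_OP) && fancy)
                    return -ENOSYS;

            if (fancy)
                    return fancy_path();

            return plain_path();
    }
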
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index d11c506a6ac3..0bf2e8f5244a 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -79,29 +79,7 @@ int static_key_count(struct static_key *key)
79} 79}
80EXPORT_SYMBOL_GPL(static_key_count); 80EXPORT_SYMBOL_GPL(static_key_count);
81 81
82void static_key_enable(struct static_key *key) 82static void static_key_slow_inc_cpuslocked(struct static_key *key)
83{
84 int count = static_key_count(key);
85
86 WARN_ON_ONCE(count < 0 || count > 1);
87
88 if (!count)
89 static_key_slow_inc(key);
90}
91EXPORT_SYMBOL_GPL(static_key_enable);
92
93void static_key_disable(struct static_key *key)
94{
95 int count = static_key_count(key);
96
97 WARN_ON_ONCE(count < 0 || count > 1);
98
99 if (count)
100 static_key_slow_dec(key);
101}
102EXPORT_SYMBOL_GPL(static_key_disable);
103
104void static_key_slow_inc(struct static_key *key)
105{ 83{
106 int v, v1; 84 int v, v1;
107 85
@@ -125,24 +103,87 @@ void static_key_slow_inc(struct static_key *key)
125 return; 103 return;
126 } 104 }
127 105
128 cpus_read_lock();
129 jump_label_lock(); 106 jump_label_lock();
130 if (atomic_read(&key->enabled) == 0) { 107 if (atomic_read(&key->enabled) == 0) {
131 atomic_set(&key->enabled, -1); 108 atomic_set(&key->enabled, -1);
132 jump_label_update(key); 109 jump_label_update(key);
133 atomic_set(&key->enabled, 1); 110 /*
111 * Ensure that if the above cmpxchg loop observes our positive
112 * value, it must also observe all the text changes.
113 */
114 atomic_set_release(&key->enabled, 1);
134 } else { 115 } else {
135 atomic_inc(&key->enabled); 116 atomic_inc(&key->enabled);
136 } 117 }
137 jump_label_unlock(); 118 jump_label_unlock();
119}
120
121void static_key_slow_inc(struct static_key *key)
122{
123 cpus_read_lock();
124 static_key_slow_inc_cpuslocked(key);
138 cpus_read_unlock(); 125 cpus_read_unlock();
139} 126}
140EXPORT_SYMBOL_GPL(static_key_slow_inc); 127EXPORT_SYMBOL_GPL(static_key_slow_inc);
141 128
142static void __static_key_slow_dec(struct static_key *key, 129void static_key_enable_cpuslocked(struct static_key *key)
143 unsigned long rate_limit, struct delayed_work *work) 130{
131 STATIC_KEY_CHECK_USE();
132
133 if (atomic_read(&key->enabled) > 0) {
134 WARN_ON_ONCE(atomic_read(&key->enabled) != 1);
135 return;
136 }
137
138 jump_label_lock();
139 if (atomic_read(&key->enabled) == 0) {
140 atomic_set(&key->enabled, -1);
141 jump_label_update(key);
142 /*
143 * See static_key_slow_inc().
144 */
145 atomic_set_release(&key->enabled, 1);
146 }
147 jump_label_unlock();
148}
149EXPORT_SYMBOL_GPL(static_key_enable_cpuslocked);
150
151void static_key_enable(struct static_key *key)
152{
153 cpus_read_lock();
154 static_key_enable_cpuslocked(key);
155 cpus_read_unlock();
156}
157EXPORT_SYMBOL_GPL(static_key_enable);
158
159void static_key_disable_cpuslocked(struct static_key *key)
160{
161 STATIC_KEY_CHECK_USE();
162
163 if (atomic_read(&key->enabled) != 1) {
164 WARN_ON_ONCE(atomic_read(&key->enabled) != 0);
165 return;
166 }
167
168 jump_label_lock();
169 if (atomic_cmpxchg(&key->enabled, 1, 0))
170 jump_label_update(key);
171 jump_label_unlock();
172}
173EXPORT_SYMBOL_GPL(static_key_disable_cpuslocked);
174
175void static_key_disable(struct static_key *key)
144{ 176{
145 cpus_read_lock(); 177 cpus_read_lock();
178 static_key_disable_cpuslocked(key);
179 cpus_read_unlock();
180}
181EXPORT_SYMBOL_GPL(static_key_disable);
182
183static void static_key_slow_dec_cpuslocked(struct static_key *key,
184 unsigned long rate_limit,
185 struct delayed_work *work)
186{
146 /* 187 /*
147 * The negative count check is valid even when a negative 188 * The negative count check is valid even when a negative
148 * key->enabled is in use by static_key_slow_inc(); a 189 * key->enabled is in use by static_key_slow_inc(); a
@@ -153,7 +194,6 @@ static void __static_key_slow_dec(struct static_key *key,
153 if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) { 194 if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) {
154 WARN(atomic_read(&key->enabled) < 0, 195 WARN(atomic_read(&key->enabled) < 0,
155 "jump label: negative count!\n"); 196 "jump label: negative count!\n");
156 cpus_read_unlock();
157 return; 197 return;
158 } 198 }
159 199
@@ -164,6 +204,14 @@ static void __static_key_slow_dec(struct static_key *key,
164 jump_label_update(key); 204 jump_label_update(key);
165 } 205 }
166 jump_label_unlock(); 206 jump_label_unlock();
207}
208
209static void __static_key_slow_dec(struct static_key *key,
210 unsigned long rate_limit,
211 struct delayed_work *work)
212{
213 cpus_read_lock();
214 static_key_slow_dec_cpuslocked(key, rate_limit, work);
167 cpus_read_unlock(); 215 cpus_read_unlock();
168} 216}
169 217
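The refactor above splits the static-key slow paths into *_cpuslocked() variants so callers that already hold the CPU hotplug read lock can flip keys without re-taking it. A minimal caller-side sketch, assuming a kernel-module context; the key name and functions below are invented for illustration, only the jump-label API calls are real:

#include <linux/jump_label.h>
#include <linux/cpu.h>

static DEFINE_STATIC_KEY_FALSE(demo_feature_key);	/* hypothetical key */

static void demo_toggle(bool on)
{
	/*
	 * Plain callers use the static_branch helpers, which take
	 * cpus_read_lock() internally via static_key_enable()/_disable().
	 */
	if (on)
		static_branch_enable(&demo_feature_key);
	else
		static_branch_disable(&demo_feature_key);
}

static void demo_toggle_cpuslocked(bool on)
{
	/* Caller is already inside a cpus_read_lock() section. */
	lockdep_assert_cpus_held();
	if (on)
		static_key_enable_cpuslocked(&demo_feature_key.key);
	else
		static_key_disable_cpuslocked(&demo_feature_key.key);
}

static bool demo_fast_path(void)
{
	/* Patched to a jump/no-op at runtime; no memory load on the fast path. */
	return static_branch_unlikely(&demo_feature_key);
}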
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 7d2499bec5fe..66011c9f5df3 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -58,6 +58,10 @@
58#define CREATE_TRACE_POINTS 58#define CREATE_TRACE_POINTS
59#include <trace/events/lock.h> 59#include <trace/events/lock.h>
60 60
61#ifdef CONFIG_LOCKDEP_CROSSRELEASE
62#include <linux/slab.h>
63#endif
64
61#ifdef CONFIG_PROVE_LOCKING 65#ifdef CONFIG_PROVE_LOCKING
62int prove_locking = 1; 66int prove_locking = 1;
63module_param(prove_locking, int, 0644); 67module_param(prove_locking, int, 0644);
@@ -344,14 +348,12 @@ EXPORT_SYMBOL(lockdep_on);
344#if VERBOSE 348#if VERBOSE
345# define HARDIRQ_VERBOSE 1 349# define HARDIRQ_VERBOSE 1
346# define SOFTIRQ_VERBOSE 1 350# define SOFTIRQ_VERBOSE 1
347# define RECLAIM_VERBOSE 1
348#else 351#else
349# define HARDIRQ_VERBOSE 0 352# define HARDIRQ_VERBOSE 0
350# define SOFTIRQ_VERBOSE 0 353# define SOFTIRQ_VERBOSE 0
351# define RECLAIM_VERBOSE 0
352#endif 354#endif
353 355
354#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE || RECLAIM_VERBOSE 356#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE
355/* 357/*
356 * Quick filtering for interesting events: 358 * Quick filtering for interesting events:
357 */ 359 */
@@ -726,6 +728,18 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
726 return is_static || static_obj(lock->key) ? NULL : ERR_PTR(-EINVAL); 728 return is_static || static_obj(lock->key) ? NULL : ERR_PTR(-EINVAL);
727} 729}
728 730
731#ifdef CONFIG_LOCKDEP_CROSSRELEASE
732static void cross_init(struct lockdep_map *lock, int cross);
733static int cross_lock(struct lockdep_map *lock);
734static int lock_acquire_crosslock(struct held_lock *hlock);
735static int lock_release_crosslock(struct lockdep_map *lock);
736#else
737static inline void cross_init(struct lockdep_map *lock, int cross) {}
738static inline int cross_lock(struct lockdep_map *lock) { return 0; }
739static inline int lock_acquire_crosslock(struct held_lock *hlock) { return 2; }
740static inline int lock_release_crosslock(struct lockdep_map *lock) { return 2; }
741#endif
742
729/* 743/*
730 * Register a lock's class in the hash-table, if the class is not present 744 * Register a lock's class in the hash-table, if the class is not present
731 * yet. Otherwise we look it up. We cache the result in the lock object 745 * yet. Otherwise we look it up. We cache the result in the lock object
@@ -1125,22 +1139,41 @@ print_circular_lock_scenario(struct held_lock *src,
1125 printk(KERN_CONT "\n\n"); 1139 printk(KERN_CONT "\n\n");
1126 } 1140 }
1127 1141
1128 printk(" Possible unsafe locking scenario:\n\n"); 1142 if (cross_lock(tgt->instance)) {
1129 printk(" CPU0 CPU1\n"); 1143 printk(" Possible unsafe locking scenario by crosslock:\n\n");
1130 printk(" ---- ----\n"); 1144 printk(" CPU0 CPU1\n");
1131 printk(" lock("); 1145 printk(" ---- ----\n");
1132 __print_lock_name(target); 1146 printk(" lock(");
1133 printk(KERN_CONT ");\n"); 1147 __print_lock_name(parent);
1134 printk(" lock("); 1148 printk(KERN_CONT ");\n");
1135 __print_lock_name(parent); 1149 printk(" lock(");
1136 printk(KERN_CONT ");\n"); 1150 __print_lock_name(target);
1137 printk(" lock("); 1151 printk(KERN_CONT ");\n");
1138 __print_lock_name(target); 1152 printk(" lock(");
1139 printk(KERN_CONT ");\n"); 1153 __print_lock_name(source);
1140 printk(" lock("); 1154 printk(KERN_CONT ");\n");
1141 __print_lock_name(source); 1155 printk(" unlock(");
1142 printk(KERN_CONT ");\n"); 1156 __print_lock_name(target);
1143 printk("\n *** DEADLOCK ***\n\n"); 1157 printk(KERN_CONT ");\n");
1158 printk("\n *** DEADLOCK ***\n\n");
1159 } else {
1160 printk(" Possible unsafe locking scenario:\n\n");
1161 printk(" CPU0 CPU1\n");
1162 printk(" ---- ----\n");
1163 printk(" lock(");
1164 __print_lock_name(target);
1165 printk(KERN_CONT ");\n");
1166 printk(" lock(");
1167 __print_lock_name(parent);
1168 printk(KERN_CONT ");\n");
1169 printk(" lock(");
1170 __print_lock_name(target);
1171 printk(KERN_CONT ");\n");
1172 printk(" lock(");
1173 __print_lock_name(source);
1174 printk(KERN_CONT ");\n");
1175 printk("\n *** DEADLOCK ***\n\n");
1176 }
1144} 1177}
1145 1178
1146/* 1179/*
@@ -1165,7 +1198,12 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
1165 pr_warn("%s/%d is trying to acquire lock:\n", 1198 pr_warn("%s/%d is trying to acquire lock:\n",
1166 curr->comm, task_pid_nr(curr)); 1199 curr->comm, task_pid_nr(curr));
1167 print_lock(check_src); 1200 print_lock(check_src);
1168 pr_warn("\nbut task is already holding lock:\n"); 1201
1202 if (cross_lock(check_tgt->instance))
1203 pr_warn("\nbut now in release context of a crosslock acquired at the following:\n");
1204 else
1205 pr_warn("\nbut task is already holding lock:\n");
1206
1169 print_lock(check_tgt); 1207 print_lock(check_tgt);
1170 pr_warn("\nwhich lock already depends on the new lock.\n\n"); 1208 pr_warn("\nwhich lock already depends on the new lock.\n\n");
1171 pr_warn("\nthe existing dependency chain (in reverse order) is:\n"); 1209 pr_warn("\nthe existing dependency chain (in reverse order) is:\n");
@@ -1183,7 +1221,8 @@ static inline int class_equal(struct lock_list *entry, void *data)
1183static noinline int print_circular_bug(struct lock_list *this, 1221static noinline int print_circular_bug(struct lock_list *this,
1184 struct lock_list *target, 1222 struct lock_list *target,
1185 struct held_lock *check_src, 1223 struct held_lock *check_src,
1186 struct held_lock *check_tgt) 1224 struct held_lock *check_tgt,
1225 struct stack_trace *trace)
1187{ 1226{
1188 struct task_struct *curr = current; 1227 struct task_struct *curr = current;
1189 struct lock_list *parent; 1228 struct lock_list *parent;
@@ -1193,7 +1232,9 @@ static noinline int print_circular_bug(struct lock_list *this,
1193 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 1232 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
1194 return 0; 1233 return 0;
1195 1234
1196 if (!save_trace(&this->trace)) 1235 if (cross_lock(check_tgt->instance))
1236 this->trace = *trace;
1237 else if (!save_trace(&this->trace))
1197 return 0; 1238 return 0;
1198 1239
1199 depth = get_lock_depth(target); 1240 depth = get_lock_depth(target);
@@ -1309,6 +1350,19 @@ check_noncircular(struct lock_list *root, struct lock_class *target,
1309 return result; 1350 return result;
1310} 1351}
1311 1352
1353static noinline int
1354check_redundant(struct lock_list *root, struct lock_class *target,
1355 struct lock_list **target_entry)
1356{
1357 int result;
1358
1359 debug_atomic_inc(nr_redundant_checks);
1360
1361 result = __bfs_forwards(root, target, class_equal, target_entry);
1362
1363 return result;
1364}
1365
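check_redundant() asks whether <prev> already reaches <next> in the dependency graph; if it does, the new edge carries no extra information and is skipped, keeping the graph (and MAX_LOCKDEP_ENTRIES consumption) smaller. The same idea as a stand-alone breadth-first reachability test on a tiny adjacency matrix — purely illustrative, none of this is lockdep code:

#include <stdbool.h>
#include <stdio.h>

#define NCLASS 4

/* edge[a][b] == true means "a was held when b was acquired" (a -> b) */
static bool edge[NCLASS][NCLASS];

/* BFS: is 'dst' already reachable from 'src'? */
static bool reachable(int src, int dst)
{
	int queue[NCLASS], head = 0, tail = 0, i;
	bool seen[NCLASS] = { false };

	queue[tail++] = src;
	seen[src] = true;
	while (head < tail) {
		int cur = queue[head++];

		if (cur == dst)
			return true;
		for (i = 0; i < NCLASS; i++) {
			if (edge[cur][i] && !seen[i]) {
				seen[i] = true;
				queue[tail++] = i;
			}
		}
	}
	return false;
}

static void add_dep(int prev, int next)
{
	if (reachable(prev, next)) {		/* redundant link */
		printf("%d -> %d: redundant, skipped\n", prev, next);
		return;
	}
	edge[prev][next] = true;
	printf("%d -> %d: added\n", prev, next);
}

int main(void)
{
	add_dep(0, 1);	/* A -> B */
	add_dep(1, 2);	/* B -> C */
	add_dep(0, 2);	/* A -> C: already implied by A -> B -> C */
	return 0;
}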
1312#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) 1366#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
1313/* 1367/*
1314 * Forwards and backwards subgraph searching, for the purposes of 1368 * Forwards and backwards subgraph searching, for the purposes of
@@ -1784,6 +1838,9 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
1784 if (nest) 1838 if (nest)
1785 return 2; 1839 return 2;
1786 1840
1841 if (cross_lock(prev->instance))
1842 continue;
1843
1787 return print_deadlock_bug(curr, prev, next); 1844 return print_deadlock_bug(curr, prev, next);
1788 } 1845 }
1789 return 1; 1846 return 1;
@@ -1813,20 +1870,13 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
1813 */ 1870 */
1814static int 1871static int
1815check_prev_add(struct task_struct *curr, struct held_lock *prev, 1872check_prev_add(struct task_struct *curr, struct held_lock *prev,
1816 struct held_lock *next, int distance, int *stack_saved) 1873 struct held_lock *next, int distance, struct stack_trace *trace,
1874 int (*save)(struct stack_trace *trace))
1817{ 1875{
1818 struct lock_list *entry; 1876 struct lock_list *entry;
1819 int ret; 1877 int ret;
1820 struct lock_list this; 1878 struct lock_list this;
1821 struct lock_list *uninitialized_var(target_entry); 1879 struct lock_list *uninitialized_var(target_entry);
1822 /*
1823 * Static variable, serialized by the graph_lock().
1824 *
1825 * We use this static variable to save the stack trace in case
1826 * we call into this function multiple times due to encountering
1827 * trylocks in the held lock stack.
1828 */
1829 static struct stack_trace trace;
1830 1880
1831 /* 1881 /*
1832 * Prove that the new <prev> -> <next> dependency would not 1882 * Prove that the new <prev> -> <next> dependency would not
@@ -1841,7 +1891,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
1841 this.parent = NULL; 1891 this.parent = NULL;
1842 ret = check_noncircular(&this, hlock_class(prev), &target_entry); 1892 ret = check_noncircular(&this, hlock_class(prev), &target_entry);
1843 if (unlikely(!ret)) 1893 if (unlikely(!ret))
1844 return print_circular_bug(&this, target_entry, next, prev); 1894 return print_circular_bug(&this, target_entry, next, prev, trace);
1845 else if (unlikely(ret < 0)) 1895 else if (unlikely(ret < 0))
1846 return print_bfs_bug(ret); 1896 return print_bfs_bug(ret);
1847 1897
@@ -1870,15 +1920,26 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
1870 if (entry->class == hlock_class(next)) { 1920 if (entry->class == hlock_class(next)) {
1871 if (distance == 1) 1921 if (distance == 1)
1872 entry->distance = 1; 1922 entry->distance = 1;
1873 return 2; 1923 return 1;
1874 } 1924 }
1875 } 1925 }
1876 1926
1877 if (!*stack_saved) { 1927 /*
1878 if (!save_trace(&trace)) 1928 * Is the <prev> -> <next> link redundant?
1879 return 0; 1929 */
1880 *stack_saved = 1; 1930 this.class = hlock_class(prev);
1931 this.parent = NULL;
1932 ret = check_redundant(&this, hlock_class(next), &target_entry);
1933 if (!ret) {
1934 debug_atomic_inc(nr_redundant);
1935 return 2;
1881 } 1936 }
1937 if (ret < 0)
1938 return print_bfs_bug(ret);
1939
1940
1941 if (save && !save(trace))
1942 return 0;
1882 1943
1883 /* 1944 /*
1884 * Ok, all validations passed, add the new lock 1945 * Ok, all validations passed, add the new lock
@@ -1886,14 +1947,14 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
1886 */ 1947 */
1887 ret = add_lock_to_list(hlock_class(next), 1948 ret = add_lock_to_list(hlock_class(next),
1888 &hlock_class(prev)->locks_after, 1949 &hlock_class(prev)->locks_after,
1889 next->acquire_ip, distance, &trace); 1950 next->acquire_ip, distance, trace);
1890 1951
1891 if (!ret) 1952 if (!ret)
1892 return 0; 1953 return 0;
1893 1954
1894 ret = add_lock_to_list(hlock_class(prev), 1955 ret = add_lock_to_list(hlock_class(prev),
1895 &hlock_class(next)->locks_before, 1956 &hlock_class(next)->locks_before,
1896 next->acquire_ip, distance, &trace); 1957 next->acquire_ip, distance, trace);
1897 if (!ret) 1958 if (!ret)
1898 return 0; 1959 return 0;
1899 1960
@@ -1901,8 +1962,6 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
1901 * Debugging printouts: 1962 * Debugging printouts:
1902 */ 1963 */
1903 if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) { 1964 if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) {
1904 /* We drop graph lock, so another thread can overwrite trace. */
1905 *stack_saved = 0;
1906 graph_unlock(); 1965 graph_unlock();
1907 printk("\n new dependency: "); 1966 printk("\n new dependency: ");
1908 print_lock_name(hlock_class(prev)); 1967 print_lock_name(hlock_class(prev));
@@ -1910,9 +1969,10 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
1910 print_lock_name(hlock_class(next)); 1969 print_lock_name(hlock_class(next));
1911 printk(KERN_CONT "\n"); 1970 printk(KERN_CONT "\n");
1912 dump_stack(); 1971 dump_stack();
1913 return graph_lock(); 1972 if (!graph_lock())
1973 return 0;
1914 } 1974 }
1915 return 1; 1975 return 2;
1916} 1976}
1917 1977
1918/* 1978/*
@@ -1925,8 +1985,9 @@ static int
1925check_prevs_add(struct task_struct *curr, struct held_lock *next) 1985check_prevs_add(struct task_struct *curr, struct held_lock *next)
1926{ 1986{
1927 int depth = curr->lockdep_depth; 1987 int depth = curr->lockdep_depth;
1928 int stack_saved = 0;
1929 struct held_lock *hlock; 1988 struct held_lock *hlock;
1989 struct stack_trace trace;
1990 int (*save)(struct stack_trace *trace) = save_trace;
1930 1991
1931 /* 1992 /*
1932 * Debugging checks. 1993 * Debugging checks.
@@ -1947,21 +2008,36 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
1947 int distance = curr->lockdep_depth - depth + 1; 2008 int distance = curr->lockdep_depth - depth + 1;
1948 hlock = curr->held_locks + depth - 1; 2009 hlock = curr->held_locks + depth - 1;
1949 /* 2010 /*
1950 * Only non-recursive-read entries get new dependencies 2011 * Only non-crosslock entries get new dependencies added.
1951 * added: 2012 * Crosslock entries will be added by commit later:
1952 */ 2013 */
1953 if (hlock->read != 2 && hlock->check) { 2014 if (!cross_lock(hlock->instance)) {
1954 if (!check_prev_add(curr, hlock, next,
1955 distance, &stack_saved))
1956 return 0;
1957 /* 2015 /*
1958 * Stop after the first non-trylock entry, 2016 * Only non-recursive-read entries get new dependencies
1959 * as non-trylock entries have added their 2017 * added:
1960 * own direct dependencies already, so this
1961 * lock is connected to them indirectly:
1962 */ 2018 */
1963 if (!hlock->trylock) 2019 if (hlock->read != 2 && hlock->check) {
1964 break; 2020 int ret = check_prev_add(curr, hlock, next,
2021 distance, &trace, save);
2022 if (!ret)
2023 return 0;
2024
2025 /*
2026 * Stop saving stack_trace if save_trace() was
2027 * called at least once:
2028 */
2029 if (save && ret == 2)
2030 save = NULL;
2031
2032 /*
2033 * Stop after the first non-trylock entry,
2034 * as non-trylock entries have added their
2035 * own direct dependencies already, so this
2036 * lock is connected to them indirectly:
2037 */
2038 if (!hlock->trylock)
2039 break;
2040 }
1965 } 2041 }
1966 depth--; 2042 depth--;
1967 /* 2043 /*
@@ -2126,19 +2202,26 @@ static int check_no_collision(struct task_struct *curr,
2126} 2202}
2127 2203
2128/* 2204/*
2129 * Look up a dependency chain. If the key is not present yet then 2205 * This is for building a chain between just two different classes,
2130 * add it and return 1 - in this case the new dependency chain is 2206 * instead of adding a new hlock upon current, which is done by
2131 * validated. If the key is already hashed, return 0. 2207 * add_chain_cache().
2132 * (On return with 1 graph_lock is held.) 2208 *
2209 * This can be called in any context with two classes, while
2210 * add_chain_cache() must be done within the lock owner's context
2211 * since it uses hlock which might be racy in another context.
2133 */ 2212 */
2134static inline int lookup_chain_cache(struct task_struct *curr, 2213static inline int add_chain_cache_classes(unsigned int prev,
2135 struct held_lock *hlock, 2214 unsigned int next,
2136 u64 chain_key) 2215 unsigned int irq_context,
2216 u64 chain_key)
2137{ 2217{
2138 struct lock_class *class = hlock_class(hlock);
2139 struct hlist_head *hash_head = chainhashentry(chain_key); 2218 struct hlist_head *hash_head = chainhashentry(chain_key);
2140 struct lock_chain *chain; 2219 struct lock_chain *chain;
2141 int i, j; 2220
2221 /*
2222 * Allocate a new chain entry from the static array, and add
2223 * it to the hash:
2224 */
2142 2225
2143 /* 2226 /*
2144 * We might need to take the graph lock, ensure we've got IRQs 2227 * We might need to take the graph lock, ensure we've got IRQs
@@ -2147,43 +2230,76 @@ static inline int lookup_chain_cache(struct task_struct *curr,
2147 */ 2230 */
2148 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2231 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2149 return 0; 2232 return 0;
2233
2234 if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) {
2235 if (!debug_locks_off_graph_unlock())
2236 return 0;
2237
2238 print_lockdep_off("BUG: MAX_LOCKDEP_CHAINS too low!");
2239 dump_stack();
2240 return 0;
2241 }
2242
2243 chain = lock_chains + nr_lock_chains++;
2244 chain->chain_key = chain_key;
2245 chain->irq_context = irq_context;
2246 chain->depth = 2;
2247 if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
2248 chain->base = nr_chain_hlocks;
2249 nr_chain_hlocks += chain->depth;
2250 chain_hlocks[chain->base] = prev - 1;
2251 chain_hlocks[chain->base + 1] = next - 1;
2252 }
2253#ifdef CONFIG_DEBUG_LOCKDEP
2150 /* 2254 /*
2151 * We can walk it lock-free, because entries only get added 2255 * Important for check_no_collision().
2152 * to the hash:
2153 */ 2256 */
2154 hlist_for_each_entry_rcu(chain, hash_head, entry) { 2257 else {
2155 if (chain->chain_key == chain_key) { 2258 if (!debug_locks_off_graph_unlock())
2156cache_hit:
2157 debug_atomic_inc(chain_lookup_hits);
2158 if (!check_no_collision(curr, hlock, chain))
2159 return 0;
2160
2161 if (very_verbose(class))
2162 printk("\nhash chain already cached, key: "
2163 "%016Lx tail class: [%p] %s\n",
2164 (unsigned long long)chain_key,
2165 class->key, class->name);
2166 return 0; 2259 return 0;
2167 } 2260
2261 print_lockdep_off("BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!");
2262 dump_stack();
2263 return 0;
2168 } 2264 }
2169 if (very_verbose(class)) 2265#endif
2170 printk("\nnew hash chain, key: %016Lx tail class: [%p] %s\n", 2266
2171 (unsigned long long)chain_key, class->key, class->name); 2267 hlist_add_head_rcu(&chain->entry, hash_head);
2268 debug_atomic_inc(chain_lookup_misses);
2269 inc_chains();
2270
2271 return 1;
2272}
2273
2274/*
2275 * Adds a dependency chain into chain hashtable. And must be called with
2276 * graph_lock held.
2277 *
2278 * Return 0 if fail, and graph_lock is released.
2279 * Return 1 if succeed, with graph_lock held.
2280 */
2281static inline int add_chain_cache(struct task_struct *curr,
2282 struct held_lock *hlock,
2283 u64 chain_key)
2284{
2285 struct lock_class *class = hlock_class(hlock);
2286 struct hlist_head *hash_head = chainhashentry(chain_key);
2287 struct lock_chain *chain;
2288 int i, j;
2289
2172 /* 2290 /*
2173 * Allocate a new chain entry from the static array, and add 2291 * Allocate a new chain entry from the static array, and add
2174 * it to the hash: 2292 * it to the hash:
2175 */ 2293 */
2176 if (!graph_lock()) 2294
2177 return 0;
2178 /* 2295 /*
2179 * We have to walk the chain again locked - to avoid duplicates: 2296 * We might need to take the graph lock, ensure we've got IRQs
2297 * disabled to make this an IRQ-safe lock.. for recursion reasons
2298 * lockdep won't complain about its own locking errors.
2180 */ 2299 */
2181 hlist_for_each_entry(chain, hash_head, entry) { 2300 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2182 if (chain->chain_key == chain_key) { 2301 return 0;
2183 graph_unlock(); 2302
2184 goto cache_hit;
2185 }
2186 }
2187 if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) { 2303 if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) {
2188 if (!debug_locks_off_graph_unlock()) 2304 if (!debug_locks_off_graph_unlock())
2189 return 0; 2305 return 0;
@@ -2235,6 +2351,78 @@ cache_hit:
2235 return 1; 2351 return 1;
2236} 2352}
2237 2353
2354/*
2355 * Look up a dependency chain.
2356 */
2357static inline struct lock_chain *lookup_chain_cache(u64 chain_key)
2358{
2359 struct hlist_head *hash_head = chainhashentry(chain_key);
2360 struct lock_chain *chain;
2361
2362 /*
2363 * We can walk it lock-free, because entries only get added
2364 * to the hash:
2365 */
2366 hlist_for_each_entry_rcu(chain, hash_head, entry) {
2367 if (chain->chain_key == chain_key) {
2368 debug_atomic_inc(chain_lookup_hits);
2369 return chain;
2370 }
2371 }
2372 return NULL;
2373}
2374
2375/*
2376 * If the key is not present yet in dependency chain cache then
2377 * add it and return 1 - in this case the new dependency chain is
2378 * validated. If the key is already hashed, return 0.
2379 * (On return with 1 graph_lock is held.)
2380 */
2381static inline int lookup_chain_cache_add(struct task_struct *curr,
2382 struct held_lock *hlock,
2383 u64 chain_key)
2384{
2385 struct lock_class *class = hlock_class(hlock);
2386 struct lock_chain *chain = lookup_chain_cache(chain_key);
2387
2388 if (chain) {
2389cache_hit:
2390 if (!check_no_collision(curr, hlock, chain))
2391 return 0;
2392
2393 if (very_verbose(class)) {
2394 printk("\nhash chain already cached, key: "
2395 "%016Lx tail class: [%p] %s\n",
2396 (unsigned long long)chain_key,
2397 class->key, class->name);
2398 }
2399
2400 return 0;
2401 }
2402
2403 if (very_verbose(class)) {
2404 printk("\nnew hash chain, key: %016Lx tail class: [%p] %s\n",
2405 (unsigned long long)chain_key, class->key, class->name);
2406 }
2407
2408 if (!graph_lock())
2409 return 0;
2410
2411 /*
2412 * We have to walk the chain again locked - to avoid duplicates:
2413 */
2414 chain = lookup_chain_cache(chain_key);
2415 if (chain) {
2416 graph_unlock();
2417 goto cache_hit;
2418 }
2419
2420 if (!add_chain_cache(curr, hlock, chain_key))
2421 return 0;
2422
2423 return 1;
2424}
2425
2238static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, 2426static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
2239 struct held_lock *hlock, int chain_head, u64 chain_key) 2427 struct held_lock *hlock, int chain_head, u64 chain_key)
2240{ 2428{
@@ -2245,11 +2433,11 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
2245 * 2433 *
2246 * We look up the chain_key and do the O(N^2) check and update of 2434 * We look up the chain_key and do the O(N^2) check and update of
2247 * the dependencies only if this is a new dependency chain. 2435 * the dependencies only if this is a new dependency chain.
2248 * (If lookup_chain_cache() returns with 1 it acquires 2436 * (If lookup_chain_cache_add() returns with 1 it acquires
2249 * graph_lock for us) 2437 * graph_lock for us)
2250 */ 2438 */
2251 if (!hlock->trylock && hlock->check && 2439 if (!hlock->trylock && hlock->check &&
2252 lookup_chain_cache(curr, hlock, chain_key)) { 2440 lookup_chain_cache_add(curr, hlock, chain_key)) {
2253 /* 2441 /*
2254 * Check whether last held lock: 2442 * Check whether last held lock:
2255 * 2443 *
@@ -2277,14 +2465,17 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
2277 * Add dependency only if this lock is not the head 2465 * Add dependency only if this lock is not the head
2278 * of the chain, and if it's not a secondary read-lock: 2466 * of the chain, and if it's not a secondary read-lock:
2279 */ 2467 */
2280 if (!chain_head && ret != 2) 2468 if (!chain_head && ret != 2) {
2281 if (!check_prevs_add(curr, hlock)) 2469 if (!check_prevs_add(curr, hlock))
2282 return 0; 2470 return 0;
2471 }
2472
2283 graph_unlock(); 2473 graph_unlock();
2284 } else 2474 } else {
2285 /* after lookup_chain_cache(): */ 2475 /* after lookup_chain_cache_add(): */
2286 if (unlikely(!debug_locks)) 2476 if (unlikely(!debug_locks))
2287 return 0; 2477 return 0;
2478 }
2288 2479
2289 return 1; 2480 return 1;
2290} 2481}
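lookup_chain_cache_add() is the classic lock-free-lookup / lock-and-recheck pattern: probe the hash without a lock (entries are only ever added, via RCU-safe hlist insertion), and only on a miss take graph_lock and probe again before inserting, so two racing CPUs cannot add the same chain twice. A small stand-alone sketch of the same control flow with a mutex and a single hash bucket; names are illustrative and, unlike the kernel version, the sketch drops the detail that success returns with the graph lock still held:

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdlib.h>

struct chain {
	unsigned long long key;
	struct chain *next;
};

static _Atomic(struct chain *) hash_head;	/* single bucket for brevity */
static pthread_mutex_t graph_lock = PTHREAD_MUTEX_INITIALIZER;

/* Lock-free probe: entries are only ever added at the head. */
static struct chain *lookup_chain(unsigned long long key)
{
	struct chain *c = atomic_load_explicit(&hash_head, memory_order_acquire);

	for (; c; c = c->next)
		if (c->key == key)
			return c;
	return NULL;
}

/* Returns true if this caller inserted a new chain. */
static bool lookup_chain_add(unsigned long long key)
{
	struct chain *c;

	if (lookup_chain(key))
		return false;			/* cache hit, fast path */

	pthread_mutex_lock(&graph_lock);
	if (lookup_chain(key)) {		/* re-check under the lock */
		pthread_mutex_unlock(&graph_lock);
		return false;
	}
	c = malloc(sizeof(*c));
	if (!c) {
		pthread_mutex_unlock(&graph_lock);
		return false;
	}
	c->key = key;
	c->next = atomic_load_explicit(&hash_head, memory_order_relaxed);
	/* Publish: pairs with the acquire load in lookup_chain(). */
	atomic_store_explicit(&hash_head, c, memory_order_release);
	pthread_mutex_unlock(&graph_lock);
	return true;
}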
@@ -2567,14 +2758,6 @@ static int SOFTIRQ_verbose(struct lock_class *class)
2567 return 0; 2758 return 0;
2568} 2759}
2569 2760
2570static int RECLAIM_FS_verbose(struct lock_class *class)
2571{
2572#if RECLAIM_VERBOSE
2573 return class_filter(class);
2574#endif
2575 return 0;
2576}
2577
2578#define STRICT_READ_CHECKS 1 2761#define STRICT_READ_CHECKS 1
2579 2762
2580static int (*state_verbose_f[])(struct lock_class *class) = { 2763static int (*state_verbose_f[])(struct lock_class *class) = {
@@ -2870,57 +3053,6 @@ void trace_softirqs_off(unsigned long ip)
2870 debug_atomic_inc(redundant_softirqs_off); 3053 debug_atomic_inc(redundant_softirqs_off);
2871} 3054}
2872 3055
2873static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
2874{
2875 struct task_struct *curr = current;
2876
2877 if (unlikely(!debug_locks))
2878 return;
2879
2880 gfp_mask = current_gfp_context(gfp_mask);
2881
2882 /* no reclaim without waiting on it */
2883 if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
2884 return;
2885
2886 /* this guy won't enter reclaim */
2887 if ((curr->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC))
2888 return;
2889
2890 /* We're only interested __GFP_FS allocations for now */
2891 if (!(gfp_mask & __GFP_FS) || (curr->flags & PF_MEMALLOC_NOFS))
2892 return;
2893
2894 /*
2895 * Oi! Can't be having __GFP_FS allocations with IRQs disabled.
2896 */
2897 if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags)))
2898 return;
2899
2900 /* Disable lockdep if explicitly requested */
2901 if (gfp_mask & __GFP_NOLOCKDEP)
2902 return;
2903
2904 mark_held_locks(curr, RECLAIM_FS);
2905}
2906
2907static void check_flags(unsigned long flags);
2908
2909void lockdep_trace_alloc(gfp_t gfp_mask)
2910{
2911 unsigned long flags;
2912
2913 if (unlikely(current->lockdep_recursion))
2914 return;
2915
2916 raw_local_irq_save(flags);
2917 check_flags(flags);
2918 current->lockdep_recursion = 1;
2919 __lockdep_trace_alloc(gfp_mask, flags);
2920 current->lockdep_recursion = 0;
2921 raw_local_irq_restore(flags);
2922}
2923
2924static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock) 3056static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
2925{ 3057{
2926 /* 3058 /*
@@ -2966,22 +3098,6 @@ static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
2966 } 3098 }
2967 } 3099 }
2968 3100
2969 /*
2970 * We reuse the irq context infrastructure more broadly as a general
2971 * context checking code. This tests GFP_FS recursion (a lock taken
2972 * during reclaim for a GFP_FS allocation is held over a GFP_FS
2973 * allocation).
2974 */
2975 if (!hlock->trylock && (curr->lockdep_reclaim_gfp & __GFP_FS)) {
2976 if (hlock->read) {
2977 if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS_READ))
2978 return 0;
2979 } else {
2980 if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS))
2981 return 0;
2982 }
2983 }
2984
2985 return 1; 3101 return 1;
2986} 3102}
2987 3103
@@ -3040,10 +3156,6 @@ static inline int separate_irq_context(struct task_struct *curr,
3040 return 0; 3156 return 0;
3041} 3157}
3042 3158
3043void lockdep_trace_alloc(gfp_t gfp_mask)
3044{
3045}
3046
3047#endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */ 3159#endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
3048 3160
3049/* 3161/*
@@ -3116,7 +3228,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
3116/* 3228/*
3117 * Initialize a lock instance's lock-class mapping info: 3229 * Initialize a lock instance's lock-class mapping info:
3118 */ 3230 */
3119void lockdep_init_map(struct lockdep_map *lock, const char *name, 3231static void __lockdep_init_map(struct lockdep_map *lock, const char *name,
3120 struct lock_class_key *key, int subclass) 3232 struct lock_class_key *key, int subclass)
3121{ 3233{
3122 int i; 3234 int i;
@@ -3174,8 +3286,25 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
3174 raw_local_irq_restore(flags); 3286 raw_local_irq_restore(flags);
3175 } 3287 }
3176} 3288}
3289
3290void lockdep_init_map(struct lockdep_map *lock, const char *name,
3291 struct lock_class_key *key, int subclass)
3292{
3293 cross_init(lock, 0);
3294 __lockdep_init_map(lock, name, key, subclass);
3295}
3177EXPORT_SYMBOL_GPL(lockdep_init_map); 3296EXPORT_SYMBOL_GPL(lockdep_init_map);
3178 3297
3298#ifdef CONFIG_LOCKDEP_CROSSRELEASE
3299void lockdep_init_map_crosslock(struct lockdep_map *lock, const char *name,
3300 struct lock_class_key *key, int subclass)
3301{
3302 cross_init(lock, 1);
3303 __lockdep_init_map(lock, name, key, subclass);
3304}
3305EXPORT_SYMBOL_GPL(lockdep_init_map_crosslock);
3306#endif
3307
3179struct lock_class_key __lockdep_no_validate__; 3308struct lock_class_key __lockdep_no_validate__;
3180EXPORT_SYMBOL_GPL(__lockdep_no_validate__); 3309EXPORT_SYMBOL_GPL(__lockdep_no_validate__);
3181 3310
@@ -3231,6 +3360,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3231 int chain_head = 0; 3360 int chain_head = 0;
3232 int class_idx; 3361 int class_idx;
3233 u64 chain_key; 3362 u64 chain_key;
3363 int ret;
3234 3364
3235 if (unlikely(!debug_locks)) 3365 if (unlikely(!debug_locks))
3236 return 0; 3366 return 0;
@@ -3279,7 +3409,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3279 3409
3280 class_idx = class - lock_classes + 1; 3410 class_idx = class - lock_classes + 1;
3281 3411
3282 if (depth) { 3412 /* TODO: nest_lock is not implemented for crosslock yet. */
3413 if (depth && !cross_lock(lock)) {
3283 hlock = curr->held_locks + depth - 1; 3414 hlock = curr->held_locks + depth - 1;
3284 if (hlock->class_idx == class_idx && nest_lock) { 3415 if (hlock->class_idx == class_idx && nest_lock) {
3285 if (hlock->references) { 3416 if (hlock->references) {
@@ -3367,6 +3498,14 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3367 if (!validate_chain(curr, lock, hlock, chain_head, chain_key)) 3498 if (!validate_chain(curr, lock, hlock, chain_head, chain_key))
3368 return 0; 3499 return 0;
3369 3500
3501 ret = lock_acquire_crosslock(hlock);
3502 /*
3503 * 2 means normal acquire operations are needed. Otherwise, it's
3504 * ok just to return with '0:fail, 1:success'.
3505 */
3506 if (ret != 2)
3507 return ret;
3508
3370 curr->curr_chain_key = chain_key; 3509 curr->curr_chain_key = chain_key;
3371 curr->lockdep_depth++; 3510 curr->lockdep_depth++;
3372 check_chain_key(curr); 3511 check_chain_key(curr);
@@ -3604,11 +3743,19 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
3604 struct task_struct *curr = current; 3743 struct task_struct *curr = current;
3605 struct held_lock *hlock; 3744 struct held_lock *hlock;
3606 unsigned int depth; 3745 unsigned int depth;
3607 int i; 3746 int ret, i;
3608 3747
3609 if (unlikely(!debug_locks)) 3748 if (unlikely(!debug_locks))
3610 return 0; 3749 return 0;
3611 3750
3751 ret = lock_release_crosslock(lock);
3752 /*
3753 * 2 means normal release operations are needed. Otherwise, it's
3754 * ok just to return with '0:fail, 1:success'.
3755 */
3756 if (ret != 2)
3757 return ret;
3758
3612 depth = curr->lockdep_depth; 3759 depth = curr->lockdep_depth;
3613 /* 3760 /*
3614 * So we're all set to release this lock.. wait what lock? We don't 3761 * So we're all set to release this lock.. wait what lock? We don't
@@ -3952,18 +4099,6 @@ void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie)
3952} 4099}
3953EXPORT_SYMBOL_GPL(lock_unpin_lock); 4100EXPORT_SYMBOL_GPL(lock_unpin_lock);
3954 4101
3955void lockdep_set_current_reclaim_state(gfp_t gfp_mask)
3956{
3957 current->lockdep_reclaim_gfp = current_gfp_context(gfp_mask);
3958}
3959EXPORT_SYMBOL_GPL(lockdep_set_current_reclaim_state);
3960
3961void lockdep_clear_current_reclaim_state(void)
3962{
3963 current->lockdep_reclaim_gfp = 0;
3964}
3965EXPORT_SYMBOL_GPL(lockdep_clear_current_reclaim_state);
3966
3967#ifdef CONFIG_LOCK_STAT 4102#ifdef CONFIG_LOCK_STAT
3968static int 4103static int
3969print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, 4104print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
@@ -4484,6 +4619,17 @@ asmlinkage __visible void lockdep_sys_exit(void)
4484 curr->comm, curr->pid); 4619 curr->comm, curr->pid);
4485 lockdep_print_held_locks(curr); 4620 lockdep_print_held_locks(curr);
4486 } 4621 }
4622
4623 /*
4624 * The lock history for each syscall should be independent. So wipe the
4625 * slate clean on return to userspace.
4626 *
4627 * crossrelease_hist_end() works well here even when getting here
4628 * without starting (i.e. just after forking), because it rolls back
4629 * the index to point to the last entry, which is already invalid.
4630 */
4631 crossrelease_hist_end(XHLOCK_PROC);
4632 crossrelease_hist_start(XHLOCK_PROC);
4487} 4633}
4488 4634
4489void lockdep_rcu_suspicious(const char *file, const int line, const char *s) 4635void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
@@ -4532,3 +4678,470 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
4532 dump_stack(); 4678 dump_stack();
4533} 4679}
4534EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious); 4680EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious);
4681
4682#ifdef CONFIG_LOCKDEP_CROSSRELEASE
4683
4684/*
4685 * Crossrelease works by recording a lock history for each thread and
4686 * connecting those historic locks that were taken after the
4687 * wait_for_completion() in the complete() context.
4688 *
4689 * Task-A Task-B
4690 *
4691 * mutex_lock(&A);
4692 * mutex_unlock(&A);
4693 *
4694 * wait_for_completion(&C);
4695 * lock_acquire_crosslock();
4696 * atomic_inc_return(&cross_gen_id);
4697 * |
4698 * | mutex_lock(&B);
4699 * | mutex_unlock(&B);
4700 * |
4701 * | complete(&C);
4702 * `-- lock_commit_crosslock();
4703 *
4704 * Which will then add a dependency between B and C.
4705 */
4706
4707#define xhlock(i) (current->xhlocks[(i) % MAX_XHLOCKS_NR])
4708
4709/*
4710 * Whenever a crosslock is held, cross_gen_id will be increased.
4711 */
4712static atomic_t cross_gen_id; /* Can be wrapped */
4713
4714/*
4715 * Make an entry of the ring buffer invalid.
4716 */
4717static inline void invalidate_xhlock(struct hist_lock *xhlock)
4718{
4719 /*
4720 * Normally, xhlock->hlock.instance must be !NULL.
4721 */
4722 xhlock->hlock.instance = NULL;
4723}
4724
4725/*
4726 * Lock history stacks; we have 3 nested lock history stacks:
4727 *
4728 * Hard IRQ
4729 * Soft IRQ
4730 * History / Task
4731 *
4732 * The thing is that once we complete a (Hard/Soft) IRQ the future task locks
4733 * should not depend on any of the locks observed while running the IRQ.
4734 *
4735 * So what we do is rewind the history buffer and erase all our knowledge of
4736 * that temporal event.
4737 */
4738
4739/*
4740 * We need this to annotate lock history boundaries. Take for instance
4741 * workqueues; each work is independent of the last. The completion of a future
4742 * work does not depend on the completion of a past work (in general).
4743 * Therefore we must not carry that (lock) dependency across works.
4744 *
4745 * This is true for many things; pretty much all kthreads fall into this
4746 * pattern, where they have an 'idle' state and future completions do not
4747 * depend on past completions. It's just that since they all have the 'same'
4748 * form -- the kthread does the same over and over -- it doesn't typically
4749 * matter.
4750 *
4751 * The same is true for system calls: once a system call is completed (we've
4752 * returned to userspace) the next system call does not depend on the lock
4753 * history of the previous system call.
4754 */
4755void crossrelease_hist_start(enum xhlock_context_t c)
4756{
4757 struct task_struct *cur = current;
4758
4759 if (cur->xhlocks) {
4760 cur->xhlock_idx_hist[c] = cur->xhlock_idx;
4761 cur->hist_id_save[c] = cur->hist_id;
4762 }
4763}
4764
4765void crossrelease_hist_end(enum xhlock_context_t c)
4766{
4767 struct task_struct *cur = current;
4768
4769 if (cur->xhlocks) {
4770 unsigned int idx = cur->xhlock_idx_hist[c];
4771 struct hist_lock *h = &xhlock(idx);
4772
4773 cur->xhlock_idx = idx;
4774
4775 /* Check if the ring was overwritten. */
4776 if (h->hist_id != cur->hist_id_save[c])
4777 invalidate_xhlock(h);
4778 }
4779}
4780
4781static int cross_lock(struct lockdep_map *lock)
4782{
4783 return lock ? lock->cross : 0;
4784}
4785
4786/*
4787 * This is needed to decide the relationship between wrappable variables.
4788 */
4789static inline int before(unsigned int a, unsigned int b)
4790{
4791 return (int)(a - b) < 0;
4792}
4793
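before() compares generation numbers that may wrap around UINT_MAX, so a plain `a < b` would give the wrong answer near the wrap point; the signed-difference trick stays correct as long as the two values are less than 2^31 apart (the same idiom the TCP sequence-number helpers use). A tiny stand-alone check, for illustration only:

#include <assert.h>
#include <limits.h>

/*
 * Same idiom as before(): valid while the two counters are < 2^31 apart.
 * The cast relies on the usual two's-complement conversion, as the kernel does.
 */
static inline int before(unsigned int a, unsigned int b)
{
	return (int)(a - b) < 0;
}

int main(void)
{
	assert(before(1, 2));				/* ordinary case */
	assert(!before(2, 1));
	assert(before(UINT_MAX, 0));			/* 0 is "after" the wrap */
	assert(!before(0, UINT_MAX));
	assert(before(UINT_MAX - 5, UINT_MAX + 3));	/* wraps to 2, still later */
	return 0;
}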
4794static inline struct lock_class *xhlock_class(struct hist_lock *xhlock)
4795{
4796 return hlock_class(&xhlock->hlock);
4797}
4798
4799static inline struct lock_class *xlock_class(struct cross_lock *xlock)
4800{
4801 return hlock_class(&xlock->hlock);
4802}
4803
4804/*
4805 * Should we check a dependency with previous one?
4806 */
4807static inline int depend_before(struct held_lock *hlock)
4808{
4809 return hlock->read != 2 && hlock->check && !hlock->trylock;
4810}
4811
4812/*
4813 * Should we check a dependency with next one?
4814 */
4815static inline int depend_after(struct held_lock *hlock)
4816{
4817 return hlock->read != 2 && hlock->check;
4818}
4819
4820/*
4821 * Check if the xhlock is valid, which would be false if,
4822 *
4823 * 1. Has not been used after initialization yet.
4824 * 2. Got invalidated.
4825 *
4826 * Remember that hist_lock is implemented as a ring buffer.
4827 */
4828static inline int xhlock_valid(struct hist_lock *xhlock)
4829{
4830 /*
4831 * xhlock->hlock.instance must be !NULL.
4832 */
4833 return !!xhlock->hlock.instance;
4834}
4835
4836/*
4837 * Record a hist_lock entry.
4838 *
4839 * Only requires IRQs to be disabled.
4840 */
4841static void add_xhlock(struct held_lock *hlock)
4842{
4843 unsigned int idx = ++current->xhlock_idx;
4844 struct hist_lock *xhlock = &xhlock(idx);
4845
4846#ifdef CONFIG_DEBUG_LOCKDEP
4847 /*
4848 * This can be done locklessly because they are all task-local
4849 * state, we must however ensure IRQs are disabled.
4850 */
4851 WARN_ON_ONCE(!irqs_disabled());
4852#endif
4853
4854 /* Initialize hist_lock's members */
4855 xhlock->hlock = *hlock;
4856 xhlock->hist_id = ++current->hist_id;
4857
4858 xhlock->trace.nr_entries = 0;
4859 xhlock->trace.max_entries = MAX_XHLOCK_TRACE_ENTRIES;
4860 xhlock->trace.entries = xhlock->trace_entries;
4861 xhlock->trace.skip = 3;
4862 save_stack_trace(&xhlock->trace);
4863}
4864
4865static inline int same_context_xhlock(struct hist_lock *xhlock)
4866{
4867 return xhlock->hlock.irq_context == task_irq_context(current);
4868}
4869
4870/*
4871 * This should be lockless as far as possible because this would be
4872 * called very frequently.
4873 */
4874static void check_add_xhlock(struct held_lock *hlock)
4875{
4876 /*
4877 * Record a hist_lock, only in case that acquisitions ahead
4878 * could depend on the held_lock. For example, if the held_lock
4879 * is trylock then acquisitions ahead never depend on that.
4880 * In that case, we don't need to record it. Just return.
4881 */
4882 if (!current->xhlocks || !depend_before(hlock))
4883 return;
4884
4885 add_xhlock(hlock);
4886}
4887
4888/*
4889 * For crosslock.
4890 */
4891static int add_xlock(struct held_lock *hlock)
4892{
4893 struct cross_lock *xlock;
4894 unsigned int gen_id;
4895
4896 if (!graph_lock())
4897 return 0;
4898
4899 xlock = &((struct lockdep_map_cross *)hlock->instance)->xlock;
4900
4901 /*
4902 * When acquisitions for a crosslock are overlapped, we use
4903 * nr_acquire to perform commit for them, based on cross_gen_id
4904 * of the first acquisition, which allows us to add additional
4905 * dependencies.
4906 *
4907 * Moreover, when no acquisition of a crosslock is in progress,
4908 * we should not perform commit because the lock might not exist
4909 * any more, which might cause incorrect memory access. So we
4910 * have to track the number of acquisitions of a crosslock.
4911 *
4912 * depend_after() is necessary to initialize only the first
4913 * valid xlock so that the xlock can be used on its commit.
4914 */
4915 if (xlock->nr_acquire++ && depend_after(&xlock->hlock))
4916 goto unlock;
4917
4918 gen_id = (unsigned int)atomic_inc_return(&cross_gen_id);
4919 xlock->hlock = *hlock;
4920 xlock->hlock.gen_id = gen_id;
4921unlock:
4922 graph_unlock();
4923 return 1;
4924}
4925
4926/*
4927 * Called for both normal and crosslock acquires. Normal locks will be
4928 * pushed on the hist_lock queue. Cross locks will record state and
4929 * stop regular lock_acquire() to avoid being placed on the held_lock
4930 * stack.
4931 *
4932 * Return: 0 - failure;
4933 * 1 - crosslock, done;
4934 * 2 - normal lock, continue to held_lock[] ops.
4935 */
4936static int lock_acquire_crosslock(struct held_lock *hlock)
4937{
4938 /*
4939 * CONTEXT 1 CONTEXT 2
4940 * --------- ---------
4941 * lock A (cross)
4942 * X = atomic_inc_return(&cross_gen_id)
4943 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
4944 * Y = atomic_read_acquire(&cross_gen_id)
4945 * lock B
4946 *
4947 * atomic_read_acquire() is for ordering between A and B,
4948 * IOW, A happens before B, when CONTEXT 2 sees Y >= X.
4949 *
4950 * Pairs with atomic_inc_return() in add_xlock().
4951 */
4952 hlock->gen_id = (unsigned int)atomic_read_acquire(&cross_gen_id);
4953
4954 if (cross_lock(hlock->instance))
4955 return add_xlock(hlock);
4956
4957 check_add_xhlock(hlock);
4958 return 2;
4959}
4960
4961static int copy_trace(struct stack_trace *trace)
4962{
4963 unsigned long *buf = stack_trace + nr_stack_trace_entries;
4964 unsigned int max_nr = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries;
4965 unsigned int nr = min(max_nr, trace->nr_entries);
4966
4967 trace->nr_entries = nr;
4968 memcpy(buf, trace->entries, nr * sizeof(trace->entries[0]));
4969 trace->entries = buf;
4970 nr_stack_trace_entries += nr;
4971
4972 if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) {
4973 if (!debug_locks_off_graph_unlock())
4974 return 0;
4975
4976 print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!");
4977 dump_stack();
4978
4979 return 0;
4980 }
4981
4982 return 1;
4983}
4984
4985static int commit_xhlock(struct cross_lock *xlock, struct hist_lock *xhlock)
4986{
4987 unsigned int xid, pid;
4988 u64 chain_key;
4989
4990 xid = xlock_class(xlock) - lock_classes;
4991 chain_key = iterate_chain_key((u64)0, xid);
4992 pid = xhlock_class(xhlock) - lock_classes;
4993 chain_key = iterate_chain_key(chain_key, pid);
4994
4995 if (lookup_chain_cache(chain_key))
4996 return 1;
4997
4998 if (!add_chain_cache_classes(xid, pid, xhlock->hlock.irq_context,
4999 chain_key))
5000 return 0;
5001
5002 if (!check_prev_add(current, &xlock->hlock, &xhlock->hlock, 1,
5003 &xhlock->trace, copy_trace))
5004 return 0;
5005
5006 return 1;
5007}
5008
5009static void commit_xhlocks(struct cross_lock *xlock)
5010{
5011 unsigned int cur = current->xhlock_idx;
5012 unsigned int prev_hist_id = xhlock(cur).hist_id;
5013 unsigned int i;
5014
5015 if (!graph_lock())
5016 return;
5017
5018 if (xlock->nr_acquire) {
5019 for (i = 0; i < MAX_XHLOCKS_NR; i++) {
5020 struct hist_lock *xhlock = &xhlock(cur - i);
5021
5022 if (!xhlock_valid(xhlock))
5023 break;
5024
5025 if (before(xhlock->hlock.gen_id, xlock->hlock.gen_id))
5026 break;
5027
5028 if (!same_context_xhlock(xhlock))
5029 break;
5030
5031 /*
5032 * Filter out the cases where the ring buffer was
5033 * overwritten and the current entry has a bigger
5034 * hist_id than the previous one, which is impossible
5035 * otherwise:
5036 */
5037 if (unlikely(before(prev_hist_id, xhlock->hist_id)))
5038 break;
5039
5040 prev_hist_id = xhlock->hist_id;
5041
5042 /*
5043 * commit_xhlock() returns 0 with graph_lock already
5044 * released on failure.
5045 */
5046 if (!commit_xhlock(xlock, xhlock))
5047 return;
5048 }
5049 }
5050
5051 graph_unlock();
5052}
5053
5054void lock_commit_crosslock(struct lockdep_map *lock)
5055{
5056 struct cross_lock *xlock;
5057 unsigned long flags;
5058
5059 if (unlikely(!debug_locks || current->lockdep_recursion))
5060 return;
5061
5062 if (!current->xhlocks)
5063 return;
5064
5065 /*
5066 * Do commit hist_locks with the cross_lock, only in case that
5067 * the cross_lock could depend on acquisitions after that.
5068 *
5069 * For example, if the cross_lock does not have the 'check' flag
5070 * then we don't need to check dependencies and commit for that.
5071 * Just skip it. In that case, of course, the cross_lock does
5072 * not depend on acquisitions ahead, either.
5073 *
5074 * WARNING: Don't do that in add_xlock() in advance. When an
5075 * acquisition context is different from the commit context,
5076 * invalid(skipped) cross_lock might be accessed.
5077 */
5078 if (!depend_after(&((struct lockdep_map_cross *)lock)->xlock.hlock))
5079 return;
5080
5081 raw_local_irq_save(flags);
5082 check_flags(flags);
5083 current->lockdep_recursion = 1;
5084 xlock = &((struct lockdep_map_cross *)lock)->xlock;
5085 commit_xhlocks(xlock);
5086 current->lockdep_recursion = 0;
5087 raw_local_irq_restore(flags);
5088}
5089EXPORT_SYMBOL_GPL(lock_commit_crosslock);
5090
5091/*
5092 * Return: 0 - failure;
5093 * 1 - crosslock, done;
5094 * 2 - normal lock, continue to held_lock[] ops.
5095 */
5096static int lock_release_crosslock(struct lockdep_map *lock)
5097{
5098 if (cross_lock(lock)) {
5099 if (!graph_lock())
5100 return 0;
5101 ((struct lockdep_map_cross *)lock)->xlock.nr_acquire--;
5102 graph_unlock();
5103 return 1;
5104 }
5105 return 2;
5106}
5107
5108static void cross_init(struct lockdep_map *lock, int cross)
5109{
5110 if (cross)
5111 ((struct lockdep_map_cross *)lock)->xlock.nr_acquire = 0;
5112
5113 lock->cross = cross;
5114
5115 /*
5116 * Crossrelease assumes that the ring buffer size of xhlocks
5117 * is a power of 2. So enforce it at build time.
5118 */
5119 BUILD_BUG_ON(MAX_XHLOCKS_NR & (MAX_XHLOCKS_NR - 1));
5120}
5121
5122void lockdep_init_task(struct task_struct *task)
5123{
5124 int i;
5125
5126 task->xhlock_idx = UINT_MAX;
5127 task->hist_id = 0;
5128
5129 for (i = 0; i < XHLOCK_CTX_NR; i++) {
5130 task->xhlock_idx_hist[i] = UINT_MAX;
5131 task->hist_id_save[i] = 0;
5132 }
5133
5134 task->xhlocks = kzalloc(sizeof(struct hist_lock) * MAX_XHLOCKS_NR,
5135 GFP_KERNEL);
5136}
5137
5138void lockdep_free_task(struct task_struct *task)
5139{
5140 if (task->xhlocks) {
5141 void *tmp = task->xhlocks;
5142 /* Disable crossrelease for current */
5143 task->xhlocks = NULL;
5144 kfree(tmp);
5145 }
5146}
5147#endif
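For reference, the kind of dependency all of the crossrelease machinery above exists to catch: a completion waited on under a lock, completed by a path that needs the same lock. Before this series lockdep could not connect the two, because wait_for_completion()/complete() never appear as a conventional acquire/release pair in one context. A hedged kernel-style sketch (not runnable stand-alone; the lock, completion and function names are invented) — with CONFIG_LOCKDEP_CROSSRELEASE this now produces a circular-dependency report instead of a silent hang:

#include <linux/completion.h>
#include <linux/mutex.h>

static DEFINE_MUTEX(demo_lock);			/* hypothetical */
static DECLARE_COMPLETION(demo_done);		/* hypothetical */

/* Context A: holds demo_lock across the wait, so demo_lock -> demo_done. */
static void demo_wait_side(void)
{
	mutex_lock(&demo_lock);
	wait_for_completion(&demo_done);	/* crosslock acquire */
	mutex_unlock(&demo_lock);
}

/* Context B: takes demo_lock before it can complete, so demo_done -> demo_lock. */
static void demo_complete_side(void)
{
	mutex_lock(&demo_lock);			/* blocks behind context A */
	/* ... */
	mutex_unlock(&demo_lock);
	complete(&demo_done);			/* commit: replays B's lock history */
}

The two recorded dependencies form a cycle, which is exactly the "Possible unsafe locking scenario by crosslock" report added earlier in this patch.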
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h
index c08fbd2f5ba9..1da4669d57a7 100644
--- a/kernel/locking/lockdep_internals.h
+++ b/kernel/locking/lockdep_internals.h
@@ -143,6 +143,8 @@ struct lockdep_stats {
143 int redundant_softirqs_on; 143 int redundant_softirqs_on;
144 int redundant_softirqs_off; 144 int redundant_softirqs_off;
145 int nr_unused_locks; 145 int nr_unused_locks;
146 int nr_redundant_checks;
147 int nr_redundant;
146 int nr_cyclic_checks; 148 int nr_cyclic_checks;
147 int nr_cyclic_check_recursions; 149 int nr_cyclic_check_recursions;
148 int nr_find_usage_forwards_checks; 150 int nr_find_usage_forwards_checks;
diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index 6d1fcc786081..68d9e267ccd4 100644
--- a/kernel/locking/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -201,6 +201,10 @@ static void lockdep_stats_debug_show(struct seq_file *m)
201 debug_atomic_read(chain_lookup_hits)); 201 debug_atomic_read(chain_lookup_hits));
202 seq_printf(m, " cyclic checks: %11llu\n", 202 seq_printf(m, " cyclic checks: %11llu\n",
203 debug_atomic_read(nr_cyclic_checks)); 203 debug_atomic_read(nr_cyclic_checks));
204 seq_printf(m, " redundant checks: %11llu\n",
205 debug_atomic_read(nr_redundant_checks));
206 seq_printf(m, " redundant links: %11llu\n",
207 debug_atomic_read(nr_redundant));
204 seq_printf(m, " find-mask forwards checks: %11llu\n", 208 seq_printf(m, " find-mask forwards checks: %11llu\n",
205 debug_atomic_read(nr_find_usage_forwards_checks)); 209 debug_atomic_read(nr_find_usage_forwards_checks));
206 seq_printf(m, " find-mask backwards checks: %11llu\n", 210 seq_printf(m, " find-mask backwards checks: %11llu\n",
diff --git a/kernel/locking/lockdep_states.h b/kernel/locking/lockdep_states.h
index 995b0cc2b84c..35ca09f2ed0b 100644
--- a/kernel/locking/lockdep_states.h
+++ b/kernel/locking/lockdep_states.h
@@ -6,4 +6,3 @@
6 */ 6 */
7LOCKDEP_STATE(HARDIRQ) 7LOCKDEP_STATE(HARDIRQ)
8LOCKDEP_STATE(SOFTIRQ) 8LOCKDEP_STATE(SOFTIRQ)
9LOCKDEP_STATE(RECLAIM_FS)
diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
index a3167941093b..a74ee6abd039 100644
--- a/kernel/locking/osq_lock.c
+++ b/kernel/locking/osq_lock.c
@@ -109,6 +109,19 @@ bool osq_lock(struct optimistic_spin_queue *lock)
109 109
110 prev = decode_cpu(old); 110 prev = decode_cpu(old);
111 node->prev = prev; 111 node->prev = prev;
112
113 /*
114 * osq_lock() unqueue
115 *
116 * node->prev = prev osq_wait_next()
117 * WMB MB
118 * prev->next = node next->prev = prev // unqueue-C
119 *
120 * Here 'node->prev' and 'next->prev' are the same variable and we need
121 * to ensure these stores happen in-order to avoid corrupting the list.
122 */
123 smp_wmb();
124
112 WRITE_ONCE(prev->next, node); 125 WRITE_ONCE(prev->next, node);
113 126
114 /* 127 /*
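The smp_wmb() added above orders the two stores that publish an OSQ node: node->prev must be visible before the store to prev->next makes the node reachable, otherwise the unqueue path can read a stale ->prev and corrupt the list. The same publish pattern in stand-alone C11 form, as an analogue only (release/acquire rather than the kernel's wmb/full-barrier pairing, and none of these are the kernel primitives):

#include <stdatomic.h>
#include <stddef.h>

struct node {
	struct node *prev;
	_Atomic(struct node *) next;
};

/*
 * Writer: initialise the node, then publish it with release ordering —
 * the analogue of "node->prev = prev; smp_wmb(); WRITE_ONCE(prev->next, node)".
 */
static void publish(struct node *prev, struct node *node)
{
	node->prev = prev;
	atomic_store_explicit(&prev->next, node, memory_order_release);
}

/* Reader: an acquire load of ->next guarantees node->prev is visible. */
static struct node *prev_of_next(struct node *prev)
{
	struct node *n = atomic_load_explicit(&prev->next, memory_order_acquire);

	return n ? n->prev : NULL;
}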
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 72ad45a9a794..8d039b928d61 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -40,6 +40,9 @@ struct rt_mutex_waiter {
40/* 40/*
41 * Various helpers to access the waiters-tree: 41 * Various helpers to access the waiters-tree:
42 */ 42 */
43
44#ifdef CONFIG_RT_MUTEXES
45
43static inline int rt_mutex_has_waiters(struct rt_mutex *lock) 46static inline int rt_mutex_has_waiters(struct rt_mutex *lock)
44{ 47{
45 return !RB_EMPTY_ROOT(&lock->waiters); 48 return !RB_EMPTY_ROOT(&lock->waiters);
@@ -69,6 +72,32 @@ task_top_pi_waiter(struct task_struct *p)
69 pi_tree_entry); 72 pi_tree_entry);
70} 73}
71 74
75#else
76
77static inline int rt_mutex_has_waiters(struct rt_mutex *lock)
78{
79 return false;
80}
81
82static inline struct rt_mutex_waiter *
83rt_mutex_top_waiter(struct rt_mutex *lock)
84{
85 return NULL;
86}
87
88static inline int task_has_pi_waiters(struct task_struct *p)
89{
90 return false;
91}
92
93static inline struct rt_mutex_waiter *
94task_top_pi_waiter(struct task_struct *p)
95{
96 return NULL;
97}
98
99#endif
100
72/* 101/*
73 * lock->owner state tracking: 102 * lock->owner state tracking:
74 */ 103 */
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
index 20819df98125..0848634c5512 100644
--- a/kernel/locking/rwsem-spinlock.c
+++ b/kernel/locking/rwsem-spinlock.c
@@ -126,7 +126,7 @@ __rwsem_wake_one_writer(struct rw_semaphore *sem)
126/* 126/*
127 * get a read lock on the semaphore 127 * get a read lock on the semaphore
128 */ 128 */
129void __sched __down_read(struct rw_semaphore *sem) 129int __sched __down_read_common(struct rw_semaphore *sem, int state)
130{ 130{
131 struct rwsem_waiter waiter; 131 struct rwsem_waiter waiter;
132 unsigned long flags; 132 unsigned long flags;
@@ -140,8 +140,6 @@ void __sched __down_read(struct rw_semaphore *sem)
140 goto out; 140 goto out;
141 } 141 }
142 142
143 set_current_state(TASK_UNINTERRUPTIBLE);
144
145 /* set up my own style of waitqueue */ 143 /* set up my own style of waitqueue */
146 waiter.task = current; 144 waiter.task = current;
147 waiter.type = RWSEM_WAITING_FOR_READ; 145 waiter.type = RWSEM_WAITING_FOR_READ;
@@ -149,20 +147,41 @@ void __sched __down_read(struct rw_semaphore *sem)
149 147
150 list_add_tail(&waiter.list, &sem->wait_list); 148 list_add_tail(&waiter.list, &sem->wait_list);
151 149
152 /* we don't need to touch the semaphore struct anymore */
153 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
154
155 /* wait to be given the lock */ 150 /* wait to be given the lock */
156 for (;;) { 151 for (;;) {
157 if (!waiter.task) 152 if (!waiter.task)
158 break; 153 break;
154 if (signal_pending_state(state, current))
155 goto out_nolock;
156 set_current_state(state);
157 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
159 schedule(); 158 schedule();
160 set_current_state(TASK_UNINTERRUPTIBLE); 159 raw_spin_lock_irqsave(&sem->wait_lock, flags);
161 } 160 }
162 161
163 __set_current_state(TASK_RUNNING); 162 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
164 out: 163 out:
165 ; 164 return 0;
165
166out_nolock:
167 /*
168 * We didn't take the lock, so there is a writer, which is either
169 * the owner or the first waiter of the sem. If it's a waiter, it
170 * will be woken by the current owner. No need to wake anybody.
171 */
172 list_del(&waiter.list);
173 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
174 return -EINTR;
175}
176
177void __sched __down_read(struct rw_semaphore *sem)
178{
179 __down_read_common(sem, TASK_UNINTERRUPTIBLE);
180}
181
182int __sched __down_read_killable(struct rw_semaphore *sem)
183{
184 return __down_read_common(sem, TASK_KILLABLE);
166} 185}
167 186
168/* 187/*
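Both rwsem implementations now share the same shape: the wait loop checks signal_pending_state() and, on a fatal signal, removes itself from the wait list and returns -EINTR instead of sleeping uninterruptibly. A hedged caller-side sketch, assuming a down_read_killable() wrapper is wired up on top of __down_read_killable() (it may not be in this exact tree yet); the device structure is invented:

#include <linux/rwsem.h>
#include <linux/errno.h>

struct demo_dev {			/* hypothetical */
	struct rw_semaphore rwsem;
	int config;
};

static int demo_read_config(struct demo_dev *dev, int *out)
{
	/* Returns -EINTR if a fatal signal arrives while we sleep. */
	if (down_read_killable(&dev->rwsem))
		return -EINTR;

	*out = dev->config;
	up_read(&dev->rwsem);
	return 0;
}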
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 34e727f18e49..02f660666ab8 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -221,8 +221,8 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,
221/* 221/*
222 * Wait for the read lock to be granted 222 * Wait for the read lock to be granted
223 */ 223 */
224__visible 224static inline struct rw_semaphore __sched *
225struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) 225__rwsem_down_read_failed_common(struct rw_semaphore *sem, int state)
226{ 226{
227 long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; 227 long count, adjustment = -RWSEM_ACTIVE_READ_BIAS;
228 struct rwsem_waiter waiter; 228 struct rwsem_waiter waiter;
@@ -255,17 +255,44 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
255 255
256 /* wait to be given the lock */ 256 /* wait to be given the lock */
257 while (true) { 257 while (true) {
258 set_current_state(TASK_UNINTERRUPTIBLE); 258 set_current_state(state);
259 if (!waiter.task) 259 if (!waiter.task)
260 break; 260 break;
261 if (signal_pending_state(state, current)) {
262 raw_spin_lock_irq(&sem->wait_lock);
263 if (waiter.task)
264 goto out_nolock;
265 raw_spin_unlock_irq(&sem->wait_lock);
266 break;
267 }
261 schedule(); 268 schedule();
262 } 269 }
263 270
264 __set_current_state(TASK_RUNNING); 271 __set_current_state(TASK_RUNNING);
265 return sem; 272 return sem;
273out_nolock:
274 list_del(&waiter.list);
275 if (list_empty(&sem->wait_list))
276 atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count);
277 raw_spin_unlock_irq(&sem->wait_lock);
278 __set_current_state(TASK_RUNNING);
279 return ERR_PTR(-EINTR);
280}
281
282__visible struct rw_semaphore * __sched
283rwsem_down_read_failed(struct rw_semaphore *sem)
284{
285 return __rwsem_down_read_failed_common(sem, TASK_UNINTERRUPTIBLE);
266} 286}
267EXPORT_SYMBOL(rwsem_down_read_failed); 287EXPORT_SYMBOL(rwsem_down_read_failed);
268 288
289__visible struct rw_semaphore * __sched
290rwsem_down_read_failed_killable(struct rw_semaphore *sem)
291{
292 return __rwsem_down_read_failed_common(sem, TASK_KILLABLE);
293}
294EXPORT_SYMBOL(rwsem_down_read_failed_killable);
295
269/* 296/*
270 * This function must be called with the sem->wait_lock held to prevent 297 * This function must be called with the sem->wait_lock held to prevent
271 * race conditions between checking the rwsem wait list and setting the 298 * race conditions between checking the rwsem wait list and setting the
diff --git a/kernel/panic.c b/kernel/panic.c
index a58932b41700..bdd18afa19a4 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -26,6 +26,7 @@
26#include <linux/nmi.h> 26#include <linux/nmi.h>
27#include <linux/console.h> 27#include <linux/console.h>
28#include <linux/bug.h> 28#include <linux/bug.h>
29#include <linux/ratelimit.h>
29 30
30#define PANIC_TIMER_STEP 100 31#define PANIC_TIMER_STEP 100
31#define PANIC_BLINK_SPD 18 32#define PANIC_BLINK_SPD 18
@@ -601,6 +602,17 @@ EXPORT_SYMBOL(__stack_chk_fail);
601 602
602#endif 603#endif
603 604
605#ifdef CONFIG_ARCH_HAS_REFCOUNT
606void refcount_error_report(struct pt_regs *regs, const char *err)
607{
608 WARN_RATELIMIT(1, "refcount_t %s at %pB in %s[%d], uid/euid: %u/%u\n",
609 err, (void *)instruction_pointer(regs),
610 current->comm, task_pid_nr(current),
611 from_kuid_munged(&init_user_ns, current_uid()),
612 from_kuid_munged(&init_user_ns, current_euid()));
613}
614#endif
615
604core_param(panic, panic_timeout, int, 0644); 616core_param(panic, panic_timeout, int, 0644);
605core_param(pause_on_oops, pause_on_oops, int, 0644); 617core_param(pause_on_oops, pause_on_oops, int, 0644);
606core_param(panic_on_warn, panic_on_warn, int, 0644); 618core_param(panic_on_warn, panic_on_warn, int, 0644);
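refcount_error_report() is the back end for CONFIG_ARCH_HAS_REFCOUNT: the x86 refcount_t fast paths saturate on overflow/underflow and trap into an exception handler, which lands here with a rate-limited warning instead of a silent use-after-free. Typical refcount_t usage that this protects, as a hedged sketch with invented structure and function names:

#include <linux/refcount.h>
#include <linux/slab.h>

struct demo_obj {			/* hypothetical */
	refcount_t ref;
	/* ... payload ... */
};

static struct demo_obj *demo_obj_alloc(void)
{
	struct demo_obj *obj = kzalloc(sizeof(*obj), GFP_KERNEL);

	if (obj)
		refcount_set(&obj->ref, 1);
	return obj;
}

static void demo_obj_get(struct demo_obj *obj)
{
	/* Saturates (and reports) instead of wrapping past the maximum. */
	refcount_inc(&obj->ref);
}

static void demo_obj_put(struct demo_obj *obj)
{
	/* An extra put underflows: reported, not turned into a use-after-free. */
	if (refcount_dec_and_test(&obj->ref))
		kfree(obj);
}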
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 13fc5ae9bf2f..566b6ec7b6fe 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -32,6 +32,12 @@ void complete(struct completion *x)
32 unsigned long flags; 32 unsigned long flags;
33 33
34 spin_lock_irqsave(&x->wait.lock, flags); 34 spin_lock_irqsave(&x->wait.lock, flags);
35
36 /*
37 * Perform commit of crossrelease here.
38 */
39 complete_release_commit(x);
40
35 if (x->done != UINT_MAX) 41 if (x->done != UINT_MAX)
36 x->done++; 42 x->done++;
37 __wake_up_locked(&x->wait, TASK_NORMAL, 1); 43 __wake_up_locked(&x->wait, TASK_NORMAL, 1);
@@ -92,9 +98,14 @@ __wait_for_common(struct completion *x,
92{ 98{
93 might_sleep(); 99 might_sleep();
94 100
101 complete_acquire(x);
102
95 spin_lock_irq(&x->wait.lock); 103 spin_lock_irq(&x->wait.lock);
96 timeout = do_wait_for_common(x, action, timeout, state); 104 timeout = do_wait_for_common(x, action, timeout, state);
97 spin_unlock_irq(&x->wait.lock); 105 spin_unlock_irq(&x->wait.lock);
106
107 complete_release(x);
108
98 return timeout; 109 return timeout;
99} 110}
100 111
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0869b20fba81..9fece583a1f0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1967,8 +1967,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
1967 * reordered with p->state check below. This pairs with mb() in 1967 * reordered with p->state check below. This pairs with mb() in
1968 * set_current_state() the waiting thread does. 1968 * set_current_state() the waiting thread does.
1969 */ 1969 */
1970 smp_mb__before_spinlock();
1971 raw_spin_lock_irqsave(&p->pi_lock, flags); 1970 raw_spin_lock_irqsave(&p->pi_lock, flags);
1971 smp_mb__after_spinlock();
1972 if (!(p->state & state)) 1972 if (!(p->state & state))
1973 goto out; 1973 goto out;
1974 1974
@@ -3281,8 +3281,8 @@ static void __sched notrace __schedule(bool preempt)
3281 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) 3281 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
3282 * done by the caller to avoid the race with signal_wake_up(). 3282 * done by the caller to avoid the race with signal_wake_up().
3283 */ 3283 */
3284 smp_mb__before_spinlock();
3285 rq_lock(rq, &rf); 3284 rq_lock(rq, &rf);
3285 smp_mb__after_spinlock();
3286 3286
3287 /* Promote REQ to ACT */ 3287 /* Promote REQ to ACT */
3288 rq->clock_update_flags <<= 1; 3288 rq->clock_update_flags <<= 1;
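Editor's note: the barrier moves from before the lock to an explicit smp_mb__after_spinlock() once the lock is held. The point is that the waker's earlier stores must not be reordered past its read of p->state, pairing with the full barrier implied by set_current_state() on the sleeping side. A condensed sketch of that pairing; the condition flag is hypothetical:

    #include <linux/sched.h>

    static bool cond;       /* hypothetical wakeup condition */

    /* Sleeper: set_current_state() orders the state store before the cond load. */
    static void sleeper(void)
    {
            set_current_state(TASK_UNINTERRUPTIBLE);
            if (!READ_ONCE(cond))
                    schedule();
            __set_current_state(TASK_RUNNING);
    }

    /*
     * Waker: spin_lock() is only an ACQUIRE, so the store to cond could
     * otherwise be reordered after the p->state read; smp_mb__after_spinlock()
     * provides the full ordering once pi_lock is held.
     */
    static void waker(struct task_struct *p)
    {
            unsigned long flags;

            WRITE_ONCE(cond, true);
            raw_spin_lock_irqsave(&p->pi_lock, flags);
            smp_mb__after_spinlock();
            if (p->state & TASK_UNINTERRUPTIBLE) {
                    /* ... proceed to wake p up ... */
            }
            raw_spin_unlock_irqrestore(&p->pi_lock, flags);
    }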
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
index 3d5610dcce11..2227e183e202 100644
--- a/kernel/sched/swait.c
+++ b/kernel/sched/swait.c
@@ -33,9 +33,6 @@ void swake_up(struct swait_queue_head *q)
33{ 33{
34 unsigned long flags; 34 unsigned long flags;
35 35
36 if (!swait_active(q))
37 return;
38
39 raw_spin_lock_irqsave(&q->lock, flags); 36 raw_spin_lock_irqsave(&q->lock, flags);
40 swake_up_locked(q); 37 swake_up_locked(q);
41 raw_spin_unlock_irqrestore(&q->lock, flags); 38 raw_spin_unlock_irqrestore(&q->lock, flags);
@@ -51,9 +48,6 @@ void swake_up_all(struct swait_queue_head *q)
51 struct swait_queue *curr; 48 struct swait_queue *curr;
52 LIST_HEAD(tmp); 49 LIST_HEAD(tmp);
53 50
54 if (!swait_active(q))
55 return;
56
57 raw_spin_lock_irq(&q->lock); 51 raw_spin_lock_irq(&q->lock);
58 list_splice_init(&q->task_list, &tmp); 52 list_splice_init(&q->task_list, &tmp);
59 while (!list_empty(&tmp)) { 53 while (!list_empty(&tmp)) {
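Editor's note: dropping the unlocked swait_active() check from swake_up()/swake_up_all() closes a window in which a waiter that has queued itself, but whose list update is not yet visible to the waker, could miss its wakeup. Callers that still want a lockless fast path have to provide the ordering themselves at the call site, roughly like the hedged sketch below (the event flag is hypothetical):

    #include <linux/swait.h>

    static DECLARE_SWAIT_QUEUE_HEAD(wq);
    static bool event;      /* hypothetical condition the waiter sleeps on */

    static void signal_event(void)
    {
            smp_store_mb(event, true);      /* publish the condition, full barrier */
            if (swait_active(&wq))          /* safe only because of the barrier above */
                    swake_up(&wq);
    }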
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ca937b0c3a96..f128b3becfe1 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2093,6 +2093,7 @@ __acquires(&pool->lock)
2093 2093
2094 lock_map_acquire_read(&pwq->wq->lockdep_map); 2094 lock_map_acquire_read(&pwq->wq->lockdep_map);
2095 lock_map_acquire(&lockdep_map); 2095 lock_map_acquire(&lockdep_map);
2096 crossrelease_hist_start(XHLOCK_PROC);
2096 trace_workqueue_execute_start(work); 2097 trace_workqueue_execute_start(work);
2097 worker->current_func(work); 2098 worker->current_func(work);
2098 /* 2099 /*
@@ -2100,6 +2101,7 @@ __acquires(&pool->lock)
2100 * point will only record its address. 2101 * point will only record its address.
2101 */ 2102 */
2102 trace_workqueue_execute_end(work); 2103 trace_workqueue_execute_end(work);
2104 crossrelease_hist_end(XHLOCK_PROC);
2103 lock_map_release(&lockdep_map); 2105 lock_map_release(&lockdep_map);
2104 lock_map_release(&pwq->wq->lockdep_map); 2106 lock_map_release(&pwq->wq->lockdep_map);
2105 2107
@@ -2474,7 +2476,16 @@ static void insert_wq_barrier(struct pool_workqueue *pwq,
2474 */ 2476 */
2475 INIT_WORK_ONSTACK(&barr->work, wq_barrier_func); 2477 INIT_WORK_ONSTACK(&barr->work, wq_barrier_func);
2476 __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); 2478 __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
2477 init_completion(&barr->done); 2479
2480 /*
 2481 * Explicitly init the crosslock for wq_barrier::done, making its lock
 2482 * key a subkey of the corresponding work. As a result we won't
 2483 * build a dependency between wq_barrier::done and unrelated work.
2484 */
2485 lockdep_init_map_crosslock((struct lockdep_map *)&barr->done.map,
2486 "(complete)wq_barr::done",
2487 target->lockdep_map.key, 1);
2488 __init_completion(&barr->done);
2478 barr->task = current; 2489 barr->task = current;
2479 2490
2480 /* 2491 /*
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index c617b9d1d6cb..e718df3cbd46 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1088,6 +1088,8 @@ config PROVE_LOCKING
1088 select DEBUG_MUTEXES 1088 select DEBUG_MUTEXES
1089 select DEBUG_RT_MUTEXES if RT_MUTEXES 1089 select DEBUG_RT_MUTEXES if RT_MUTEXES
1090 select DEBUG_LOCK_ALLOC 1090 select DEBUG_LOCK_ALLOC
1091 select LOCKDEP_CROSSRELEASE
1092 select LOCKDEP_COMPLETIONS
1091 select TRACE_IRQFLAGS 1093 select TRACE_IRQFLAGS
1092 default n 1094 default n
1093 help 1095 help
@@ -1157,6 +1159,22 @@ config LOCK_STAT
1157 CONFIG_LOCK_STAT defines "contended" and "acquired" lock events. 1159 CONFIG_LOCK_STAT defines "contended" and "acquired" lock events.
1158 (CONFIG_LOCKDEP defines "acquire" and "release" events.) 1160 (CONFIG_LOCKDEP defines "acquire" and "release" events.)
1159 1161
1162config LOCKDEP_CROSSRELEASE
1163 bool
1164 help
 1165 This makes lockdep work for crosslocks, which are locks allowed to
 1166 be released in a different context from the acquisition context.
 1167 Normally a lock must be released in the context that acquired it.
 1168 However, relaxing this constraint allows synchronization primitives
 1169 such as page locks or completions to use the lock correctness
 1170 detector, lockdep.
1171
1172config LOCKDEP_COMPLETIONS
1173 bool
1174 help
 1175 A deadlock caused by wait_for_completion() and complete() can be
 1176 detected by lockdep using the crossrelease feature.
1177
1160config DEBUG_LOCKDEP 1178config DEBUG_LOCKDEP
1161 bool "Lock dependency engine debugging" 1179 bool "Lock dependency engine debugging"
1162 depends on DEBUG_KERNEL && LOCKDEP 1180 depends on DEBUG_KERNEL && LOCKDEP
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 90731e3b7e58..3644ff918434 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1510,8 +1510,15 @@ int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
1510 } 1510 }
1511 1511
1512 /* 1512 /*
1513 * The page_table_lock above provides a memory barrier 1513 * Since we took the NUMA fault, we must have observed the !accessible
1514 * with change_protection_range. 1514 * bit. Make sure all other CPUs agree with that, to avoid them
1515 * modifying the page we're about to migrate.
1516 *
1517 * Must be done under PTL such that we'll observe the relevant
1518 * inc_tlb_flush_pending().
1519 *
 1520 * We are not sure whether a pending tlb flush here is for a huge page
 1521 * mapping or not, hence use the tlb range variant.
1515 */ 1522 */
1516 if (mm_tlb_flush_pending(vma->vm_mm)) 1523 if (mm_tlb_flush_pending(vma->vm_mm))
1517 flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE); 1524 flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE);
@@ -1521,6 +1528,7 @@ int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
1521 * and access rights restored. 1528 * and access rights restored.
1522 */ 1529 */
1523 spin_unlock(vmf->ptl); 1530 spin_unlock(vmf->ptl);
1531
1524 migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma, 1532 migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma,
1525 vmf->pmd, pmd, vmf->address, page, target_nid); 1533 vmf->pmd, pmd, vmf->address, page, target_nid);
1526 if (migrated) { 1534 if (migrated) {
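Editor's note: the new comment relies on a pairing between the side that changes protections, which advertises a pending flush before touching the PTEs, and this fault path, which observes that flag under the same page-table lock. A rough sketch of the two sides; this is a simplification, not the actual change_protection_range() code:

    #include <linux/mm.h>
    #include <linux/huge_mm.h>
    #include <asm/tlbflush.h>

    /* Protection-changing side: advertise the flush before modifying PTEs. */
    static void updater(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long start, unsigned long end)
    {
            inc_tlb_flush_pending(mm);      /* visible before the PTE stores */
            /* ... clear/modify PTEs under the page-table lock ... */
            flush_tlb_range(vma, start, end);
            dec_tlb_flush_pending(mm);
    }

    /* NUMA-fault side (as in the hunk above), called with the PTL held. */
    static void numa_fault_side(struct vm_area_struct *vma, unsigned long haddr)
    {
            if (mm_tlb_flush_pending(vma->vm_mm))
                    flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE);
    }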
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index ca11bc4ce205..6f319fb81718 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -267,13 +267,13 @@ static void check_memory_region(unsigned long addr,
267 check_memory_region_inline(addr, size, write, ret_ip); 267 check_memory_region_inline(addr, size, write, ret_ip);
268} 268}
269 269
270void kasan_check_read(const void *p, unsigned int size) 270void kasan_check_read(const volatile void *p, unsigned int size)
271{ 271{
272 check_memory_region((unsigned long)p, size, false, _RET_IP_); 272 check_memory_region((unsigned long)p, size, false, _RET_IP_);
273} 273}
274EXPORT_SYMBOL(kasan_check_read); 274EXPORT_SYMBOL(kasan_check_read);
275 275
276void kasan_check_write(const void *p, unsigned int size) 276void kasan_check_write(const volatile void *p, unsigned int size)
277{ 277{
278 check_memory_region((unsigned long)p, size, true, _RET_IP_); 278 check_memory_region((unsigned long)p, size, true, _RET_IP_);
279} 279}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1bad301820c7..471b0526b876 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -66,6 +66,7 @@
66#include <linux/kthread.h> 66#include <linux/kthread.h>
67#include <linux/memcontrol.h> 67#include <linux/memcontrol.h>
68#include <linux/ftrace.h> 68#include <linux/ftrace.h>
69#include <linux/lockdep.h>
69 70
70#include <asm/sections.h> 71#include <asm/sections.h>
71#include <asm/tlbflush.h> 72#include <asm/tlbflush.h>
@@ -3494,6 +3495,47 @@ should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_fla
3494} 3495}
3495#endif /* CONFIG_COMPACTION */ 3496#endif /* CONFIG_COMPACTION */
3496 3497
3498#ifdef CONFIG_LOCKDEP
3499struct lockdep_map __fs_reclaim_map =
3500 STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map);
3501
3502static bool __need_fs_reclaim(gfp_t gfp_mask)
3503{
3504 gfp_mask = current_gfp_context(gfp_mask);
3505
3506 /* no reclaim without waiting on it */
3507 if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
3508 return false;
3509
3510 /* this guy won't enter reclaim */
3511 if ((current->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC))
3512 return false;
3513
 3514 /* We're only interested in __GFP_FS allocations for now */
3515 if (!(gfp_mask & __GFP_FS))
3516 return false;
3517
3518 if (gfp_mask & __GFP_NOLOCKDEP)
3519 return false;
3520
3521 return true;
3522}
3523
3524void fs_reclaim_acquire(gfp_t gfp_mask)
3525{
3526 if (__need_fs_reclaim(gfp_mask))
3527 lock_map_acquire(&__fs_reclaim_map);
3528}
3529EXPORT_SYMBOL_GPL(fs_reclaim_acquire);
3530
3531void fs_reclaim_release(gfp_t gfp_mask)
3532{
3533 if (__need_fs_reclaim(gfp_mask))
3534 lock_map_release(&__fs_reclaim_map);
3535}
3536EXPORT_SYMBOL_GPL(fs_reclaim_release);
3537#endif
3538
3497/* Perform direct synchronous page reclaim */ 3539/* Perform direct synchronous page reclaim */
3498static int 3540static int
3499__perform_reclaim(gfp_t gfp_mask, unsigned int order, 3541__perform_reclaim(gfp_t gfp_mask, unsigned int order,
@@ -3508,7 +3550,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
3508 /* We now go into synchronous reclaim */ 3550 /* We now go into synchronous reclaim */
3509 cpuset_memory_pressure_bump(); 3551 cpuset_memory_pressure_bump();
3510 noreclaim_flag = memalloc_noreclaim_save(); 3552 noreclaim_flag = memalloc_noreclaim_save();
3511 lockdep_set_current_reclaim_state(gfp_mask); 3553 fs_reclaim_acquire(gfp_mask);
3512 reclaim_state.reclaimed_slab = 0; 3554 reclaim_state.reclaimed_slab = 0;
3513 current->reclaim_state = &reclaim_state; 3555 current->reclaim_state = &reclaim_state;
3514 3556
@@ -3516,7 +3558,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
3516 ac->nodemask); 3558 ac->nodemask);
3517 3559
3518 current->reclaim_state = NULL; 3560 current->reclaim_state = NULL;
3519 lockdep_clear_current_reclaim_state(); 3561 fs_reclaim_release(gfp_mask);
3520 memalloc_noreclaim_restore(noreclaim_flag); 3562 memalloc_noreclaim_restore(noreclaim_flag);
3521 3563
3522 cond_resched(); 3564 cond_resched();
@@ -4045,7 +4087,8 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
4045 *alloc_flags |= ALLOC_CPUSET; 4087 *alloc_flags |= ALLOC_CPUSET;
4046 } 4088 }
4047 4089
4048 lockdep_trace_alloc(gfp_mask); 4090 fs_reclaim_acquire(gfp_mask);
4091 fs_reclaim_release(gfp_mask);
4049 4092
4050 might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM); 4093 might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
4051 4094
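Editor's note: fs_reclaim turns "may enter filesystem reclaim" into an ordinary lockdep map: every allocation that could reclaim briefly acquires and releases it in prepare_alloc_pages(), and real direct reclaim holds it across the reclaim call. Holding a lock that the filesystem's shrinker or writeback path also takes while doing a GFP_KERNEL allocation therefore closes a reportable cycle. A hedged sketch of the pattern this flags; the lock and functions are hypothetical:

    #include <linux/mutex.h>
    #include <linux/slab.h>

    static DEFINE_MUTEX(fs_internal_lock);  /* hypothetical filesystem lock */

    /* Allocation path: GFP_KERNEL implies __GFP_FS, so lockdep records
     * fs_internal_lock -> fs_reclaim.
     */
    static void *alloc_under_lock(void)
    {
            void *p;

            mutex_lock(&fs_internal_lock);
            p = kmalloc(64, GFP_KERNEL);    /* GFP_NOFS here would break the cycle */
            mutex_unlock(&fs_internal_lock);
            return p;
    }

    /* Reclaim path: runs with fs_reclaim held, so taking the same lock
     * records fs_reclaim -> fs_internal_lock and completes the cycle.
     */
    static void shrink_fs_objects(void)
    {
            mutex_lock(&fs_internal_lock);
            /* ... free cached filesystem objects ... */
            mutex_unlock(&fs_internal_lock);
    }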
diff --git a/mm/slab.h b/mm/slab.h
index 6885e1192ec5..073362816acc 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -43,6 +43,7 @@ struct kmem_cache {
43#include <linux/kasan.h> 43#include <linux/kasan.h>
44#include <linux/kmemleak.h> 44#include <linux/kmemleak.h>
45#include <linux/random.h> 45#include <linux/random.h>
46#include <linux/sched/mm.h>
46 47
47/* 48/*
48 * State of the slab allocator. 49 * State of the slab allocator.
@@ -412,7 +413,10 @@ static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
412 gfp_t flags) 413 gfp_t flags)
413{ 414{
414 flags &= gfp_allowed_mask; 415 flags &= gfp_allowed_mask;
415 lockdep_trace_alloc(flags); 416
417 fs_reclaim_acquire(flags);
418 fs_reclaim_release(flags);
419
416 might_sleep_if(gfpflags_allow_blocking(flags)); 420 might_sleep_if(gfpflags_allow_blocking(flags));
417 421
418 if (should_failslab(s, flags)) 422 if (should_failslab(s, flags))
diff --git a/mm/slob.c b/mm/slob.c
index 1bae78d71096..a8bd6fa11a66 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -432,7 +432,8 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
432 432
433 gfp &= gfp_allowed_mask; 433 gfp &= gfp_allowed_mask;
434 434
435 lockdep_trace_alloc(gfp); 435 fs_reclaim_acquire(gfp);
436 fs_reclaim_release(gfp);
436 437
437 if (size < PAGE_SIZE - align) { 438 if (size < PAGE_SIZE - align) {
438 if (!size) 439 if (!size)
@@ -538,7 +539,8 @@ static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
538 539
539 flags &= gfp_allowed_mask; 540 flags &= gfp_allowed_mask;
540 541
541 lockdep_trace_alloc(flags); 542 fs_reclaim_acquire(flags);
543 fs_reclaim_release(flags);
542 544
543 if (c->size < PAGE_SIZE) { 545 if (c->size < PAGE_SIZE) {
544 b = slob_alloc(c->size, flags, c->align, node); 546 b = slob_alloc(c->size, flags, c->align, node);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a1af041930a6..f957afe900ec 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3525,8 +3525,6 @@ static int kswapd(void *p)
3525 }; 3525 };
3526 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); 3526 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
3527 3527
3528 lockdep_set_current_reclaim_state(GFP_KERNEL);
3529
3530 if (!cpumask_empty(cpumask)) 3528 if (!cpumask_empty(cpumask))
3531 set_cpus_allowed_ptr(tsk, cpumask); 3529 set_cpus_allowed_ptr(tsk, cpumask);
3532 current->reclaim_state = &reclaim_state; 3530 current->reclaim_state = &reclaim_state;
@@ -3585,14 +3583,15 @@ kswapd_try_sleep:
3585 */ 3583 */
3586 trace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx, 3584 trace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx,
3587 alloc_order); 3585 alloc_order);
3586 fs_reclaim_acquire(GFP_KERNEL);
3588 reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx); 3587 reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx);
3588 fs_reclaim_release(GFP_KERNEL);
3589 if (reclaim_order < alloc_order) 3589 if (reclaim_order < alloc_order)
3590 goto kswapd_try_sleep; 3590 goto kswapd_try_sleep;
3591 } 3591 }
3592 3592
3593 tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD); 3593 tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
3594 current->reclaim_state = NULL; 3594 current->reclaim_state = NULL;
3595 lockdep_clear_current_reclaim_state();
3596 3595
3597 return 0; 3596 return 0;
3598} 3597}
@@ -3655,14 +3654,14 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
3655 unsigned int noreclaim_flag; 3654 unsigned int noreclaim_flag;
3656 3655
3657 noreclaim_flag = memalloc_noreclaim_save(); 3656 noreclaim_flag = memalloc_noreclaim_save();
3658 lockdep_set_current_reclaim_state(sc.gfp_mask); 3657 fs_reclaim_acquire(sc.gfp_mask);
3659 reclaim_state.reclaimed_slab = 0; 3658 reclaim_state.reclaimed_slab = 0;
3660 p->reclaim_state = &reclaim_state; 3659 p->reclaim_state = &reclaim_state;
3661 3660
3662 nr_reclaimed = do_try_to_free_pages(zonelist, &sc); 3661 nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
3663 3662
3664 p->reclaim_state = NULL; 3663 p->reclaim_state = NULL;
3665 lockdep_clear_current_reclaim_state(); 3664 fs_reclaim_release(sc.gfp_mask);
3666 memalloc_noreclaim_restore(noreclaim_flag); 3665 memalloc_noreclaim_restore(noreclaim_flag);
3667 3666
3668 return nr_reclaimed; 3667 return nr_reclaimed;
@@ -3847,7 +3846,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
3847 */ 3846 */
3848 noreclaim_flag = memalloc_noreclaim_save(); 3847 noreclaim_flag = memalloc_noreclaim_save();
3849 p->flags |= PF_SWAPWRITE; 3848 p->flags |= PF_SWAPWRITE;
3850 lockdep_set_current_reclaim_state(sc.gfp_mask); 3849 fs_reclaim_acquire(sc.gfp_mask);
3851 reclaim_state.reclaimed_slab = 0; 3850 reclaim_state.reclaimed_slab = 0;
3852 p->reclaim_state = &reclaim_state; 3851 p->reclaim_state = &reclaim_state;
3853 3852
@@ -3862,9 +3861,9 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
3862 } 3861 }
3863 3862
3864 p->reclaim_state = NULL; 3863 p->reclaim_state = NULL;
3864 fs_reclaim_release(gfp_mask);
3865 current->flags &= ~PF_SWAPWRITE; 3865 current->flags &= ~PF_SWAPWRITE;
3866 memalloc_noreclaim_restore(noreclaim_flag); 3866 memalloc_noreclaim_restore(noreclaim_flag);
3867 lockdep_clear_current_reclaim_state();
3868 return sc.nr_reclaimed >= nr_pages; 3867 return sc.nr_reclaimed >= nr_pages;
3869} 3868}
3870 3869
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index cd1d044a7fa5..ebe46ed997cb 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1810,8 +1810,7 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
1810static struct static_key udp_encap_needed __read_mostly; 1810static struct static_key udp_encap_needed __read_mostly;
1811void udp_encap_enable(void) 1811void udp_encap_enable(void)
1812{ 1812{
1813 if (!static_key_enabled(&udp_encap_needed)) 1813 static_key_enable(&udp_encap_needed);
1814 static_key_slow_inc(&udp_encap_needed);
1815} 1814}
1816EXPORT_SYMBOL(udp_encap_enable); 1815EXPORT_SYMBOL(udp_encap_enable);
1817 1816
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 20039c8501eb..8cd9b628cdc7 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -575,8 +575,7 @@ static __inline__ void udpv6_err(struct sk_buff *skb,
575static struct static_key udpv6_encap_needed __read_mostly; 575static struct static_key udpv6_encap_needed __read_mostly;
576void udpv6_encap_enable(void) 576void udpv6_encap_enable(void)
577{ 577{
578 if (!static_key_enabled(&udpv6_encap_needed)) 578 static_key_enable(&udpv6_encap_needed);
579 static_key_slow_inc(&udpv6_encap_needed);
580} 579}
581EXPORT_SYMBOL(udpv6_encap_enable); 580EXPORT_SYMBOL(udpv6_encap_enable);
582 581
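Editor's note: the old open-coded test-and-increment is racy: two callers can both observe the key as disabled and both call static_key_slow_inc(), leaving a count that a single disable can never balance. static_key_enable() is the idempotent "turn it on once" primitive. A minimal sketch of the change in isolation; the key name is illustrative:

    #include <linux/jump_label.h>

    static struct static_key feature_needed __read_mostly;

    static void feature_enable(void)
    {
            /* Racy pattern being replaced:
             *      if (!static_key_enabled(&feature_needed))
             *              static_key_slow_inc(&feature_needed);
             */
            static_key_enable(&feature_needed);     /* enables at most once */
    }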
diff --git a/scripts/Makefile.build b/scripts/Makefile.build
index f6152c70f7f4..a18cb4496e1e 100644
--- a/scripts/Makefile.build
+++ b/scripts/Makefile.build
@@ -262,6 +262,9 @@ objtool_args = check
262ifndef CONFIG_FRAME_POINTER 262ifndef CONFIG_FRAME_POINTER
263objtool_args += --no-fp 263objtool_args += --no-fp
264endif 264endif
265ifdef CONFIG_GCOV_KERNEL
266objtool_args += --no-unreachable
267endif
265 268
266# 'OBJECT_FILES_NON_STANDARD := y': skip objtool checking for a directory 269# 'OBJECT_FILES_NON_STANDARD := y': skip objtool checking for a directory
267# 'OBJECT_FILES_NON_STANDARD_foo.o := 'y': skip objtool checking for a file 270# 'OBJECT_FILES_NON_STANDARD_foo.o := 'y': skip objtool checking for a file
diff --git a/tools/objtool/Build b/tools/objtool/Build
index 6f2e1987c4d9..749becdf5b90 100644
--- a/tools/objtool/Build
+++ b/tools/objtool/Build
@@ -1,6 +1,9 @@
1objtool-y += arch/$(SRCARCH)/ 1objtool-y += arch/$(SRCARCH)/
2objtool-y += builtin-check.o 2objtool-y += builtin-check.o
3objtool-y += builtin-orc.o
3objtool-y += check.o 4objtool-y += check.o
5objtool-y += orc_gen.o
6objtool-y += orc_dump.o
4objtool-y += elf.o 7objtool-y += elf.o
5objtool-y += special.o 8objtool-y += special.o
6objtool-y += objtool.o 9objtool-y += objtool.o
diff --git a/tools/objtool/Documentation/stack-validation.txt b/tools/objtool/Documentation/stack-validation.txt
index 17c1195f11f4..6a1af43862df 100644
--- a/tools/objtool/Documentation/stack-validation.txt
+++ b/tools/objtool/Documentation/stack-validation.txt
@@ -11,9 +11,6 @@ analyzes every .o file and ensures the validity of its stack metadata.
11It enforces a set of rules on asm code and C inline assembly code so 11It enforces a set of rules on asm code and C inline assembly code so
12that stack traces can be reliable. 12that stack traces can be reliable.
13 13
14Currently it only checks frame pointer usage, but there are plans to add
15CFI validation for C files and CFI generation for asm files.
16
17For each function, it recursively follows all possible code paths and 14For each function, it recursively follows all possible code paths and
18validates the correct frame pointer state at each instruction. 15validates the correct frame pointer state at each instruction.
19 16
@@ -23,6 +20,10 @@ alternative execution paths to a given instruction (or set of
23instructions). Similarly, it knows how to follow switch statements, for 20instructions). Similarly, it knows how to follow switch statements, for
24which gcc sometimes uses jump tables. 21which gcc sometimes uses jump tables.
25 22
23(Objtool also has an 'orc generate' subcommand which generates debuginfo
24for the ORC unwinder. See Documentation/x86/orc-unwinder.txt in the
25kernel tree for more details.)
26
26 27
27Why do we need stack metadata validation? 28Why do we need stack metadata validation?
28----------------------------------------- 29-----------------------------------------
@@ -93,37 +94,14 @@ a) More reliable stack traces for frame pointer enabled kernels
93 or at the very end of the function after the stack frame has been 94 or at the very end of the function after the stack frame has been
94 destroyed. This is an inherent limitation of frame pointers. 95 destroyed. This is an inherent limitation of frame pointers.
95 96
96b) 100% reliable stack traces for DWARF enabled kernels 97b) ORC (Oops Rewind Capability) unwind table generation
97
98 (NOTE: This is not yet implemented)
99
100 As an alternative to frame pointers, DWARF Call Frame Information
101 (CFI) metadata can be used to walk the stack. Unlike frame pointers,
102 CFI metadata is out of band. So it doesn't affect runtime
103 performance and it can be reliable even when interrupts or exceptions
104 are involved.
105
106 For C code, gcc automatically generates DWARF CFI metadata. But for
107 asm code, generating CFI is a tedious manual approach which requires
108 manually placed .cfi assembler macros to be scattered throughout the
109 code. It's clumsy and very easy to get wrong, and it makes the real
110 code harder to read.
111
112 Stacktool will improve this situation in several ways. For code
113 which already has CFI annotations, it will validate them. For code
114 which doesn't have CFI annotations, it will generate them. So an
115 architecture can opt to strip out all the manual .cfi annotations
116 from their asm code and have objtool generate them instead.
117 98
118 We might also add a runtime stack validation debug option where we 99 An alternative to frame pointers and DWARF, ORC unwind data can be
119 periodically walk the stack from schedule() and/or an NMI to ensure 100 used to walk the stack. Unlike frame pointers, ORC data is out of
120 that the stack metadata is sane and that we reach the bottom of the 101 band. So it doesn't affect runtime performance and it can be
121 stack. 102 reliable even when interrupts or exceptions are involved.
122 103
123 So the benefit of objtool here will be that external tooling should 104 For more details, see Documentation/x86/orc-unwinder.txt.
124 always show perfect stack traces. And the same will be true for
125 kernel warning/oops traces if the architecture has a runtime DWARF
126 unwinder.
127 105
128c) Higher live patching compatibility rate 106c) Higher live patching compatibility rate
129 107
@@ -211,7 +189,7 @@ they mean, and suggestions for how to fix them.
211 function, add proper frame pointer logic using the FRAME_BEGIN and 189 function, add proper frame pointer logic using the FRAME_BEGIN and
212 FRAME_END macros. Otherwise, if it's not a callable function, remove 190 FRAME_END macros. Otherwise, if it's not a callable function, remove
213 its ELF function annotation by changing ENDPROC to END, and instead 191 its ELF function annotation by changing ENDPROC to END, and instead
214 use the manual CFI hint macros in asm/undwarf.h. 192 use the manual unwind hint macros in asm/unwind_hints.h.
215 193
216 If it's a GCC-compiled .c file, the error may be because the function 194 If it's a GCC-compiled .c file, the error may be because the function
217 uses an inline asm() statement which has a "call" instruction. An 195 uses an inline asm() statement which has a "call" instruction. An
@@ -231,8 +209,8 @@ they mean, and suggestions for how to fix them.
231 If the error is for an asm file, and the instruction is inside (or 209 If the error is for an asm file, and the instruction is inside (or
232 reachable from) a callable function, the function should be annotated 210 reachable from) a callable function, the function should be annotated
233 with the ENTRY/ENDPROC macros (ENDPROC is the important one). 211 with the ENTRY/ENDPROC macros (ENDPROC is the important one).
234 Otherwise, the code should probably be annotated with the CFI hint 212 Otherwise, the code should probably be annotated with the unwind hint
235 macros in asm/undwarf.h so objtool and the unwinder can know the 213 macros in asm/unwind_hints.h so objtool and the unwinder can know the
236 stack state associated with the code. 214 stack state associated with the code.
237 215
238 If you're 100% sure the code won't affect stack traces, or if you're 216 If you're 100% sure the code won't affect stack traces, or if you're
@@ -258,7 +236,7 @@ they mean, and suggestions for how to fix them.
258 instructions aren't allowed in a callable function, and are most 236 instructions aren't allowed in a callable function, and are most
259 likely part of the kernel entry code. They should usually not have 237 likely part of the kernel entry code. They should usually not have
260 the callable function annotation (ENDPROC) and should always be 238 the callable function annotation (ENDPROC) and should always be
261 annotated with the CFI hint macros in asm/undwarf.h. 239 annotated with the unwind hint macros in asm/unwind_hints.h.
262 240
263 241
2646. file.o: warning: objtool: func()+0x26: sibling call from callable instruction with modified stack frame 2426. file.o: warning: objtool: func()+0x26: sibling call from callable instruction with modified stack frame
@@ -272,7 +250,7 @@ they mean, and suggestions for how to fix them.
272 250
273 If the instruction is not actually in a callable function (e.g. 251 If the instruction is not actually in a callable function (e.g.
274 kernel entry code), change ENDPROC to END and annotate manually with 252 kernel entry code), change ENDPROC to END and annotate manually with
275 the CFI hint macros in asm/undwarf.h. 253 the unwind hint macros in asm/unwind_hints.h.
276 254
277 255
2787. file: warning: objtool: func()+0x5c: stack state mismatch 2567. file: warning: objtool: func()+0x5c: stack state mismatch
@@ -288,8 +266,8 @@ they mean, and suggestions for how to fix them.
288 266
289 Another possibility is that the code has some asm or inline asm which 267 Another possibility is that the code has some asm or inline asm which
290 does some unusual things to the stack or the frame pointer. In such 268 does some unusual things to the stack or the frame pointer. In such
291 cases it's probably appropriate to use the CFI hint macros in 269 cases it's probably appropriate to use the unwind hint macros in
292 asm/undwarf.h. 270 asm/unwind_hints.h.
293 271
294 272
2958. file.o: warning: objtool: funcA() falls through to next function funcB() 2738. file.o: warning: objtool: funcA() falls through to next function funcB()
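Editor's note: for orientation, the ORC data the updated documentation refers to is a table of compact per-address-range entries; the unwinder finds the entry covering the current instruction pointer and derives the caller's stack pointer from a base register plus an offset. A rough sketch of that single unwind step, with a simplified entry layout that only approximates orc_types.h and is not copied from it:

    /* Simplified, non-authoritative approximation of an ORC entry. */
    struct orc_entry_sketch {
            short           sp_offset;      /* caller SP = base reg + sp_offset */
            short           bp_offset;      /* where the caller's BP was saved */
            unsigned int    sp_reg:4;       /* base register: SP, BP, ... */
            unsigned int    type:2;         /* call frame, pt_regs, iret frame */
    };

    static unsigned long unwind_prev_sp(const struct orc_entry_sketch *orc,
                                        unsigned long sp, unsigned long bp)
    {
            /* assume sp_reg == 0 means "based on SP", anything else "based on BP" */
            unsigned long base = orc->sp_reg == 0 ? sp : bp;

            return base + orc->sp_offset;   /* the caller's stack pointer */
    }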
diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile
index 0e2765e243c0..3a6425fefc43 100644
--- a/tools/objtool/Makefile
+++ b/tools/objtool/Makefile
@@ -52,6 +52,9 @@ $(OBJTOOL): $(LIBSUBCMD) $(OBJTOOL_IN)
52 diff -I'^#include' arch/x86/insn/inat.h ../../arch/x86/include/asm/inat.h >/dev/null && \ 52 diff -I'^#include' arch/x86/insn/inat.h ../../arch/x86/include/asm/inat.h >/dev/null && \
53 diff -I'^#include' arch/x86/insn/inat_types.h ../../arch/x86/include/asm/inat_types.h >/dev/null) \ 53 diff -I'^#include' arch/x86/insn/inat_types.h ../../arch/x86/include/asm/inat_types.h >/dev/null) \
54 || echo "warning: objtool: x86 instruction decoder differs from kernel" >&2 )) || true 54 || echo "warning: objtool: x86 instruction decoder differs from kernel" >&2 )) || true
55 @(test -d ../../kernel -a -d ../../tools -a -d ../objtool && (( \
56 diff ../../arch/x86/include/asm/orc_types.h orc_types.h >/dev/null) \
57 || echo "warning: objtool: orc_types.h differs from kernel" >&2 )) || true
55 $(QUIET_LINK)$(CC) $(OBJTOOL_IN) $(LDFLAGS) -o $@ 58 $(QUIET_LINK)$(CC) $(OBJTOOL_IN) $(LDFLAGS) -o $@
56 59
57 60
diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c
index 365c34ecab26..57254f5b2779 100644
--- a/tools/objtool/builtin-check.c
+++ b/tools/objtool/builtin-check.c
@@ -29,7 +29,7 @@
29#include "builtin.h" 29#include "builtin.h"
30#include "check.h" 30#include "check.h"
31 31
32bool nofp; 32bool no_fp, no_unreachable;
33 33
34static const char * const check_usage[] = { 34static const char * const check_usage[] = {
35 "objtool check [<options>] file.o", 35 "objtool check [<options>] file.o",
@@ -37,7 +37,8 @@ static const char * const check_usage[] = {
37}; 37};
38 38
39const struct option check_options[] = { 39const struct option check_options[] = {
40 OPT_BOOLEAN('f', "no-fp", &nofp, "Skip frame pointer validation"), 40 OPT_BOOLEAN('f', "no-fp", &no_fp, "Skip frame pointer validation"),
41 OPT_BOOLEAN('u', "no-unreachable", &no_unreachable, "Skip 'unreachable instruction' warnings"),
41 OPT_END(), 42 OPT_END(),
42}; 43};
43 44
@@ -52,5 +53,5 @@ int cmd_check(int argc, const char **argv)
52 53
53 objname = argv[0]; 54 objname = argv[0];
54 55
55 return check(objname, nofp); 56 return check(objname, no_fp, no_unreachable, false);
56} 57}
diff --git a/tools/objtool/builtin-orc.c b/tools/objtool/builtin-orc.c
new file mode 100644
index 000000000000..4c6b5c9ef073
--- /dev/null
+++ b/tools/objtool/builtin-orc.c
@@ -0,0 +1,70 @@
1/*
2 * Copyright (C) 2017 Josh Poimboeuf <jpoimboe@redhat.com>
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version 2
7 * of the License, or (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, see <http://www.gnu.org/licenses/>.
16 */
17
18/*
19 * objtool orc:
20 *
21 * This command analyzes a .o file and adds .orc_unwind and .orc_unwind_ip
 22 * sections to it, which are used by the in-kernel ORC unwinder.
23 *
24 * This command is a superset of "objtool check".
25 */
26
27#include <string.h>
28#include <subcmd/parse-options.h>
29#include "builtin.h"
30#include "check.h"
31
32
33static const char *orc_usage[] = {
34 "objtool orc generate [<options>] file.o",
35 "objtool orc dump file.o",
36 NULL,
37};
38
39extern const struct option check_options[];
40extern bool no_fp, no_unreachable;
41
42int cmd_orc(int argc, const char **argv)
43{
44 const char *objname;
45
46 argc--; argv++;
47 if (!strncmp(argv[0], "gen", 3)) {
48 argc = parse_options(argc, argv, check_options, orc_usage, 0);
49 if (argc != 1)
50 usage_with_options(orc_usage, check_options);
51
52 objname = argv[0];
53
54 return check(objname, no_fp, no_unreachable, true);
55
56 }
57
58 if (!strcmp(argv[0], "dump")) {
59 if (argc != 2)
60 usage_with_options(orc_usage, check_options);
61
62 objname = argv[1];
63
64 return orc_dump(objname);
65 }
66
67 usage_with_options(orc_usage, check_options);
68
69 return 0;
70}
diff --git a/tools/objtool/builtin.h b/tools/objtool/builtin.h
index 34d2ba78a616..dd526067fed5 100644
--- a/tools/objtool/builtin.h
+++ b/tools/objtool/builtin.h
@@ -18,5 +18,6 @@
18#define _BUILTIN_H 18#define _BUILTIN_H
19 19
20extern int cmd_check(int argc, const char **argv); 20extern int cmd_check(int argc, const char **argv);
21extern int cmd_orc(int argc, const char **argv);
21 22
22#endif /* _BUILTIN_H */ 23#endif /* _BUILTIN_H */
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 2c6d74880403..3436a942b606 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -33,11 +33,11 @@ struct alternative {
33}; 33};
34 34
35const char *objname; 35const char *objname;
36static bool nofp; 36static bool no_fp;
37struct cfi_state initial_func_cfi; 37struct cfi_state initial_func_cfi;
38 38
39static struct instruction *find_insn(struct objtool_file *file, 39struct instruction *find_insn(struct objtool_file *file,
40 struct section *sec, unsigned long offset) 40 struct section *sec, unsigned long offset)
41{ 41{
42 struct instruction *insn; 42 struct instruction *insn;
43 43
@@ -59,19 +59,6 @@ static struct instruction *next_insn_same_sec(struct objtool_file *file,
59 return next; 59 return next;
60} 60}
61 61
62static bool gcov_enabled(struct objtool_file *file)
63{
64 struct section *sec;
65 struct symbol *sym;
66
67 for_each_sec(file, sec)
68 list_for_each_entry(sym, &sec->symbol_list, list)
69 if (!strncmp(sym->name, "__gcov_.", 8))
70 return true;
71
72 return false;
73}
74
75#define func_for_each_insn(file, func, insn) \ 62#define func_for_each_insn(file, func, insn) \
76 for (insn = find_insn(file, func->sec, func->offset); \ 63 for (insn = find_insn(file, func->sec, func->offset); \
77 insn && &insn->list != &file->insn_list && \ 64 insn && &insn->list != &file->insn_list && \
@@ -100,7 +87,6 @@ static bool gcov_enabled(struct objtool_file *file)
100static bool ignore_func(struct objtool_file *file, struct symbol *func) 87static bool ignore_func(struct objtool_file *file, struct symbol *func)
101{ 88{
102 struct rela *rela; 89 struct rela *rela;
103 struct instruction *insn;
104 90
105 /* check for STACK_FRAME_NON_STANDARD */ 91 /* check for STACK_FRAME_NON_STANDARD */
106 if (file->whitelist && file->whitelist->rela) 92 if (file->whitelist && file->whitelist->rela)
@@ -113,11 +99,6 @@ static bool ignore_func(struct objtool_file *file, struct symbol *func)
113 return true; 99 return true;
114 } 100 }
115 101
116 /* check if it has a context switching instruction */
117 func_for_each_insn(file, func, insn)
118 if (insn->type == INSN_CONTEXT_SWITCH)
119 return true;
120
121 return false; 102 return false;
122} 103}
123 104
@@ -259,6 +240,11 @@ static int decode_instructions(struct objtool_file *file)
259 if (!(sec->sh.sh_flags & SHF_EXECINSTR)) 240 if (!(sec->sh.sh_flags & SHF_EXECINSTR))
260 continue; 241 continue;
261 242
243 if (strcmp(sec->name, ".altinstr_replacement") &&
244 strcmp(sec->name, ".altinstr_aux") &&
245 strncmp(sec->name, ".discard.", 9))
246 sec->text = true;
247
262 for (offset = 0; offset < sec->len; offset += insn->len) { 248 for (offset = 0; offset < sec->len; offset += insn->len) {
263 insn = malloc(sizeof(*insn)); 249 insn = malloc(sizeof(*insn));
264 if (!insn) { 250 if (!insn) {
@@ -874,6 +860,99 @@ static int add_switch_table_alts(struct objtool_file *file)
874 return 0; 860 return 0;
875} 861}
876 862
863static int read_unwind_hints(struct objtool_file *file)
864{
865 struct section *sec, *relasec;
866 struct rela *rela;
867 struct unwind_hint *hint;
868 struct instruction *insn;
869 struct cfi_reg *cfa;
870 int i;
871
872 sec = find_section_by_name(file->elf, ".discard.unwind_hints");
873 if (!sec)
874 return 0;
875
876 relasec = sec->rela;
877 if (!relasec) {
878 WARN("missing .rela.discard.unwind_hints section");
879 return -1;
880 }
881
882 if (sec->len % sizeof(struct unwind_hint)) {
883 WARN("struct unwind_hint size mismatch");
884 return -1;
885 }
886
887 file->hints = true;
888
889 for (i = 0; i < sec->len / sizeof(struct unwind_hint); i++) {
890 hint = (struct unwind_hint *)sec->data->d_buf + i;
891
892 rela = find_rela_by_dest(sec, i * sizeof(*hint));
893 if (!rela) {
894 WARN("can't find rela for unwind_hints[%d]", i);
895 return -1;
896 }
897
898 insn = find_insn(file, rela->sym->sec, rela->addend);
899 if (!insn) {
900 WARN("can't find insn for unwind_hints[%d]", i);
901 return -1;
902 }
903
904 cfa = &insn->state.cfa;
905
906 if (hint->type == UNWIND_HINT_TYPE_SAVE) {
907 insn->save = true;
908 continue;
909
910 } else if (hint->type == UNWIND_HINT_TYPE_RESTORE) {
911 insn->restore = true;
912 insn->hint = true;
913 continue;
914 }
915
916 insn->hint = true;
917
918 switch (hint->sp_reg) {
919 case ORC_REG_UNDEFINED:
920 cfa->base = CFI_UNDEFINED;
921 break;
922 case ORC_REG_SP:
923 cfa->base = CFI_SP;
924 break;
925 case ORC_REG_BP:
926 cfa->base = CFI_BP;
927 break;
928 case ORC_REG_SP_INDIRECT:
929 cfa->base = CFI_SP_INDIRECT;
930 break;
931 case ORC_REG_R10:
932 cfa->base = CFI_R10;
933 break;
934 case ORC_REG_R13:
935 cfa->base = CFI_R13;
936 break;
937 case ORC_REG_DI:
938 cfa->base = CFI_DI;
939 break;
940 case ORC_REG_DX:
941 cfa->base = CFI_DX;
942 break;
943 default:
944 WARN_FUNC("unsupported unwind_hint sp base reg %d",
945 insn->sec, insn->offset, hint->sp_reg);
946 return -1;
947 }
948
949 cfa->offset = hint->sp_offset;
950 insn->state.type = hint->type;
951 }
952
953 return 0;
954}
955
877static int decode_sections(struct objtool_file *file) 956static int decode_sections(struct objtool_file *file)
878{ 957{
879 int ret; 958 int ret;
@@ -904,6 +983,10 @@ static int decode_sections(struct objtool_file *file)
904 if (ret) 983 if (ret)
905 return ret; 984 return ret;
906 985
986 ret = read_unwind_hints(file);
987 if (ret)
988 return ret;
989
907 return 0; 990 return 0;
908} 991}
909 992
@@ -947,6 +1030,30 @@ static bool has_valid_stack_frame(struct insn_state *state)
947 return false; 1030 return false;
948} 1031}
949 1032
1033static int update_insn_state_regs(struct instruction *insn, struct insn_state *state)
1034{
1035 struct cfi_reg *cfa = &state->cfa;
1036 struct stack_op *op = &insn->stack_op;
1037
1038 if (cfa->base != CFI_SP)
1039 return 0;
1040
1041 /* push */
1042 if (op->dest.type == OP_DEST_PUSH)
1043 cfa->offset += 8;
1044
1045 /* pop */
1046 if (op->src.type == OP_SRC_POP)
1047 cfa->offset -= 8;
1048
1049 /* add immediate to sp */
1050 if (op->dest.type == OP_DEST_REG && op->src.type == OP_SRC_ADD &&
1051 op->dest.reg == CFI_SP && op->src.reg == CFI_SP)
1052 cfa->offset -= op->src.offset;
1053
1054 return 0;
1055}
1056
950static void save_reg(struct insn_state *state, unsigned char reg, int base, 1057static void save_reg(struct insn_state *state, unsigned char reg, int base,
951 int offset) 1058 int offset)
952{ 1059{
@@ -1032,6 +1139,9 @@ static int update_insn_state(struct instruction *insn, struct insn_state *state)
1032 return 0; 1139 return 0;
1033 } 1140 }
1034 1141
1142 if (state->type == ORC_TYPE_REGS || state->type == ORC_TYPE_REGS_IRET)
1143 return update_insn_state_regs(insn, state);
1144
1035 switch (op->dest.type) { 1145 switch (op->dest.type) {
1036 1146
1037 case OP_DEST_REG: 1147 case OP_DEST_REG:
@@ -1051,7 +1161,7 @@ static int update_insn_state(struct instruction *insn, struct insn_state *state)
1051 regs[CFI_BP].base = CFI_BP; 1161 regs[CFI_BP].base = CFI_BP;
1052 regs[CFI_BP].offset = -state->stack_size; 1162 regs[CFI_BP].offset = -state->stack_size;
1053 state->bp_scratch = false; 1163 state->bp_scratch = false;
1054 } else if (!nofp) { 1164 } else if (!no_fp) {
1055 1165
1056 WARN_FUNC("unknown stack-related register move", 1166 WARN_FUNC("unknown stack-related register move",
1057 insn->sec, insn->offset); 1167 insn->sec, insn->offset);
@@ -1222,7 +1332,7 @@ static int update_insn_state(struct instruction *insn, struct insn_state *state)
1222 } 1332 }
1223 1333
1224 /* detect when asm code uses rbp as a scratch register */ 1334 /* detect when asm code uses rbp as a scratch register */
1225 if (!nofp && insn->func && op->src.reg == CFI_BP && 1335 if (!no_fp && insn->func && op->src.reg == CFI_BP &&
1226 cfa->base != CFI_BP) 1336 cfa->base != CFI_BP)
1227 state->bp_scratch = true; 1337 state->bp_scratch = true;
1228 break; 1338 break;
@@ -1323,6 +1433,10 @@ static bool insn_state_match(struct instruction *insn, struct insn_state *state)
1323 break; 1433 break;
1324 } 1434 }
1325 1435
1436 } else if (state1->type != state2->type) {
1437 WARN_FUNC("stack state mismatch: type1=%d type2=%d",
1438 insn->sec, insn->offset, state1->type, state2->type);
1439
1326 } else if (state1->drap != state2->drap || 1440 } else if (state1->drap != state2->drap ||
1327 (state1->drap && state1->drap_reg != state2->drap_reg)) { 1441 (state1->drap && state1->drap_reg != state2->drap_reg)) {
1328 WARN_FUNC("stack state mismatch: drap1=%d(%d) drap2=%d(%d)", 1442 WARN_FUNC("stack state mismatch: drap1=%d(%d) drap2=%d(%d)",
@@ -1346,7 +1460,7 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
1346 struct insn_state state) 1460 struct insn_state state)
1347{ 1461{
1348 struct alternative *alt; 1462 struct alternative *alt;
1349 struct instruction *insn; 1463 struct instruction *insn, *next_insn;
1350 struct section *sec; 1464 struct section *sec;
1351 struct symbol *func = NULL; 1465 struct symbol *func = NULL;
1352 int ret; 1466 int ret;
@@ -1361,6 +1475,8 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
1361 } 1475 }
1362 1476
1363 while (1) { 1477 while (1) {
1478 next_insn = next_insn_same_sec(file, insn);
1479
1364 if (file->c_file && insn->func) { 1480 if (file->c_file && insn->func) {
1365 if (func && func != insn->func) { 1481 if (func && func != insn->func) {
1366 WARN("%s() falls through to next function %s()", 1482 WARN("%s() falls through to next function %s()",
@@ -1378,13 +1494,54 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
1378 } 1494 }
1379 1495
1380 if (insn->visited) { 1496 if (insn->visited) {
1381 if (!!insn_state_match(insn, &state)) 1497 if (!insn->hint && !insn_state_match(insn, &state))
1382 return 1; 1498 return 1;
1383 1499
1384 return 0; 1500 return 0;
1385 } 1501 }
1386 1502
1387 insn->state = state; 1503 if (insn->hint) {
1504 if (insn->restore) {
1505 struct instruction *save_insn, *i;
1506
1507 i = insn;
1508 save_insn = NULL;
1509 func_for_each_insn_continue_reverse(file, func, i) {
1510 if (i->save) {
1511 save_insn = i;
1512 break;
1513 }
1514 }
1515
1516 if (!save_insn) {
1517 WARN_FUNC("no corresponding CFI save for CFI restore",
1518 sec, insn->offset);
1519 return 1;
1520 }
1521
1522 if (!save_insn->visited) {
1523 /*
1524 * Oops, no state to copy yet.
1525 * Hopefully we can reach this
1526 * instruction from another branch
1527 * after the save insn has been
1528 * visited.
1529 */
1530 if (insn == first)
1531 return 0;
1532
1533 WARN_FUNC("objtool isn't smart enough to handle this CFI save/restore combo",
1534 sec, insn->offset);
1535 return 1;
1536 }
1537
1538 insn->state = save_insn->state;
1539 }
1540
1541 state = insn->state;
1542
1543 } else
1544 insn->state = state;
1388 1545
1389 insn->visited = true; 1546 insn->visited = true;
1390 1547
@@ -1423,7 +1580,7 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
1423 1580
1424 /* fallthrough */ 1581 /* fallthrough */
1425 case INSN_CALL_DYNAMIC: 1582 case INSN_CALL_DYNAMIC:
1426 if (!nofp && func && !has_valid_stack_frame(&state)) { 1583 if (!no_fp && func && !has_valid_stack_frame(&state)) {
1427 WARN_FUNC("call without frame pointer save/setup", 1584 WARN_FUNC("call without frame pointer save/setup",
1428 sec, insn->offset); 1585 sec, insn->offset);
1429 return 1; 1586 return 1;
@@ -1461,6 +1618,14 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
1461 1618
1462 return 0; 1619 return 0;
1463 1620
1621 case INSN_CONTEXT_SWITCH:
1622 if (func && (!next_insn || !next_insn->hint)) {
1623 WARN_FUNC("unsupported instruction in callable function",
1624 sec, insn->offset);
1625 return 1;
1626 }
1627 return 0;
1628
1464 case INSN_STACK: 1629 case INSN_STACK:
1465 if (update_insn_state(insn, &state)) 1630 if (update_insn_state(insn, &state))
1466 return -1; 1631 return -1;
@@ -1474,7 +1639,7 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
1474 if (insn->dead_end) 1639 if (insn->dead_end)
1475 return 0; 1640 return 0;
1476 1641
1477 insn = next_insn_same_sec(file, insn); 1642 insn = next_insn;
1478 if (!insn) { 1643 if (!insn) {
1479 WARN("%s: unexpected end of section", sec->name); 1644 WARN("%s: unexpected end of section", sec->name);
1480 return 1; 1645 return 1;
@@ -1484,6 +1649,27 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
1484 return 0; 1649 return 0;
1485} 1650}
1486 1651
1652static int validate_unwind_hints(struct objtool_file *file)
1653{
1654 struct instruction *insn;
1655 int ret, warnings = 0;
1656 struct insn_state state;
1657
1658 if (!file->hints)
1659 return 0;
1660
1661 clear_insn_state(&state);
1662
1663 for_each_insn(file, insn) {
1664 if (insn->hint && !insn->visited) {
1665 ret = validate_branch(file, insn, state);
1666 warnings += ret;
1667 }
1668 }
1669
1670 return warnings;
1671}
1672
1487static bool is_kasan_insn(struct instruction *insn) 1673static bool is_kasan_insn(struct instruction *insn)
1488{ 1674{
1489 return (insn->type == INSN_CALL && 1675 return (insn->type == INSN_CALL &&
@@ -1580,15 +1766,6 @@ static int validate_reachable_instructions(struct objtool_file *file)
1580 if (insn->visited || ignore_unreachable_insn(insn)) 1766 if (insn->visited || ignore_unreachable_insn(insn))
1581 continue; 1767 continue;
1582 1768
1583 /*
1584 * gcov produces a lot of unreachable instructions. If we get
1585 * an unreachable warning and the file has gcov enabled, just
1586 * ignore it, and all other such warnings for the file. Do
1587 * this here because this is an expensive function.
1588 */
1589 if (gcov_enabled(file))
1590 return 0;
1591
1592 WARN_FUNC("unreachable instruction", insn->sec, insn->offset); 1769 WARN_FUNC("unreachable instruction", insn->sec, insn->offset);
1593 return 1; 1770 return 1;
1594 } 1771 }
@@ -1613,15 +1790,15 @@ static void cleanup(struct objtool_file *file)
1613 elf_close(file->elf); 1790 elf_close(file->elf);
1614} 1791}
1615 1792
1616int check(const char *_objname, bool _nofp) 1793int check(const char *_objname, bool _no_fp, bool no_unreachable, bool orc)
1617{ 1794{
1618 struct objtool_file file; 1795 struct objtool_file file;
1619 int ret, warnings = 0; 1796 int ret, warnings = 0;
1620 1797
1621 objname = _objname; 1798 objname = _objname;
1622 nofp = _nofp; 1799 no_fp = _no_fp;
1623 1800
1624 file.elf = elf_open(objname); 1801 file.elf = elf_open(objname, orc ? O_RDWR : O_RDONLY);
1625 if (!file.elf) 1802 if (!file.elf)
1626 return 1; 1803 return 1;
1627 1804
@@ -1629,8 +1806,9 @@ int check(const char *_objname, bool _nofp)
1629 hash_init(file.insn_hash); 1806 hash_init(file.insn_hash);
1630 file.whitelist = find_section_by_name(file.elf, ".discard.func_stack_frame_non_standard"); 1807 file.whitelist = find_section_by_name(file.elf, ".discard.func_stack_frame_non_standard");
1631 file.rodata = find_section_by_name(file.elf, ".rodata"); 1808 file.rodata = find_section_by_name(file.elf, ".rodata");
1632 file.ignore_unreachables = false;
1633 file.c_file = find_section_by_name(file.elf, ".comment"); 1809 file.c_file = find_section_by_name(file.elf, ".comment");
1810 file.ignore_unreachables = no_unreachable;
1811 file.hints = false;
1634 1812
1635 arch_initial_func_cfi_state(&initial_func_cfi); 1813 arch_initial_func_cfi_state(&initial_func_cfi);
1636 1814
@@ -1647,6 +1825,11 @@ int check(const char *_objname, bool _nofp)
1647 goto out; 1825 goto out;
1648 warnings += ret; 1826 warnings += ret;
1649 1827
1828 ret = validate_unwind_hints(&file);
1829 if (ret < 0)
1830 goto out;
1831 warnings += ret;
1832
1650 if (!warnings) { 1833 if (!warnings) {
1651 ret = validate_reachable_instructions(&file); 1834 ret = validate_reachable_instructions(&file);
1652 if (ret < 0) 1835 if (ret < 0)
@@ -1654,6 +1837,20 @@ int check(const char *_objname, bool _nofp)
1654 warnings += ret; 1837 warnings += ret;
1655 } 1838 }
1656 1839
1840 if (orc) {
1841 ret = create_orc(&file);
1842 if (ret < 0)
1843 goto out;
1844
1845 ret = create_orc_sections(&file);
1846 if (ret < 0)
1847 goto out;
1848
1849 ret = elf_write(file.elf);
1850 if (ret < 0)
1851 goto out;
1852 }
1853
1657out: 1854out:
1658 cleanup(&file); 1855 cleanup(&file);
1659 1856
diff --git a/tools/objtool/check.h b/tools/objtool/check.h
index da85f5b00ec6..c9af11f0c8af 100644
--- a/tools/objtool/check.h
+++ b/tools/objtool/check.h
@@ -22,12 +22,14 @@
22#include "elf.h" 22#include "elf.h"
23#include "cfi.h" 23#include "cfi.h"
24#include "arch.h" 24#include "arch.h"
25#include "orc.h"
25#include <linux/hashtable.h> 26#include <linux/hashtable.h>
26 27
27struct insn_state { 28struct insn_state {
28 struct cfi_reg cfa; 29 struct cfi_reg cfa;
29 struct cfi_reg regs[CFI_NUM_REGS]; 30 struct cfi_reg regs[CFI_NUM_REGS];
30 int stack_size; 31 int stack_size;
32 unsigned char type;
31 bool bp_scratch; 33 bool bp_scratch;
32 bool drap; 34 bool drap;
33 int drap_reg; 35 int drap_reg;
@@ -41,13 +43,14 @@ struct instruction {
41 unsigned int len; 43 unsigned int len;
42 unsigned char type; 44 unsigned char type;
43 unsigned long immediate; 45 unsigned long immediate;
44 bool alt_group, visited, dead_end, ignore; 46 bool alt_group, visited, dead_end, ignore, hint, save, restore;
45 struct symbol *call_dest; 47 struct symbol *call_dest;
46 struct instruction *jump_dest; 48 struct instruction *jump_dest;
47 struct list_head alts; 49 struct list_head alts;
48 struct symbol *func; 50 struct symbol *func;
49 struct stack_op stack_op; 51 struct stack_op stack_op;
50 struct insn_state state; 52 struct insn_state state;
53 struct orc_entry orc;
51}; 54};
52 55
53struct objtool_file { 56struct objtool_file {
@@ -55,12 +58,22 @@ struct objtool_file {
55 struct list_head insn_list; 58 struct list_head insn_list;
56 DECLARE_HASHTABLE(insn_hash, 16); 59 DECLARE_HASHTABLE(insn_hash, 16);
57 struct section *rodata, *whitelist; 60 struct section *rodata, *whitelist;
58 bool ignore_unreachables, c_file; 61 bool ignore_unreachables, c_file, hints;
59}; 62};
60 63
61int check(const char *objname, bool nofp); 64int check(const char *objname, bool no_fp, bool no_unreachable, bool orc);
65
66struct instruction *find_insn(struct objtool_file *file,
67 struct section *sec, unsigned long offset);
62 68
63#define for_each_insn(file, insn) \ 69#define for_each_insn(file, insn) \
64 list_for_each_entry(insn, &file->insn_list, list) 70 list_for_each_entry(insn, &file->insn_list, list)
65 71
72#define sec_for_each_insn(file, sec, insn) \
73 for (insn = find_insn(file, sec, 0); \
74 insn && &insn->list != &file->insn_list && \
75 insn->sec == sec; \
76 insn = list_next_entry(insn, list))
77
78
66#endif /* _CHECK_H */ 79#endif /* _CHECK_H */
diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c
index 1a7e8aa2af58..6e9f980a7d26 100644
--- a/tools/objtool/elf.c
+++ b/tools/objtool/elf.c
@@ -30,16 +30,6 @@
30#include "elf.h" 30#include "elf.h"
31#include "warn.h" 31#include "warn.h"
32 32
33/*
34 * Fallback for systems without this "read, mmaping if possible" cmd.
35 */
36#ifndef ELF_C_READ_MMAP
37#define ELF_C_READ_MMAP ELF_C_READ
38#endif
39
40#define WARN_ELF(format, ...) \
41 WARN(format ": %s", ##__VA_ARGS__, elf_errmsg(-1))
42
43struct section *find_section_by_name(struct elf *elf, const char *name) 33struct section *find_section_by_name(struct elf *elf, const char *name)
44{ 34{
45 struct section *sec; 35 struct section *sec;
@@ -349,9 +339,10 @@ static int read_relas(struct elf *elf)
349 return 0; 339 return 0;
350} 340}
351 341
352struct elf *elf_open(const char *name) 342struct elf *elf_open(const char *name, int flags)
353{ 343{
354 struct elf *elf; 344 struct elf *elf;
345 Elf_Cmd cmd;
355 346
356 elf_version(EV_CURRENT); 347 elf_version(EV_CURRENT);
357 348
@@ -364,13 +355,20 @@ struct elf *elf_open(const char *name)
364 355
365 INIT_LIST_HEAD(&elf->sections); 356 INIT_LIST_HEAD(&elf->sections);
366 357
367 elf->fd = open(name, O_RDONLY); 358 elf->fd = open(name, flags);
368 if (elf->fd == -1) { 359 if (elf->fd == -1) {
369 perror("open"); 360 perror("open");
370 goto err; 361 goto err;
371 } 362 }
372 363
373 elf->elf = elf_begin(elf->fd, ELF_C_READ_MMAP, NULL); 364 if ((flags & O_ACCMODE) == O_RDONLY)
365 cmd = ELF_C_READ_MMAP;
366 else if ((flags & O_ACCMODE) == O_RDWR)
367 cmd = ELF_C_RDWR;
368 else /* O_WRONLY */
369 cmd = ELF_C_WRITE;
370
371 elf->elf = elf_begin(elf->fd, cmd, NULL);
374 if (!elf->elf) { 372 if (!elf->elf) {
375 WARN_ELF("elf_begin"); 373 WARN_ELF("elf_begin");
376 goto err; 374 goto err;
@@ -397,6 +395,194 @@ err:
397 return NULL; 395 return NULL;
398} 396}
399 397
398struct section *elf_create_section(struct elf *elf, const char *name,
399 size_t entsize, int nr)
400{
401 struct section *sec, *shstrtab;
402 size_t size = entsize * nr;
403 struct Elf_Scn *s;
404 Elf_Data *data;
405
406 sec = malloc(sizeof(*sec));
407 if (!sec) {
408 perror("malloc");
409 return NULL;
410 }
411 memset(sec, 0, sizeof(*sec));
412
413 INIT_LIST_HEAD(&sec->symbol_list);
414 INIT_LIST_HEAD(&sec->rela_list);
415 hash_init(sec->rela_hash);
416 hash_init(sec->symbol_hash);
417
418 list_add_tail(&sec->list, &elf->sections);
419
420 s = elf_newscn(elf->elf);
421 if (!s) {
422 WARN_ELF("elf_newscn");
423 return NULL;
424 }
425
426 sec->name = strdup(name);
427 if (!sec->name) {
428 perror("strdup");
429 return NULL;
430 }
431
432 sec->idx = elf_ndxscn(s);
433 sec->len = size;
434 sec->changed = true;
435
436 sec->data = elf_newdata(s);
437 if (!sec->data) {
438 WARN_ELF("elf_newdata");
439 return NULL;
440 }
441
442 sec->data->d_size = size;
443 sec->data->d_align = 1;
444
445 if (size) {
446 sec->data->d_buf = malloc(size);
447 if (!sec->data->d_buf) {
448 perror("malloc");
449 return NULL;
450 }
451 memset(sec->data->d_buf, 0, size);
452 }
453
454 if (!gelf_getshdr(s, &sec->sh)) {
455 WARN_ELF("gelf_getshdr");
456 return NULL;
457 }
458
459 sec->sh.sh_size = size;
460 sec->sh.sh_entsize = entsize;
461 sec->sh.sh_type = SHT_PROGBITS;
462 sec->sh.sh_addralign = 1;
463 sec->sh.sh_flags = SHF_ALLOC;
464
465
466 /* Add section name to .shstrtab */
467 shstrtab = find_section_by_name(elf, ".shstrtab");
468 if (!shstrtab) {
469 WARN("can't find .shstrtab section");
470 return NULL;
471 }
472
473 s = elf_getscn(elf->elf, shstrtab->idx);
474 if (!s) {
475 WARN_ELF("elf_getscn");
476 return NULL;
477 }
478
479 data = elf_newdata(s);
480 if (!data) {
481 WARN_ELF("elf_newdata");
482 return NULL;
483 }
484
485 data->d_buf = sec->name;
486 data->d_size = strlen(name) + 1;
487 data->d_align = 1;
488
489 sec->sh.sh_name = shstrtab->len;
490
491 shstrtab->len += strlen(name) + 1;
492 shstrtab->changed = true;
493
494 return sec;
495}
496
497struct section *elf_create_rela_section(struct elf *elf, struct section *base)
498{
499 char *relaname;
500 struct section *sec;
501
502 relaname = malloc(strlen(base->name) + strlen(".rela") + 1);
503 if (!relaname) {
504 perror("malloc");
505 return NULL;
506 }
507 strcpy(relaname, ".rela");
508 strcat(relaname, base->name);
509
510 sec = elf_create_section(elf, relaname, sizeof(GElf_Rela), 0);
511 if (!sec)
512 return NULL;
513
514 base->rela = sec;
515 sec->base = base;
516
517 sec->sh.sh_type = SHT_RELA;
518 sec->sh.sh_addralign = 8;
519 sec->sh.sh_link = find_section_by_name(elf, ".symtab")->idx;
520 sec->sh.sh_info = base->idx;
521 sec->sh.sh_flags = SHF_INFO_LINK;
522
523 return sec;
524}
525
526int elf_rebuild_rela_section(struct section *sec)
527{
528 struct rela *rela;
529 int nr, idx = 0, size;
530 GElf_Rela *relas;
531
532 nr = 0;
533 list_for_each_entry(rela, &sec->rela_list, list)
534 nr++;
535
536 size = nr * sizeof(*relas);
537 relas = malloc(size);
538 if (!relas) {
539 perror("malloc");
540 return -1;
541 }
542
543 sec->data->d_buf = relas;
544 sec->data->d_size = size;
545
546 sec->sh.sh_size = size;
547
548 idx = 0;
549 list_for_each_entry(rela, &sec->rela_list, list) {
550 relas[idx].r_offset = rela->offset;
551 relas[idx].r_addend = rela->addend;
552 relas[idx].r_info = GELF_R_INFO(rela->sym->idx, rela->type);
553 idx++;
554 }
555
556 return 0;
557}
558
559int elf_write(struct elf *elf)
560{
561 struct section *sec;
562 Elf_Scn *s;
563
564 list_for_each_entry(sec, &elf->sections, list) {
565 if (sec->changed) {
566 s = elf_getscn(elf->elf, sec->idx);
567 if (!s) {
568 WARN_ELF("elf_getscn");
569 return -1;
570 }
571 if (!gelf_update_shdr (s, &sec->sh)) {
572 WARN_ELF("gelf_update_shdr");
573 return -1;
574 }
575 }
576 }
577
578 if (elf_update(elf->elf, ELF_C_WRITE) < 0) {
579 WARN_ELF("elf_update");
580 return -1;
581 }
582
583 return 0;
584}
585
400void elf_close(struct elf *elf) 586void elf_close(struct elf *elf)
401{ 587{
402 struct section *sec, *tmpsec; 588 struct section *sec, *tmpsec;
diff --git a/tools/objtool/elf.h b/tools/objtool/elf.h
index 343968b778cb..d86e2ff14466 100644
--- a/tools/objtool/elf.h
+++ b/tools/objtool/elf.h
@@ -28,6 +28,13 @@
28# define elf_getshdrstrndx elf_getshstrndx 28# define elf_getshdrstrndx elf_getshstrndx
29#endif 29#endif
30 30
31/*
32 * Fallback for systems without this "read, mmaping if possible" cmd.
33 */
34#ifndef ELF_C_READ_MMAP
35#define ELF_C_READ_MMAP ELF_C_READ
36#endif
37
31struct section { 38struct section {
32 struct list_head list; 39 struct list_head list;
33 GElf_Shdr sh; 40 GElf_Shdr sh;
@@ -41,6 +48,7 @@ struct section {
41 char *name; 48 char *name;
42 int idx; 49 int idx;
43 unsigned int len; 50 unsigned int len;
51 bool changed, text;
44}; 52};
45 53
46struct symbol { 54struct symbol {
@@ -75,7 +83,7 @@ struct elf {
75}; 83};
76 84
77 85
78struct elf *elf_open(const char *name); 86struct elf *elf_open(const char *name, int flags);
79struct section *find_section_by_name(struct elf *elf, const char *name); 87struct section *find_section_by_name(struct elf *elf, const char *name);
80struct symbol *find_symbol_by_offset(struct section *sec, unsigned long offset); 88struct symbol *find_symbol_by_offset(struct section *sec, unsigned long offset);
81struct symbol *find_symbol_containing(struct section *sec, unsigned long offset); 89struct symbol *find_symbol_containing(struct section *sec, unsigned long offset);
@@ -83,6 +91,11 @@ struct rela *find_rela_by_dest(struct section *sec, unsigned long offset);
83struct rela *find_rela_by_dest_range(struct section *sec, unsigned long offset, 91struct rela *find_rela_by_dest_range(struct section *sec, unsigned long offset,
84 unsigned int len); 92 unsigned int len);
85struct symbol *find_containing_func(struct section *sec, unsigned long offset); 93struct symbol *find_containing_func(struct section *sec, unsigned long offset);
94struct section *elf_create_section(struct elf *elf, const char *name, size_t
95 entsize, int nr);
96struct section *elf_create_rela_section(struct elf *elf, struct section *base);
97int elf_rebuild_rela_section(struct section *sec);
98int elf_write(struct elf *elf);
86void elf_close(struct elf *elf); 99void elf_close(struct elf *elf);
87 100
88#define for_each_sec(file, sec) \ 101#define for_each_sec(file, sec) \
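
The four helpers declared above make up objtool's new ELF write path, and elf_open() now takes the open(2) flags so callers can request a writable descriptor (O_RDWR is mapped to ELF_C_RDWR in the elf.c hunk earlier). A minimal sketch of the intended call order, using the patch's own API but with an illustrative caller and section name that are not part of the patch:

	/* Sketch only -- illustrative caller, not part of this patch. */
	#include <fcntl.h>
	#include "elf.h"

	static int add_int_table(const char *objname, int nr)
	{
		struct elf *elf;
		struct section *sec, *relasec;

		elf = elf_open(objname, O_RDWR);
		if (!elf)
			return -1;

		sec = elf_create_section(elf, ".my_table", sizeof(int), nr);
		if (!sec)
			return -1;

		relasec = elf_create_rela_section(elf, sec);	/* ".rela.my_table" */
		if (!relasec)
			return -1;

		/* ... append struct rela entries to relasec->rela_list ... */

		if (elf_rebuild_rela_section(relasec))	/* serialize the list into d_buf */
			return -1;

		return elf_write(elf);			/* flush changed headers and data */
	}
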
diff --git a/tools/objtool/objtool.c b/tools/objtool/objtool.c
index ecc5b1b5d15d..31e0f9143840 100644
--- a/tools/objtool/objtool.c
+++ b/tools/objtool/objtool.c
@@ -42,10 +42,11 @@ struct cmd_struct {
42}; 42};
43 43
44static const char objtool_usage_string[] = 44static const char objtool_usage_string[] =
45 "objtool [OPTIONS] COMMAND [ARGS]"; 45 "objtool COMMAND [ARGS]";
46 46
47static struct cmd_struct objtool_cmds[] = { 47static struct cmd_struct objtool_cmds[] = {
48 {"check", cmd_check, "Perform stack metadata validation on an object file" }, 48 {"check", cmd_check, "Perform stack metadata validation on an object file" },
49 {"orc", cmd_orc, "Generate in-place ORC unwind tables for an object file" },
49}; 50};
50 51
51bool help; 52bool help;
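
The new command is dispatched to cmd_orc() in builtin-orc.c, which is outside this excerpt; judging from the create_orc*() and orc_dump() entry points added below, the expected usage is along these lines (sub-command names assumed from the builtin, not shown here):

	objtool orc generate file.o	# compute and write .orc_unwind{,_ip} into file.o
	objtool orc dump file.o		# pretty-print the tables from an annotated object
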
diff --git a/tools/objtool/orc.h b/tools/objtool/orc.h
new file mode 100644
index 000000000000..a4139e386ef3
--- /dev/null
+++ b/tools/objtool/orc.h
@@ -0,0 +1,30 @@
1/*
2 * Copyright (C) 2017 Josh Poimboeuf <jpoimboe@redhat.com>
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version 2
7 * of the License, or (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, see <http://www.gnu.org/licenses/>.
16 */
17
18#ifndef _ORC_H
19#define _ORC_H
20
21#include "orc_types.h"
22
23struct objtool_file;
24
25int create_orc(struct objtool_file *file);
26int create_orc_sections(struct objtool_file *file);
27
28int orc_dump(const char *objname);
29
30#endif /* _ORC_H */
diff --git a/tools/objtool/orc_dump.c b/tools/objtool/orc_dump.c
new file mode 100644
index 000000000000..36c5bf6a2675
--- /dev/null
+++ b/tools/objtool/orc_dump.c
@@ -0,0 +1,212 @@
1/*
2 * Copyright (C) 2017 Josh Poimboeuf <jpoimboe@redhat.com>
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version 2
7 * of the License, or (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, see <http://www.gnu.org/licenses/>.
16 */
17
18#include <unistd.h>
19#include "orc.h"
20#include "warn.h"
21
22static const char *reg_name(unsigned int reg)
23{
24 switch (reg) {
25 case ORC_REG_PREV_SP:
26 return "prevsp";
27 case ORC_REG_DX:
28 return "dx";
29 case ORC_REG_DI:
30 return "di";
31 case ORC_REG_BP:
32 return "bp";
33 case ORC_REG_SP:
34 return "sp";
35 case ORC_REG_R10:
36 return "r10";
37 case ORC_REG_R13:
38 return "r13";
39 case ORC_REG_BP_INDIRECT:
40 return "bp(ind)";
41 case ORC_REG_SP_INDIRECT:
42 return "sp(ind)";
43 default:
44 return "?";
45 }
46}
47
48static const char *orc_type_name(unsigned int type)
49{
50 switch (type) {
51 case ORC_TYPE_CALL:
52 return "call";
53 case ORC_TYPE_REGS:
54 return "regs";
55 case ORC_TYPE_REGS_IRET:
56 return "iret";
57 default:
58 return "?";
59 }
60}
61
62static void print_reg(unsigned int reg, int offset)
63{
64 if (reg == ORC_REG_BP_INDIRECT)
65 printf("(bp%+d)", offset);
66 else if (reg == ORC_REG_SP_INDIRECT)
67 printf("(sp%+d)", offset);
68 else if (reg == ORC_REG_UNDEFINED)
69 printf("(und)");
70 else
71 printf("%s%+d", reg_name(reg), offset);
72}
73
74int orc_dump(const char *_objname)
75{
76 int fd, nr_entries, i, *orc_ip = NULL, orc_size = 0;
77 struct orc_entry *orc = NULL;
78 char *name;
79 unsigned long nr_sections, orc_ip_addr = 0;
80 size_t shstrtab_idx;
81 Elf *elf;
82 Elf_Scn *scn;
83 GElf_Shdr sh;
84 GElf_Rela rela;
85 GElf_Sym sym;
86 Elf_Data *data, *symtab = NULL, *rela_orc_ip = NULL;
87
88
89 objname = _objname;
90
91 elf_version(EV_CURRENT);
92
93 fd = open(objname, O_RDONLY);
94 if (fd == -1) {
95 perror("open");
96 return -1;
97 }
98
99 elf = elf_begin(fd, ELF_C_READ_MMAP, NULL);
100 if (!elf) {
101 WARN_ELF("elf_begin");
102 return -1;
103 }
104
105 if (elf_getshdrnum(elf, &nr_sections)) {
106 WARN_ELF("elf_getshdrnum");
107 return -1;
108 }
109
110 if (elf_getshdrstrndx(elf, &shstrtab_idx)) {
111 WARN_ELF("elf_getshdrstrndx");
112 return -1;
113 }
114
115 for (i = 0; i < nr_sections; i++) {
116 scn = elf_getscn(elf, i);
117 if (!scn) {
118 WARN_ELF("elf_getscn");
119 return -1;
120 }
121
122 if (!gelf_getshdr(scn, &sh)) {
123 WARN_ELF("gelf_getshdr");
124 return -1;
125 }
126
127 name = elf_strptr(elf, shstrtab_idx, sh.sh_name);
128 if (!name) {
129 WARN_ELF("elf_strptr");
130 return -1;
131 }
132
133 data = elf_getdata(scn, NULL);
134 if (!data) {
135 WARN_ELF("elf_getdata");
136 return -1;
137 }
138
139 if (!strcmp(name, ".symtab")) {
140 symtab = data;
141 } else if (!strcmp(name, ".orc_unwind")) {
142 orc = data->d_buf;
143 orc_size = sh.sh_size;
144 } else if (!strcmp(name, ".orc_unwind_ip")) {
145 orc_ip = data->d_buf;
146 orc_ip_addr = sh.sh_addr;
147 } else if (!strcmp(name, ".rela.orc_unwind_ip")) {
148 rela_orc_ip = data;
149 }
150 }
151
152 if (!symtab || !orc || !orc_ip)
153 return 0;
154
155 if (orc_size % sizeof(*orc) != 0) {
156 WARN("bad .orc_unwind section size");
157 return -1;
158 }
159
160 nr_entries = orc_size / sizeof(*orc);
161 for (i = 0; i < nr_entries; i++) {
162 if (rela_orc_ip) {
163 if (!gelf_getrela(rela_orc_ip, i, &rela)) {
164 WARN_ELF("gelf_getrela");
165 return -1;
166 }
167
168 if (!gelf_getsym(symtab, GELF_R_SYM(rela.r_info), &sym)) {
169 WARN_ELF("gelf_getsym");
170 return -1;
171 }
172
173 scn = elf_getscn(elf, sym.st_shndx);
174 if (!scn) {
175 WARN_ELF("elf_getscn");
176 return -1;
177 }
178
179 if (!gelf_getshdr(scn, &sh)) {
180 WARN_ELF("gelf_getshdr");
181 return -1;
182 }
183
184 name = elf_strptr(elf, shstrtab_idx, sh.sh_name);
185 if (!name || !*name) {
186 WARN_ELF("elf_strptr");
187 return -1;
188 }
189
190 printf("%s+%lx:", name, rela.r_addend);
191
192 } else {
193 printf("%lx:", orc_ip_addr + (i * sizeof(int)) + orc_ip[i]);
194 }
195
196
197 printf(" sp:");
198
199 print_reg(orc[i].sp_reg, orc[i].sp_offset);
200
201 printf(" bp:");
202
203 print_reg(orc[i].bp_reg, orc[i].bp_offset);
204
205 printf(" type:%s\n", orc_type_name(orc[i].type));
206 }
207
208 elf_end(elf);
209 close(fd);
210
211 return 0;
212}
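
Given the printf() formats above, each table entry is dumped as '<section>+<addend>: sp:<base><+offset> bp:<base><+offset> type:<name>'. For an ordinary C function the output would look roughly like this (addresses and offsets illustrative):

	.text+6a: sp:sp+8 bp:(und) type:call
	.text+6f: sp:bp+16 bp:prevsp-16 type:call
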
diff --git a/tools/objtool/orc_gen.c b/tools/objtool/orc_gen.c
new file mode 100644
index 000000000000..e5ca31429c9b
--- /dev/null
+++ b/tools/objtool/orc_gen.c
@@ -0,0 +1,214 @@
1/*
2 * Copyright (C) 2017 Josh Poimboeuf <jpoimboe@redhat.com>
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version 2
7 * of the License, or (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, see <http://www.gnu.org/licenses/>.
16 */
17
18#include <stdlib.h>
19#include <string.h>
20
21#include "orc.h"
22#include "check.h"
23#include "warn.h"
24
25int create_orc(struct objtool_file *file)
26{
27 struct instruction *insn;
28
29 for_each_insn(file, insn) {
30 struct orc_entry *orc = &insn->orc;
31 struct cfi_reg *cfa = &insn->state.cfa;
32 struct cfi_reg *bp = &insn->state.regs[CFI_BP];
33
34 if (cfa->base == CFI_UNDEFINED) {
35 orc->sp_reg = ORC_REG_UNDEFINED;
36 continue;
37 }
38
39 switch (cfa->base) {
40 case CFI_SP:
41 orc->sp_reg = ORC_REG_SP;
42 break;
43 case CFI_SP_INDIRECT:
44 orc->sp_reg = ORC_REG_SP_INDIRECT;
45 break;
46 case CFI_BP:
47 orc->sp_reg = ORC_REG_BP;
48 break;
49 case CFI_BP_INDIRECT:
50 orc->sp_reg = ORC_REG_BP_INDIRECT;
51 break;
52 case CFI_R10:
53 orc->sp_reg = ORC_REG_R10;
54 break;
55 case CFI_R13:
56 orc->sp_reg = ORC_REG_R13;
57 break;
58 case CFI_DI:
59 orc->sp_reg = ORC_REG_DI;
60 break;
61 case CFI_DX:
62 orc->sp_reg = ORC_REG_DX;
63 break;
64 default:
65 WARN_FUNC("unknown CFA base reg %d",
66 insn->sec, insn->offset, cfa->base);
67 return -1;
68 }
69
70 switch(bp->base) {
71 case CFI_UNDEFINED:
72 orc->bp_reg = ORC_REG_UNDEFINED;
73 break;
74 case CFI_CFA:
75 orc->bp_reg = ORC_REG_PREV_SP;
76 break;
77 case CFI_BP:
78 orc->bp_reg = ORC_REG_BP;
79 break;
80 default:
81 WARN_FUNC("unknown BP base reg %d",
82 insn->sec, insn->offset, bp->base);
83 return -1;
84 }
85
86 orc->sp_offset = cfa->offset;
87 orc->bp_offset = bp->offset;
88 orc->type = insn->state.type;
89 }
90
91 return 0;
92}
93
94static int create_orc_entry(struct section *u_sec, struct section *ip_relasec,
95 unsigned int idx, struct section *insn_sec,
96 unsigned long insn_off, struct orc_entry *o)
97{
98 struct orc_entry *orc;
99 struct rela *rela;
100
101 /* populate ORC data */
102 orc = (struct orc_entry *)u_sec->data->d_buf + idx;
103 memcpy(orc, o, sizeof(*orc));
104
105 /* populate rela for ip */
106 rela = malloc(sizeof(*rela));
107 if (!rela) {
108 perror("malloc");
109 return -1;
110 }
111 memset(rela, 0, sizeof(*rela));
112
113 rela->sym = insn_sec->sym;
114 rela->addend = insn_off;
115 rela->type = R_X86_64_PC32;
116 rela->offset = idx * sizeof(int);
117
118 list_add_tail(&rela->list, &ip_relasec->rela_list);
119 hash_add(ip_relasec->rela_hash, &rela->hash, rela->offset);
120
121 return 0;
122}
123
124int create_orc_sections(struct objtool_file *file)
125{
126 struct instruction *insn, *prev_insn;
127 struct section *sec, *u_sec, *ip_relasec;
128 unsigned int idx;
129
130 struct orc_entry empty = {
131 .sp_reg = ORC_REG_UNDEFINED,
132 .bp_reg = ORC_REG_UNDEFINED,
133 .type = ORC_TYPE_CALL,
134 };
135
136 sec = find_section_by_name(file->elf, ".orc_unwind");
137 if (sec) {
138 WARN("file already has .orc_unwind section, skipping");
139 return -1;
140 }
141
142 /* count the number of needed orcs */
143 idx = 0;
144 for_each_sec(file, sec) {
145 if (!sec->text)
146 continue;
147
148 prev_insn = NULL;
149 sec_for_each_insn(file, sec, insn) {
150 if (!prev_insn ||
151 memcmp(&insn->orc, &prev_insn->orc,
152 sizeof(struct orc_entry))) {
153 idx++;
154 }
155 prev_insn = insn;
156 }
157
158 /* section terminator */
159 if (prev_insn)
160 idx++;
161 }
162 if (!idx)
163 return -1;
164
165
166 /* create .orc_unwind_ip and .rela.orc_unwind_ip sections */
167 sec = elf_create_section(file->elf, ".orc_unwind_ip", sizeof(int), idx);
168
169 ip_relasec = elf_create_rela_section(file->elf, sec);
170 if (!ip_relasec)
171 return -1;
172
173 /* create .orc_unwind section */
174 u_sec = elf_create_section(file->elf, ".orc_unwind",
175 sizeof(struct orc_entry), idx);
176
177 /* populate sections */
178 idx = 0;
179 for_each_sec(file, sec) {
180 if (!sec->text)
181 continue;
182
183 prev_insn = NULL;
184 sec_for_each_insn(file, sec, insn) {
185 if (!prev_insn || memcmp(&insn->orc, &prev_insn->orc,
186 sizeof(struct orc_entry))) {
187
188 if (create_orc_entry(u_sec, ip_relasec, idx,
189 insn->sec, insn->offset,
190 &insn->orc))
191 return -1;
192
193 idx++;
194 }
195 prev_insn = insn;
196 }
197
198 /* section terminator */
199 if (prev_insn) {
200 if (create_orc_entry(u_sec, ip_relasec, idx,
201 prev_insn->sec,
202 prev_insn->offset + prev_insn->len,
203 &empty))
204 return -1;
205
206 idx++;
207 }
208 }
209
210 if (elf_rebuild_rela_section(ip_relasec))
211 return -1;
212
213 return 0;
214}
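
create_orc_sections() makes two identical passes over the text sections: the first only counts how many entries are needed, the second fills the freshly created .orc_unwind and .orc_unwind_ip sections. An entry is emitted only when an instruction's ORC state differs from its predecessor's, plus one all-undefined terminator per text section placed just past its last instruction. For example, five consecutive instructions whose states are A A A B B contribute three entries: one at the first A, one at the first B, and the terminator; each entry then describes every instruction from its own IP up to the next entry's IP, which is what makes the deduplication valid.
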
diff --git a/tools/objtool/orc_types.h b/tools/objtool/orc_types.h
new file mode 100644
index 000000000000..9c9dc579bd7d
--- /dev/null
+++ b/tools/objtool/orc_types.h
@@ -0,0 +1,107 @@
1/*
2 * Copyright (C) 2017 Josh Poimboeuf <jpoimboe@redhat.com>
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version 2
7 * of the License, or (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, see <http://www.gnu.org/licenses/>.
16 */
17
18#ifndef _ORC_TYPES_H
19#define _ORC_TYPES_H
20
21#include <linux/types.h>
22#include <linux/compiler.h>
23
24/*
25 * The ORC_REG_* registers are base registers which are used to find other
26 * registers on the stack.
27 *
28 * ORC_REG_PREV_SP, also known as DWARF Call Frame Address (CFA), is the
29 * address of the previous frame: the caller's SP before it called the current
30 * function.
31 *
32 * ORC_REG_UNDEFINED means the corresponding register's value didn't change in
33 * the current frame.
34 *
35 * The most commonly used base registers are SP and BP -- which the previous SP
36 * is usually based on -- and PREV_SP and UNDEFINED -- which the previous BP is
37 * usually based on.
38 *
39 * The rest of the base registers are needed for special cases like entry code
40 * and GCC realigned stacks.
41 */
42#define ORC_REG_UNDEFINED 0
43#define ORC_REG_PREV_SP 1
44#define ORC_REG_DX 2
45#define ORC_REG_DI 3
46#define ORC_REG_BP 4
47#define ORC_REG_SP 5
48#define ORC_REG_R10 6
49#define ORC_REG_R13 7
50#define ORC_REG_BP_INDIRECT 8
51#define ORC_REG_SP_INDIRECT 9
52#define ORC_REG_MAX 15
53
54/*
55 * ORC_TYPE_CALL: Indicates that sp_reg+sp_offset resolves to PREV_SP (the
56 * caller's SP right before it made the call). Used for all callable
57 * functions, i.e. all C code and all callable asm functions.
58 *
59 * ORC_TYPE_REGS: Used in entry code to indicate that sp_reg+sp_offset points
60 * to a fully populated pt_regs from a syscall, interrupt, or exception.
61 *
62 * ORC_TYPE_REGS_IRET: Used in entry code to indicate that sp_reg+sp_offset
63 * points to the iret return frame.
64 *
65 * The UNWIND_HINT macros are used only for the unwind_hint struct. They
66 * aren't used in struct orc_entry due to size and complexity constraints.
67 * Objtool converts them to real types when it converts the hints to orc
68 * entries.
69 */
70#define ORC_TYPE_CALL 0
71#define ORC_TYPE_REGS 1
72#define ORC_TYPE_REGS_IRET 2
73#define UNWIND_HINT_TYPE_SAVE 3
74#define UNWIND_HINT_TYPE_RESTORE 4
75
76#ifndef __ASSEMBLY__
77/*
78 * This struct is more or less a vastly simplified version of the DWARF Call
79 * Frame Information standard. It contains only the necessary parts of DWARF
80 * CFI, simplified for ease of access by the in-kernel unwinder. It tells the
81 * unwinder how to find the previous SP and BP (and sometimes entry regs) on
82 * the stack for a given code address. Each instance of the struct corresponds
83 * to one or more code locations.
84 */
85struct orc_entry {
86 s16 sp_offset;
87 s16 bp_offset;
88 unsigned sp_reg:4;
89 unsigned bp_reg:4;
90 unsigned type:2;
91} __packed;
92
93/*
94 * This struct is used by asm and inline asm code to manually annotate the
95 * location of registers on the stack for the ORC unwinder.
96 *
97 * Type can be either ORC_TYPE_* or UNWIND_HINT_TYPE_*.
98 */
99struct unwind_hint {
100 u32 ip;
101 s16 sp_offset;
102 u8 sp_reg;
103 u8 type;
104};
105#endif /* __ASSEMBLY__ */
106
107#endif /* _ORC_TYPES_H */
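
As the comments above describe, for ORC_TYPE_CALL the previous stack pointer is simply the value of the base register selected by sp_reg plus sp_offset. A simplified consumer-side sketch, covering only the direct SP/BP bases (the indirect bases, the regs/iret types, and the actual in-kernel unwinder are intentionally omitted):

	/* Sketch only: how an unwinder would use a matched orc_entry. */
	static unsigned long orc_prev_sp(const struct orc_entry *orc,
					 unsigned long sp, unsigned long bp)
	{
		unsigned long base;

		switch (orc->sp_reg) {
		case ORC_REG_SP:
			base = sp;
			break;
		case ORC_REG_BP:
			base = bp;
			break;
		default:
			return 0;	/* indirect and special bases not handled here */
		}

		/* ORC_TYPE_CALL: sp_reg + sp_offset resolves to PREV_SP (the CFA). */
		return base + orc->sp_offset;
	}
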