author     Greg Kroah-Hartman <gregkh@linuxfoundation.org>   2017-10-09 03:02:35 -0400
committer  Greg Kroah-Hartman <gregkh@linuxfoundation.org>   2017-10-09 03:02:35 -0400
commit     1236d6bb6e19fc72ffc6bbcdeb1bfefe450e54ee (patch)
tree       47da3feee8e263e8c9352c85cf518e624be3c211 /kernel
parent     750b1a6894ecc9b178c6e3d0a1170122971b2036 (diff)
parent     8a5776a5f49812d29fe4b2d0a2d71675c3facf3f (diff)
Merge 4.14-rc4 into staging-next
We want the staging/iio fixes in here as well to handle merge issues.

Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/bpf/core.c                   |    2
-rw-r--r--  kernel/bpf/devmap.c                 |    6
-rw-r--r--  kernel/bpf/syscall.c                |    6
-rw-r--r--  kernel/bpf/verifier.c               |    7
-rw-r--r--  kernel/cgroup/cgroup.c              |    8
-rw-r--r--  kernel/cpu.c                        |  512
-rw-r--r--  kernel/events/core.c                |    3
-rw-r--r--  kernel/events/ring_buffer.c         |   20
-rw-r--r--  kernel/exit.c                       |   23
-rw-r--r--  kernel/extable.c                    |   45
-rw-r--r--  kernel/fork.c                       |   18
-rw-r--r--  kernel/futex.c                      |   33
-rw-r--r--  kernel/irq/chip.c                   |    2
-rw-r--r--  kernel/irq/generic-chip.c           |    1
-rw-r--r--  kernel/irq/irqdomain.c              |    4
-rw-r--r--  kernel/irq/manage.c                 |    4
-rw-r--r--  kernel/kcmp.c                       |    2
-rw-r--r--  kernel/locking/rwsem-xadd.c         |   27
-rw-r--r--  kernel/memremap.c                   |    4
-rw-r--r--  kernel/params.c                     |   35
-rw-r--r--  kernel/power/suspend.c              |   18
-rw-r--r--  kernel/rcu/tree.c                   |   10
-rw-r--r--  kernel/sched/core.c                 |   24
-rw-r--r--  kernel/sched/debug.c                |    2
-rw-r--r--  kernel/seccomp.c                    |  344
-rw-r--r--  kernel/smpboot.c                    |   25
-rw-r--r--  kernel/sysctl.c                     |   27
-rw-r--r--  kernel/trace/blktrace.c             |   18
-rw-r--r--  kernel/trace/ftrace.c               |   14
-rw-r--r--  kernel/trace/trace.c                |   19
-rw-r--r--  kernel/trace/trace.h                |    2
-rw-r--r--  kernel/trace/trace_mmiotrace.c      |    1
-rw-r--r--  kernel/trace/trace_output.c         |   21
-rw-r--r--  kernel/trace/trace_sched_wakeup.c   |    8
-rw-r--r--  kernel/trace/trace_stack.c          |   15
-rw-r--r--  kernel/watchdog.c                   |  643
-rw-r--r--  kernel/watchdog_hld.c               |  196
37 files changed, 1324 insertions, 825 deletions
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 917cc04a0a94..7b62df86be1d 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1022,7 +1022,7 @@ select_insn:
1022 struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2; 1022 struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2;
1023 struct bpf_array *array = container_of(map, struct bpf_array, map); 1023 struct bpf_array *array = container_of(map, struct bpf_array, map);
1024 struct bpf_prog *prog; 1024 struct bpf_prog *prog;
1025 u64 index = BPF_R3; 1025 u32 index = BPF_R3;
1026 1026
1027 if (unlikely(index >= array->map.max_entries)) 1027 if (unlikely(index >= array->map.max_entries))
1028 goto out; 1028 goto out;
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 959c9a07f318..e093d9a2c4dd 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -75,8 +75,8 @@ static u64 dev_map_bitmap_size(const union bpf_attr *attr)
75static struct bpf_map *dev_map_alloc(union bpf_attr *attr) 75static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
76{ 76{
77 struct bpf_dtab *dtab; 77 struct bpf_dtab *dtab;
78 int err = -EINVAL;
78 u64 cost; 79 u64 cost;
79 int err;
80 80
81 /* check sanity of attributes */ 81 /* check sanity of attributes */
82 if (attr->max_entries == 0 || attr->key_size != 4 || 82 if (attr->max_entries == 0 || attr->key_size != 4 ||
@@ -108,6 +108,8 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
108 if (err) 108 if (err)
109 goto free_dtab; 109 goto free_dtab;
110 110
111 err = -ENOMEM;
112
111 /* A per cpu bitfield with a bit per possible net device */ 113 /* A per cpu bitfield with a bit per possible net device */
112 dtab->flush_needed = __alloc_percpu(dev_map_bitmap_size(attr), 114 dtab->flush_needed = __alloc_percpu(dev_map_bitmap_size(attr),
113 __alignof__(unsigned long)); 115 __alignof__(unsigned long));
@@ -128,7 +130,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
128free_dtab: 130free_dtab:
129 free_percpu(dtab->flush_needed); 131 free_percpu(dtab->flush_needed);
130 kfree(dtab); 132 kfree(dtab);
131 return ERR_PTR(-ENOMEM); 133 return ERR_PTR(err);
132} 134}
133 135
134static void dev_map_free(struct bpf_map *map) 136static void dev_map_free(struct bpf_map *map)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index cb17e1cd1d43..25d074920a00 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -186,15 +186,17 @@ static int bpf_map_alloc_id(struct bpf_map *map)
186 186
187static void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock) 187static void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
188{ 188{
189 unsigned long flags;
190
189 if (do_idr_lock) 191 if (do_idr_lock)
190 spin_lock_bh(&map_idr_lock); 192 spin_lock_irqsave(&map_idr_lock, flags);
191 else 193 else
192 __acquire(&map_idr_lock); 194 __acquire(&map_idr_lock);
193 195
194 idr_remove(&map_idr, map->id); 196 idr_remove(&map_idr, map->id);
195 197
196 if (do_idr_lock) 198 if (do_idr_lock)
197 spin_unlock_bh(&map_idr_lock); 199 spin_unlock_irqrestore(&map_idr_lock, flags);
198 else 200 else
199 __release(&map_idr_lock); 201 __release(&map_idr_lock);
200} 202}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 799b2451ef2d..b914fbe1383e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4205,7 +4205,12 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
4205 } 4205 }
4206 4206
4207 if (insn->imm == BPF_FUNC_redirect_map) { 4207 if (insn->imm == BPF_FUNC_redirect_map) {
4208 u64 addr = (unsigned long)prog; 4208 /* Note, we cannot use prog directly as imm as subsequent
4209 * rewrites would still change the prog pointer. The only
4210 * stable address we can use is aux, which also works with
4211 * prog clones during blinding.
4212 */
4213 u64 addr = (unsigned long)prog->aux;
4209 struct bpf_insn r4_ld[] = { 4214 struct bpf_insn r4_ld[] = {
4210 BPF_LD_IMM64(BPF_REG_4, addr), 4215 BPF_LD_IMM64(BPF_REG_4, addr),
4211 *insn, 4216 *insn,
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index d6551cd45238..44857278eb8a 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -2311,6 +2311,14 @@ out_release_tset:
2311 list_del_init(&cset->mg_node); 2311 list_del_init(&cset->mg_node);
2312 } 2312 }
2313 spin_unlock_irq(&css_set_lock); 2313 spin_unlock_irq(&css_set_lock);
2314
2315 /*
2316 * Re-initialize the cgroup_taskset structure in case it is reused
2317 * again in another cgroup_migrate_add_task()/cgroup_migrate_execute()
2318 * iteration.
2319 */
2320 tset->nr_tasks = 0;
2321 tset->csets = &tset->src_csets;
2314 return ret; 2322 return ret;
2315} 2323}
2316 2324
diff --git a/kernel/cpu.c b/kernel/cpu.c
index acf5308fad51..d851df22f5c5 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -24,6 +24,7 @@
24#include <linux/lockdep.h> 24#include <linux/lockdep.h>
25#include <linux/tick.h> 25#include <linux/tick.h>
26#include <linux/irq.h> 26#include <linux/irq.h>
27#include <linux/nmi.h>
27#include <linux/smpboot.h> 28#include <linux/smpboot.h>
28#include <linux/relay.h> 29#include <linux/relay.h>
29#include <linux/slab.h> 30#include <linux/slab.h>
@@ -46,11 +47,13 @@
46 * @bringup: Single callback bringup or teardown selector 47 * @bringup: Single callback bringup or teardown selector
47 * @cb_state: The state for a single callback (install/uninstall) 48 * @cb_state: The state for a single callback (install/uninstall)
48 * @result: Result of the operation 49 * @result: Result of the operation
49 * @done: Signal completion to the issuer of the task 50 * @done_up: Signal completion to the issuer of the task for cpu-up
51 * @done_down: Signal completion to the issuer of the task for cpu-down
50 */ 52 */
51struct cpuhp_cpu_state { 53struct cpuhp_cpu_state {
52 enum cpuhp_state state; 54 enum cpuhp_state state;
53 enum cpuhp_state target; 55 enum cpuhp_state target;
56 enum cpuhp_state fail;
54#ifdef CONFIG_SMP 57#ifdef CONFIG_SMP
55 struct task_struct *thread; 58 struct task_struct *thread;
56 bool should_run; 59 bool should_run;
@@ -58,18 +61,39 @@ struct cpuhp_cpu_state {
58 bool single; 61 bool single;
59 bool bringup; 62 bool bringup;
60 struct hlist_node *node; 63 struct hlist_node *node;
64 struct hlist_node *last;
61 enum cpuhp_state cb_state; 65 enum cpuhp_state cb_state;
62 int result; 66 int result;
63 struct completion done; 67 struct completion done_up;
68 struct completion done_down;
64#endif 69#endif
65}; 70};
66 71
67static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state); 72static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state) = {
73 .fail = CPUHP_INVALID,
74};
68 75
69#if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP) 76#if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP)
70static struct lock_class_key cpuhp_state_key; 77static struct lockdep_map cpuhp_state_up_map =
71static struct lockdep_map cpuhp_state_lock_map = 78 STATIC_LOCKDEP_MAP_INIT("cpuhp_state-up", &cpuhp_state_up_map);
72 STATIC_LOCKDEP_MAP_INIT("cpuhp_state", &cpuhp_state_key); 79static struct lockdep_map cpuhp_state_down_map =
80 STATIC_LOCKDEP_MAP_INIT("cpuhp_state-down", &cpuhp_state_down_map);
81
82
83static void inline cpuhp_lock_acquire(bool bringup)
84{
85 lock_map_acquire(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map);
86}
87
88static void inline cpuhp_lock_release(bool bringup)
89{
90 lock_map_release(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map);
91}
92#else
93
94static void inline cpuhp_lock_acquire(bool bringup) { }
95static void inline cpuhp_lock_release(bool bringup) { }
96
73#endif 97#endif
74 98
75/** 99/**
@@ -123,13 +147,16 @@ static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state)
123/** 147/**
124 * cpuhp_invoke_callback _ Invoke the callbacks for a given state 148 * cpuhp_invoke_callback _ Invoke the callbacks for a given state
125 * @cpu: The cpu for which the callback should be invoked 149 * @cpu: The cpu for which the callback should be invoked
126 * @step: The step in the state machine 150 * @state: The state to do callbacks for
127 * @bringup: True if the bringup callback should be invoked 151 * @bringup: True if the bringup callback should be invoked
152 * @node: For multi-instance, do a single entry callback for install/remove
153 * @lastp: For multi-instance rollback, remember how far we got
128 * 154 *
129 * Called from cpu hotplug and from the state register machinery. 155 * Called from cpu hotplug and from the state register machinery.
130 */ 156 */
131static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state, 157static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
132 bool bringup, struct hlist_node *node) 158 bool bringup, struct hlist_node *node,
159 struct hlist_node **lastp)
133{ 160{
134 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); 161 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
135 struct cpuhp_step *step = cpuhp_get_step(state); 162 struct cpuhp_step *step = cpuhp_get_step(state);
@@ -137,7 +164,17 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
137 int (*cb)(unsigned int cpu); 164 int (*cb)(unsigned int cpu);
138 int ret, cnt; 165 int ret, cnt;
139 166
167 if (st->fail == state) {
168 st->fail = CPUHP_INVALID;
169
170 if (!(bringup ? step->startup.single : step->teardown.single))
171 return 0;
172
173 return -EAGAIN;
174 }
175
140 if (!step->multi_instance) { 176 if (!step->multi_instance) {
177 WARN_ON_ONCE(lastp && *lastp);
141 cb = bringup ? step->startup.single : step->teardown.single; 178 cb = bringup ? step->startup.single : step->teardown.single;
142 if (!cb) 179 if (!cb)
143 return 0; 180 return 0;
@@ -152,6 +189,7 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
152 189
153 /* Single invocation for instance add/remove */ 190 /* Single invocation for instance add/remove */
154 if (node) { 191 if (node) {
192 WARN_ON_ONCE(lastp && *lastp);
155 trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node); 193 trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
156 ret = cbm(cpu, node); 194 ret = cbm(cpu, node);
157 trace_cpuhp_exit(cpu, st->state, state, ret); 195 trace_cpuhp_exit(cpu, st->state, state, ret);
@@ -161,13 +199,23 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
161 /* State transition. Invoke on all instances */ 199 /* State transition. Invoke on all instances */
162 cnt = 0; 200 cnt = 0;
163 hlist_for_each(node, &step->list) { 201 hlist_for_each(node, &step->list) {
202 if (lastp && node == *lastp)
203 break;
204
164 trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node); 205 trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
165 ret = cbm(cpu, node); 206 ret = cbm(cpu, node);
166 trace_cpuhp_exit(cpu, st->state, state, ret); 207 trace_cpuhp_exit(cpu, st->state, state, ret);
167 if (ret) 208 if (ret) {
168 goto err; 209 if (!lastp)
210 goto err;
211
212 *lastp = node;
213 return ret;
214 }
169 cnt++; 215 cnt++;
170 } 216 }
217 if (lastp)
218 *lastp = NULL;
171 return 0; 219 return 0;
172err: 220err:
173 /* Rollback the instances if one failed */ 221 /* Rollback the instances if one failed */
@@ -178,12 +226,39 @@ err:
178 hlist_for_each(node, &step->list) { 226 hlist_for_each(node, &step->list) {
179 if (!cnt--) 227 if (!cnt--)
180 break; 228 break;
181 cbm(cpu, node); 229
230 trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
231 ret = cbm(cpu, node);
232 trace_cpuhp_exit(cpu, st->state, state, ret);
233 /*
234 * Rollback must not fail,
235 */
236 WARN_ON_ONCE(ret);
182 } 237 }
183 return ret; 238 return ret;
184} 239}
185 240
186#ifdef CONFIG_SMP 241#ifdef CONFIG_SMP
242static inline void wait_for_ap_thread(struct cpuhp_cpu_state *st, bool bringup)
243{
244 struct completion *done = bringup ? &st->done_up : &st->done_down;
245 wait_for_completion(done);
246}
247
248static inline void complete_ap_thread(struct cpuhp_cpu_state *st, bool bringup)
249{
250 struct completion *done = bringup ? &st->done_up : &st->done_down;
251 complete(done);
252}
253
254/*
255 * The former STARTING/DYING states, ran with IRQs disabled and must not fail.
256 */
257static bool cpuhp_is_atomic_state(enum cpuhp_state state)
258{
259 return CPUHP_AP_IDLE_DEAD <= state && state < CPUHP_AP_ONLINE;
260}
261
187/* Serializes the updates to cpu_online_mask, cpu_present_mask */ 262/* Serializes the updates to cpu_online_mask, cpu_present_mask */
188static DEFINE_MUTEX(cpu_add_remove_lock); 263static DEFINE_MUTEX(cpu_add_remove_lock);
189bool cpuhp_tasks_frozen; 264bool cpuhp_tasks_frozen;
@@ -271,14 +346,79 @@ void cpu_hotplug_enable(void)
271EXPORT_SYMBOL_GPL(cpu_hotplug_enable); 346EXPORT_SYMBOL_GPL(cpu_hotplug_enable);
272#endif /* CONFIG_HOTPLUG_CPU */ 347#endif /* CONFIG_HOTPLUG_CPU */
273 348
274static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st); 349static inline enum cpuhp_state
350cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target)
351{
352 enum cpuhp_state prev_state = st->state;
353
354 st->rollback = false;
355 st->last = NULL;
356
357 st->target = target;
358 st->single = false;
359 st->bringup = st->state < target;
360
361 return prev_state;
362}
363
364static inline void
365cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state)
366{
367 st->rollback = true;
368
369 /*
370 * If we have st->last we need to undo partial multi_instance of this
371 * state first. Otherwise start undo at the previous state.
372 */
373 if (!st->last) {
374 if (st->bringup)
375 st->state--;
376 else
377 st->state++;
378 }
379
380 st->target = prev_state;
381 st->bringup = !st->bringup;
382}
383
384/* Regular hotplug invocation of the AP hotplug thread */
385static void __cpuhp_kick_ap(struct cpuhp_cpu_state *st)
386{
387 if (!st->single && st->state == st->target)
388 return;
389
390 st->result = 0;
391 /*
392 * Make sure the above stores are visible before should_run becomes
393 * true. Paired with the mb() above in cpuhp_thread_fun()
394 */
395 smp_mb();
396 st->should_run = true;
397 wake_up_process(st->thread);
398 wait_for_ap_thread(st, st->bringup);
399}
400
401static int cpuhp_kick_ap(struct cpuhp_cpu_state *st, enum cpuhp_state target)
402{
403 enum cpuhp_state prev_state;
404 int ret;
405
406 prev_state = cpuhp_set_state(st, target);
407 __cpuhp_kick_ap(st);
408 if ((ret = st->result)) {
409 cpuhp_reset_state(st, prev_state);
410 __cpuhp_kick_ap(st);
411 }
412
413 return ret;
414}
275 415
276static int bringup_wait_for_ap(unsigned int cpu) 416static int bringup_wait_for_ap(unsigned int cpu)
277{ 417{
278 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); 418 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
279 419
280 /* Wait for the CPU to reach CPUHP_AP_ONLINE_IDLE */ 420 /* Wait for the CPU to reach CPUHP_AP_ONLINE_IDLE */
281 wait_for_completion(&st->done); 421 wait_for_ap_thread(st, true);
282 if (WARN_ON_ONCE((!cpu_online(cpu)))) 422 if (WARN_ON_ONCE((!cpu_online(cpu))))
283 return -ECANCELED; 423 return -ECANCELED;
284 424
@@ -286,12 +426,10 @@ static int bringup_wait_for_ap(unsigned int cpu)
286 stop_machine_unpark(cpu); 426 stop_machine_unpark(cpu);
287 kthread_unpark(st->thread); 427 kthread_unpark(st->thread);
288 428
289 /* Should we go further up ? */ 429 if (st->target <= CPUHP_AP_ONLINE_IDLE)
290 if (st->target > CPUHP_AP_ONLINE_IDLE) { 430 return 0;
291 __cpuhp_kick_ap_work(st); 431
292 wait_for_completion(&st->done); 432 return cpuhp_kick_ap(st, st->target);
293 }
294 return st->result;
295} 433}
296 434
297static int bringup_cpu(unsigned int cpu) 435static int bringup_cpu(unsigned int cpu)
@@ -317,32 +455,6 @@ static int bringup_cpu(unsigned int cpu)
317/* 455/*
318 * Hotplug state machine related functions 456 * Hotplug state machine related functions
319 */ 457 */
320static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st)
321{
322 for (st->state++; st->state < st->target; st->state++) {
323 struct cpuhp_step *step = cpuhp_get_step(st->state);
324
325 if (!step->skip_onerr)
326 cpuhp_invoke_callback(cpu, st->state, true, NULL);
327 }
328}
329
330static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
331 enum cpuhp_state target)
332{
333 enum cpuhp_state prev_state = st->state;
334 int ret = 0;
335
336 for (; st->state > target; st->state--) {
337 ret = cpuhp_invoke_callback(cpu, st->state, false, NULL);
338 if (ret) {
339 st->target = prev_state;
340 undo_cpu_down(cpu, st);
341 break;
342 }
343 }
344 return ret;
345}
346 458
347static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st) 459static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st)
348{ 460{
@@ -350,7 +462,7 @@ static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st)
350 struct cpuhp_step *step = cpuhp_get_step(st->state); 462 struct cpuhp_step *step = cpuhp_get_step(st->state);
351 463
352 if (!step->skip_onerr) 464 if (!step->skip_onerr)
353 cpuhp_invoke_callback(cpu, st->state, false, NULL); 465 cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
354 } 466 }
355} 467}
356 468
@@ -362,7 +474,7 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
362 474
363 while (st->state < target) { 475 while (st->state < target) {
364 st->state++; 476 st->state++;
365 ret = cpuhp_invoke_callback(cpu, st->state, true, NULL); 477 ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
366 if (ret) { 478 if (ret) {
367 st->target = prev_state; 479 st->target = prev_state;
368 undo_cpu_up(cpu, st); 480 undo_cpu_up(cpu, st);
@@ -379,7 +491,8 @@ static void cpuhp_create(unsigned int cpu)
379{ 491{
380 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); 492 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
381 493
382 init_completion(&st->done); 494 init_completion(&st->done_up);
495 init_completion(&st->done_down);
383} 496}
384 497
385static int cpuhp_should_run(unsigned int cpu) 498static int cpuhp_should_run(unsigned int cpu)
@@ -389,69 +502,90 @@ static int cpuhp_should_run(unsigned int cpu)
389 return st->should_run; 502 return st->should_run;
390} 503}
391 504
392/* Execute the teardown callbacks. Used to be CPU_DOWN_PREPARE */
393static int cpuhp_ap_offline(unsigned int cpu, struct cpuhp_cpu_state *st)
394{
395 enum cpuhp_state target = max((int)st->target, CPUHP_TEARDOWN_CPU);
396
397 return cpuhp_down_callbacks(cpu, st, target);
398}
399
400/* Execute the online startup callbacks. Used to be CPU_ONLINE */
401static int cpuhp_ap_online(unsigned int cpu, struct cpuhp_cpu_state *st)
402{
403 return cpuhp_up_callbacks(cpu, st, st->target);
404}
405
406/* 505/*
407 * Execute teardown/startup callbacks on the plugged cpu. Also used to invoke 506 * Execute teardown/startup callbacks on the plugged cpu. Also used to invoke
408 * callbacks when a state gets [un]installed at runtime. 507 * callbacks when a state gets [un]installed at runtime.
508 *
509 * Each invocation of this function by the smpboot thread does a single AP
510 * state callback.
511 *
512 * It has 3 modes of operation:
513 * - single: runs st->cb_state
514 * - up: runs ++st->state, while st->state < st->target
515 * - down: runs st->state--, while st->state > st->target
516 *
517 * When complete or on error, should_run is cleared and the completion is fired.
409 */ 518 */
410static void cpuhp_thread_fun(unsigned int cpu) 519static void cpuhp_thread_fun(unsigned int cpu)
411{ 520{
412 struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); 521 struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
413 int ret = 0; 522 bool bringup = st->bringup;
523 enum cpuhp_state state;
414 524
415 /* 525 /*
416 * Paired with the mb() in cpuhp_kick_ap_work and 526 * ACQUIRE for the cpuhp_should_run() load of ->should_run. Ensures
417 * cpuhp_invoke_ap_callback, so the work set is consistent visible. 527 * that if we see ->should_run we also see the rest of the state.
418 */ 528 */
419 smp_mb(); 529 smp_mb();
420 if (!st->should_run) 530
531 if (WARN_ON_ONCE(!st->should_run))
421 return; 532 return;
422 533
423 st->should_run = false; 534 cpuhp_lock_acquire(bringup);
424 535
425 lock_map_acquire(&cpuhp_state_lock_map);
426 /* Single callback invocation for [un]install ? */
427 if (st->single) { 536 if (st->single) {
428 if (st->cb_state < CPUHP_AP_ONLINE) { 537 state = st->cb_state;
429 local_irq_disable(); 538 st->should_run = false;
430 ret = cpuhp_invoke_callback(cpu, st->cb_state, 539 } else {
431 st->bringup, st->node); 540 if (bringup) {
432 local_irq_enable(); 541 st->state++;
542 state = st->state;
543 st->should_run = (st->state < st->target);
544 WARN_ON_ONCE(st->state > st->target);
433 } else { 545 } else {
434 ret = cpuhp_invoke_callback(cpu, st->cb_state, 546 state = st->state;
435 st->bringup, st->node); 547 st->state--;
548 st->should_run = (st->state > st->target);
549 WARN_ON_ONCE(st->state < st->target);
436 } 550 }
437 } else if (st->rollback) { 551 }
438 BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE); 552
553 WARN_ON_ONCE(!cpuhp_is_ap_state(state));
554
555 if (st->rollback) {
556 struct cpuhp_step *step = cpuhp_get_step(state);
557 if (step->skip_onerr)
558 goto next;
559 }
560
561 if (cpuhp_is_atomic_state(state)) {
562 local_irq_disable();
563 st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last);
564 local_irq_enable();
439 565
440 undo_cpu_down(cpu, st); 566 /*
441 st->rollback = false; 567 * STARTING/DYING must not fail!
568 */
569 WARN_ON_ONCE(st->result);
442 } else { 570 } else {
443 /* Cannot happen .... */ 571 st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last);
444 BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE); 572 }
445 573
446 /* Regular hotplug work */ 574 if (st->result) {
447 if (st->state < st->target) 575 /*
448 ret = cpuhp_ap_online(cpu, st); 576 * If we fail on a rollback, we're up a creek without no
449 else if (st->state > st->target) 577 * paddle, no way forward, no way back. We loose, thanks for
450 ret = cpuhp_ap_offline(cpu, st); 578 * playing.
579 */
580 WARN_ON_ONCE(st->rollback);
581 st->should_run = false;
451 } 582 }
452 lock_map_release(&cpuhp_state_lock_map); 583
453 st->result = ret; 584next:
454 complete(&st->done); 585 cpuhp_lock_release(bringup);
586
587 if (!st->should_run)
588 complete_ap_thread(st, bringup);
455} 589}
456 590
457/* Invoke a single callback on a remote cpu */ 591/* Invoke a single callback on a remote cpu */
@@ -460,62 +594,64 @@ cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup,
460 struct hlist_node *node) 594 struct hlist_node *node)
461{ 595{
462 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); 596 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
597 int ret;
463 598
464 if (!cpu_online(cpu)) 599 if (!cpu_online(cpu))
465 return 0; 600 return 0;
466 601
467 lock_map_acquire(&cpuhp_state_lock_map); 602 cpuhp_lock_acquire(false);
468 lock_map_release(&cpuhp_state_lock_map); 603 cpuhp_lock_release(false);
604
605 cpuhp_lock_acquire(true);
606 cpuhp_lock_release(true);
469 607
470 /* 608 /*
471 * If we are up and running, use the hotplug thread. For early calls 609 * If we are up and running, use the hotplug thread. For early calls
472 * we invoke the thread function directly. 610 * we invoke the thread function directly.
473 */ 611 */
474 if (!st->thread) 612 if (!st->thread)
475 return cpuhp_invoke_callback(cpu, state, bringup, node); 613 return cpuhp_invoke_callback(cpu, state, bringup, node, NULL);
614
615 st->rollback = false;
616 st->last = NULL;
476 617
618 st->node = node;
619 st->bringup = bringup;
477 st->cb_state = state; 620 st->cb_state = state;
478 st->single = true; 621 st->single = true;
479 st->bringup = bringup;
480 st->node = node;
481 622
482 /* 623 __cpuhp_kick_ap(st);
483 * Make sure the above stores are visible before should_run becomes
484 * true. Paired with the mb() above in cpuhp_thread_fun()
485 */
486 smp_mb();
487 st->should_run = true;
488 wake_up_process(st->thread);
489 wait_for_completion(&st->done);
490 return st->result;
491}
492 624
493/* Regular hotplug invocation of the AP hotplug thread */
494static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st)
495{
496 st->result = 0;
497 st->single = false;
498 /* 625 /*
499 * Make sure the above stores are visible before should_run becomes 626 * If we failed and did a partial, do a rollback.
500 * true. Paired with the mb() above in cpuhp_thread_fun()
501 */ 627 */
502 smp_mb(); 628 if ((ret = st->result) && st->last) {
503 st->should_run = true; 629 st->rollback = true;
504 wake_up_process(st->thread); 630 st->bringup = !bringup;
631
632 __cpuhp_kick_ap(st);
633 }
634
635 return ret;
505} 636}
506 637
507static int cpuhp_kick_ap_work(unsigned int cpu) 638static int cpuhp_kick_ap_work(unsigned int cpu)
508{ 639{
509 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); 640 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
510 enum cpuhp_state state = st->state; 641 enum cpuhp_state prev_state = st->state;
642 int ret;
643
644 cpuhp_lock_acquire(false);
645 cpuhp_lock_release(false);
511 646
512 trace_cpuhp_enter(cpu, st->target, state, cpuhp_kick_ap_work); 647 cpuhp_lock_acquire(true);
513 lock_map_acquire(&cpuhp_state_lock_map); 648 cpuhp_lock_release(true);
514 lock_map_release(&cpuhp_state_lock_map); 649
515 __cpuhp_kick_ap_work(st); 650 trace_cpuhp_enter(cpu, st->target, prev_state, cpuhp_kick_ap_work);
516 wait_for_completion(&st->done); 651 ret = cpuhp_kick_ap(st, st->target);
517 trace_cpuhp_exit(cpu, st->state, state, st->result); 652 trace_cpuhp_exit(cpu, st->state, prev_state, ret);
518 return st->result; 653
654 return ret;
519} 655}
520 656
521static struct smp_hotplug_thread cpuhp_threads = { 657static struct smp_hotplug_thread cpuhp_threads = {
@@ -581,6 +717,7 @@ static int take_cpu_down(void *_param)
581 struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); 717 struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
582 enum cpuhp_state target = max((int)st->target, CPUHP_AP_OFFLINE); 718 enum cpuhp_state target = max((int)st->target, CPUHP_AP_OFFLINE);
583 int err, cpu = smp_processor_id(); 719 int err, cpu = smp_processor_id();
720 int ret;
584 721
585 /* Ensure this CPU doesn't handle any more interrupts. */ 722 /* Ensure this CPU doesn't handle any more interrupts. */
586 err = __cpu_disable(); 723 err = __cpu_disable();
@@ -594,8 +731,13 @@ static int take_cpu_down(void *_param)
594 WARN_ON(st->state != CPUHP_TEARDOWN_CPU); 731 WARN_ON(st->state != CPUHP_TEARDOWN_CPU);
595 st->state--; 732 st->state--;
596 /* Invoke the former CPU_DYING callbacks */ 733 /* Invoke the former CPU_DYING callbacks */
597 for (; st->state > target; st->state--) 734 for (; st->state > target; st->state--) {
598 cpuhp_invoke_callback(cpu, st->state, false, NULL); 735 ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
736 /*
737 * DYING must not fail!
738 */
739 WARN_ON_ONCE(ret);
740 }
599 741
600 /* Give up timekeeping duties */ 742 /* Give up timekeeping duties */
601 tick_handover_do_timer(); 743 tick_handover_do_timer();
@@ -639,7 +781,7 @@ static int takedown_cpu(unsigned int cpu)
639 * 781 *
640 * Wait for the stop thread to go away. 782 * Wait for the stop thread to go away.
641 */ 783 */
642 wait_for_completion(&st->done); 784 wait_for_ap_thread(st, false);
643 BUG_ON(st->state != CPUHP_AP_IDLE_DEAD); 785 BUG_ON(st->state != CPUHP_AP_IDLE_DEAD);
644 786
645 /* Interrupts are moved away from the dying cpu, reenable alloc/free */ 787 /* Interrupts are moved away from the dying cpu, reenable alloc/free */
@@ -658,7 +800,7 @@ static void cpuhp_complete_idle_dead(void *arg)
658{ 800{
659 struct cpuhp_cpu_state *st = arg; 801 struct cpuhp_cpu_state *st = arg;
660 802
661 complete(&st->done); 803 complete_ap_thread(st, false);
662} 804}
663 805
664void cpuhp_report_idle_dead(void) 806void cpuhp_report_idle_dead(void)
@@ -676,11 +818,32 @@ void cpuhp_report_idle_dead(void)
676 cpuhp_complete_idle_dead, st, 0); 818 cpuhp_complete_idle_dead, st, 0);
677} 819}
678 820
679#else 821static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st)
680#define takedown_cpu NULL 822{
681#endif 823 for (st->state++; st->state < st->target; st->state++) {
824 struct cpuhp_step *step = cpuhp_get_step(st->state);
682 825
683#ifdef CONFIG_HOTPLUG_CPU 826 if (!step->skip_onerr)
827 cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
828 }
829}
830
831static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
832 enum cpuhp_state target)
833{
834 enum cpuhp_state prev_state = st->state;
835 int ret = 0;
836
837 for (; st->state > target; st->state--) {
838 ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
839 if (ret) {
840 st->target = prev_state;
841 undo_cpu_down(cpu, st);
842 break;
843 }
844 }
845 return ret;
846}
684 847
685/* Requires cpu_add_remove_lock to be held */ 848/* Requires cpu_add_remove_lock to be held */
686static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, 849static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
@@ -699,13 +862,13 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
699 862
700 cpuhp_tasks_frozen = tasks_frozen; 863 cpuhp_tasks_frozen = tasks_frozen;
701 864
702 prev_state = st->state; 865 prev_state = cpuhp_set_state(st, target);
703 st->target = target;
704 /* 866 /*
705 * If the current CPU state is in the range of the AP hotplug thread, 867 * If the current CPU state is in the range of the AP hotplug thread,
706 * then we need to kick the thread. 868 * then we need to kick the thread.
707 */ 869 */
708 if (st->state > CPUHP_TEARDOWN_CPU) { 870 if (st->state > CPUHP_TEARDOWN_CPU) {
871 st->target = max((int)target, CPUHP_TEARDOWN_CPU);
709 ret = cpuhp_kick_ap_work(cpu); 872 ret = cpuhp_kick_ap_work(cpu);
710 /* 873 /*
711 * The AP side has done the error rollback already. Just 874 * The AP side has done the error rollback already. Just
@@ -720,6 +883,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
720 */ 883 */
721 if (st->state > CPUHP_TEARDOWN_CPU) 884 if (st->state > CPUHP_TEARDOWN_CPU)
722 goto out; 885 goto out;
886
887 st->target = target;
723 } 888 }
724 /* 889 /*
725 * The AP brought itself down to CPUHP_TEARDOWN_CPU. So we need 890 * The AP brought itself down to CPUHP_TEARDOWN_CPU. So we need
@@ -727,13 +892,17 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
727 */ 892 */
728 ret = cpuhp_down_callbacks(cpu, st, target); 893 ret = cpuhp_down_callbacks(cpu, st, target);
729 if (ret && st->state > CPUHP_TEARDOWN_CPU && st->state < prev_state) { 894 if (ret && st->state > CPUHP_TEARDOWN_CPU && st->state < prev_state) {
730 st->target = prev_state; 895 cpuhp_reset_state(st, prev_state);
731 st->rollback = true; 896 __cpuhp_kick_ap(st);
732 cpuhp_kick_ap_work(cpu);
733 } 897 }
734 898
735out: 899out:
736 cpus_write_unlock(); 900 cpus_write_unlock();
901 /*
902 * Do post unplug cleanup. This is still protected against
903 * concurrent CPU hotplug via cpu_add_remove_lock.
904 */
905 lockup_detector_cleanup();
737 return ret; 906 return ret;
738} 907}
739 908
@@ -754,11 +923,15 @@ out:
754 cpu_maps_update_done(); 923 cpu_maps_update_done();
755 return err; 924 return err;
756} 925}
926
757int cpu_down(unsigned int cpu) 927int cpu_down(unsigned int cpu)
758{ 928{
759 return do_cpu_down(cpu, CPUHP_OFFLINE); 929 return do_cpu_down(cpu, CPUHP_OFFLINE);
760} 930}
761EXPORT_SYMBOL(cpu_down); 931EXPORT_SYMBOL(cpu_down);
932
933#else
934#define takedown_cpu NULL
762#endif /*CONFIG_HOTPLUG_CPU*/ 935#endif /*CONFIG_HOTPLUG_CPU*/
763 936
764/** 937/**
@@ -772,11 +945,16 @@ void notify_cpu_starting(unsigned int cpu)
772{ 945{
773 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); 946 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
774 enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE); 947 enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE);
948 int ret;
775 949
776 rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. */ 950 rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. */
777 while (st->state < target) { 951 while (st->state < target) {
778 st->state++; 952 st->state++;
779 cpuhp_invoke_callback(cpu, st->state, true, NULL); 953 ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
954 /*
955 * STARTING must not fail!
956 */
957 WARN_ON_ONCE(ret);
780 } 958 }
781} 959}
782 960
@@ -794,7 +972,7 @@ void cpuhp_online_idle(enum cpuhp_state state)
794 return; 972 return;
795 973
796 st->state = CPUHP_AP_ONLINE_IDLE; 974 st->state = CPUHP_AP_ONLINE_IDLE;
797 complete(&st->done); 975 complete_ap_thread(st, true);
798} 976}
799 977
800/* Requires cpu_add_remove_lock to be held */ 978/* Requires cpu_add_remove_lock to be held */
@@ -829,7 +1007,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
829 1007
830 cpuhp_tasks_frozen = tasks_frozen; 1008 cpuhp_tasks_frozen = tasks_frozen;
831 1009
832 st->target = target; 1010 cpuhp_set_state(st, target);
833 /* 1011 /*
834 * If the current CPU state is in the range of the AP hotplug thread, 1012 * If the current CPU state is in the range of the AP hotplug thread,
835 * then we need to kick the thread once more. 1013 * then we need to kick the thread once more.
@@ -1296,6 +1474,10 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup,
1296 struct cpuhp_step *sp = cpuhp_get_step(state); 1474 struct cpuhp_step *sp = cpuhp_get_step(state);
1297 int ret; 1475 int ret;
1298 1476
1477 /*
1478 * If there's nothing to do, we done.
1479 * Relies on the union for multi_instance.
1480 */
1299 if ((bringup && !sp->startup.single) || 1481 if ((bringup && !sp->startup.single) ||
1300 (!bringup && !sp->teardown.single)) 1482 (!bringup && !sp->teardown.single))
1301 return 0; 1483 return 0;
@@ -1307,9 +1489,9 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup,
1307 if (cpuhp_is_ap_state(state)) 1489 if (cpuhp_is_ap_state(state))
1308 ret = cpuhp_invoke_ap_callback(cpu, state, bringup, node); 1490 ret = cpuhp_invoke_ap_callback(cpu, state, bringup, node);
1309 else 1491 else
1310 ret = cpuhp_invoke_callback(cpu, state, bringup, node); 1492 ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL);
1311#else 1493#else
1312 ret = cpuhp_invoke_callback(cpu, state, bringup, node); 1494 ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL);
1313#endif 1495#endif
1314 BUG_ON(ret && !bringup); 1496 BUG_ON(ret && !bringup);
1315 return ret; 1497 return ret;
@@ -1641,9 +1823,55 @@ static ssize_t show_cpuhp_target(struct device *dev,
1641} 1823}
1642static DEVICE_ATTR(target, 0644, show_cpuhp_target, write_cpuhp_target); 1824static DEVICE_ATTR(target, 0644, show_cpuhp_target, write_cpuhp_target);
1643 1825
1826
1827static ssize_t write_cpuhp_fail(struct device *dev,
1828 struct device_attribute *attr,
1829 const char *buf, size_t count)
1830{
1831 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
1832 struct cpuhp_step *sp;
1833 int fail, ret;
1834
1835 ret = kstrtoint(buf, 10, &fail);
1836 if (ret)
1837 return ret;
1838
1839 /*
1840 * Cannot fail STARTING/DYING callbacks.
1841 */
1842 if (cpuhp_is_atomic_state(fail))
1843 return -EINVAL;
1844
1845 /*
1846 * Cannot fail anything that doesn't have callbacks.
1847 */
1848 mutex_lock(&cpuhp_state_mutex);
1849 sp = cpuhp_get_step(fail);
1850 if (!sp->startup.single && !sp->teardown.single)
1851 ret = -EINVAL;
1852 mutex_unlock(&cpuhp_state_mutex);
1853 if (ret)
1854 return ret;
1855
1856 st->fail = fail;
1857
1858 return count;
1859}
1860
1861static ssize_t show_cpuhp_fail(struct device *dev,
1862 struct device_attribute *attr, char *buf)
1863{
1864 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
1865
1866 return sprintf(buf, "%d\n", st->fail);
1867}
1868
1869static DEVICE_ATTR(fail, 0644, show_cpuhp_fail, write_cpuhp_fail);
1870
1644static struct attribute *cpuhp_cpu_attrs[] = { 1871static struct attribute *cpuhp_cpu_attrs[] = {
1645 &dev_attr_state.attr, 1872 &dev_attr_state.attr,
1646 &dev_attr_target.attr, 1873 &dev_attr_target.attr,
1874 &dev_attr_fail.attr,
1647 NULL 1875 NULL
1648}; 1876};
1649 1877
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 3e691b75b2db..6bc21e202ae4 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8171,6 +8171,7 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
8171 } 8171 }
8172 } 8172 }
8173 event->tp_event->prog = prog; 8173 event->tp_event->prog = prog;
8174 event->tp_event->bpf_prog_owner = event;
8174 8175
8175 return 0; 8176 return 0;
8176} 8177}
@@ -8185,7 +8186,7 @@ static void perf_event_free_bpf_prog(struct perf_event *event)
8185 return; 8186 return;
8186 8187
8187 prog = event->tp_event->prog; 8188 prog = event->tp_event->prog;
8188 if (prog) { 8189 if (prog && event->tp_event->bpf_prog_owner == event) {
8189 event->tp_event->prog = NULL; 8190 event->tp_event->prog = NULL;
8190 bpf_prog_put(prog); 8191 bpf_prog_put(prog);
8191 } 8192 }
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index af71a84e12ee..f684d8e5fa2b 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -412,6 +412,19 @@ err:
412 return NULL; 412 return NULL;
413} 413}
414 414
415static bool __always_inline rb_need_aux_wakeup(struct ring_buffer *rb)
416{
417 if (rb->aux_overwrite)
418 return false;
419
420 if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) {
421 rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark);
422 return true;
423 }
424
425 return false;
426}
427
415/* 428/*
416 * Commit the data written by hardware into the ring buffer by adjusting 429 * Commit the data written by hardware into the ring buffer by adjusting
417 * aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the 430 * aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the
@@ -451,10 +464,8 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
451 } 464 }
452 465
453 rb->user_page->aux_head = rb->aux_head; 466 rb->user_page->aux_head = rb->aux_head;
454 if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) { 467 if (rb_need_aux_wakeup(rb))
455 wakeup = true; 468 wakeup = true;
456 rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark);
457 }
458 469
459 if (wakeup) { 470 if (wakeup) {
460 if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED) 471 if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED)
@@ -484,9 +495,8 @@ int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size)
484 rb->aux_head += size; 495 rb->aux_head += size;
485 496
486 rb->user_page->aux_head = rb->aux_head; 497 rb->user_page->aux_head = rb->aux_head;
487 if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) { 498 if (rb_need_aux_wakeup(rb)) {
488 perf_output_wakeup(handle); 499 perf_output_wakeup(handle);
489 rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark);
490 handle->wakeup = rb->aux_wakeup + rb->aux_watermark; 500 handle->wakeup = rb->aux_wakeup + rb->aux_watermark;
491 } 501 }
492 502
diff --git a/kernel/exit.c b/kernel/exit.c
index 3481ababd06a..f2cd53e92147 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1600,12 +1600,10 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
1600 struct waitid_info info = {.status = 0}; 1600 struct waitid_info info = {.status = 0};
1601 long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL); 1601 long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL);
1602 int signo = 0; 1602 int signo = 0;
1603
1603 if (err > 0) { 1604 if (err > 0) {
1604 signo = SIGCHLD; 1605 signo = SIGCHLD;
1605 err = 0; 1606 err = 0;
1606 }
1607
1608 if (!err) {
1609 if (ru && copy_to_user(ru, &r, sizeof(struct rusage))) 1607 if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
1610 return -EFAULT; 1608 return -EFAULT;
1611 } 1609 }
@@ -1723,16 +1721,15 @@ COMPAT_SYSCALL_DEFINE5(waitid,
1723 if (err > 0) { 1721 if (err > 0) {
1724 signo = SIGCHLD; 1722 signo = SIGCHLD;
1725 err = 0; 1723 err = 0;
1726 } 1724 if (uru) {
1727 1725 /* kernel_waitid() overwrites everything in ru */
1728 if (!err && uru) { 1726 if (COMPAT_USE_64BIT_TIME)
1729 /* kernel_waitid() overwrites everything in ru */ 1727 err = copy_to_user(uru, &ru, sizeof(ru));
1730 if (COMPAT_USE_64BIT_TIME) 1728 else
1731 err = copy_to_user(uru, &ru, sizeof(ru)); 1729 err = put_compat_rusage(&ru, uru);
1732 else 1730 if (err)
1733 err = put_compat_rusage(&ru, uru); 1731 return -EFAULT;
1734 if (err) 1732 }
1735 return -EFAULT;
1736 } 1733 }
1737 1734
1738 if (!infop) 1735 if (!infop)
diff --git a/kernel/extable.c b/kernel/extable.c
index 38c2412401a1..9aa1cc41ecf7 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -102,15 +102,7 @@ int core_kernel_data(unsigned long addr)
102 102
103int __kernel_text_address(unsigned long addr) 103int __kernel_text_address(unsigned long addr)
104{ 104{
105 if (core_kernel_text(addr)) 105 if (kernel_text_address(addr))
106 return 1;
107 if (is_module_text_address(addr))
108 return 1;
109 if (is_ftrace_trampoline(addr))
110 return 1;
111 if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr))
112 return 1;
113 if (is_bpf_text_address(addr))
114 return 1; 106 return 1;
115 /* 107 /*
116 * There might be init symbols in saved stacktraces. 108 * There might be init symbols in saved stacktraces.
@@ -127,17 +119,42 @@ int __kernel_text_address(unsigned long addr)
127 119
128int kernel_text_address(unsigned long addr) 120int kernel_text_address(unsigned long addr)
129{ 121{
122 bool no_rcu;
123 int ret = 1;
124
130 if (core_kernel_text(addr)) 125 if (core_kernel_text(addr))
131 return 1; 126 return 1;
127
128 /*
129 * If a stack dump happens while RCU is not watching, then
130 * RCU needs to be notified that it requires to start
131 * watching again. This can happen either by tracing that
132 * triggers a stack trace, or a WARN() that happens during
133 * coming back from idle, or cpu on or offlining.
134 *
135 * is_module_text_address() as well as the kprobe slots
136 * and is_bpf_text_address() require RCU to be watching.
137 */
138 no_rcu = !rcu_is_watching();
139
140 /* Treat this like an NMI as it can happen anywhere */
141 if (no_rcu)
142 rcu_nmi_enter();
143
132 if (is_module_text_address(addr)) 144 if (is_module_text_address(addr))
133 return 1; 145 goto out;
134 if (is_ftrace_trampoline(addr)) 146 if (is_ftrace_trampoline(addr))
135 return 1; 147 goto out;
136 if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr)) 148 if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr))
137 return 1; 149 goto out;
138 if (is_bpf_text_address(addr)) 150 if (is_bpf_text_address(addr))
139 return 1; 151 goto out;
140 return 0; 152 ret = 0;
153out:
154 if (no_rcu)
155 rcu_nmi_exit();
156
157 return ret;
141} 158}
142 159
143/* 160/*
diff --git a/kernel/fork.c b/kernel/fork.c
index 10646182440f..e702cb9ffbd8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -946,6 +946,24 @@ void mmput(struct mm_struct *mm)
946} 946}
947EXPORT_SYMBOL_GPL(mmput); 947EXPORT_SYMBOL_GPL(mmput);
948 948
949#ifdef CONFIG_MMU
950static void mmput_async_fn(struct work_struct *work)
951{
952 struct mm_struct *mm = container_of(work, struct mm_struct,
953 async_put_work);
954
955 __mmput(mm);
956}
957
958void mmput_async(struct mm_struct *mm)
959{
960 if (atomic_dec_and_test(&mm->mm_users)) {
961 INIT_WORK(&mm->async_put_work, mmput_async_fn);
962 schedule_work(&mm->async_put_work);
963 }
964}
965#endif
966
949/** 967/**
950 * set_mm_exe_file - change a reference to the mm's executable file 968 * set_mm_exe_file - change a reference to the mm's executable file
951 * 969 *
diff --git a/kernel/futex.c b/kernel/futex.c
index 3d38eaf05492..0518a0bfc746 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -821,8 +821,6 @@ static void get_pi_state(struct futex_pi_state *pi_state)
821/* 821/*
822 * Drops a reference to the pi_state object and frees or caches it 822 * Drops a reference to the pi_state object and frees or caches it
823 * when the last reference is gone. 823 * when the last reference is gone.
824 *
825 * Must be called with the hb lock held.
826 */ 824 */
827static void put_pi_state(struct futex_pi_state *pi_state) 825static void put_pi_state(struct futex_pi_state *pi_state)
828{ 826{
@@ -837,16 +835,22 @@ static void put_pi_state(struct futex_pi_state *pi_state)
837 * and has cleaned up the pi_state already 835 * and has cleaned up the pi_state already
838 */ 836 */
839 if (pi_state->owner) { 837 if (pi_state->owner) {
840 raw_spin_lock_irq(&pi_state->owner->pi_lock); 838 struct task_struct *owner;
841 list_del_init(&pi_state->list);
842 raw_spin_unlock_irq(&pi_state->owner->pi_lock);
843 839
844 rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner); 840 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
841 owner = pi_state->owner;
842 if (owner) {
843 raw_spin_lock(&owner->pi_lock);
844 list_del_init(&pi_state->list);
845 raw_spin_unlock(&owner->pi_lock);
846 }
847 rt_mutex_proxy_unlock(&pi_state->pi_mutex, owner);
848 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
845 } 849 }
846 850
847 if (current->pi_state_cache) 851 if (current->pi_state_cache) {
848 kfree(pi_state); 852 kfree(pi_state);
849 else { 853 } else {
850 /* 854 /*
851 * pi_state->list is already empty. 855 * pi_state->list is already empty.
852 * clear pi_state->owner. 856 * clear pi_state->owner.
@@ -907,13 +911,14 @@ void exit_pi_state_list(struct task_struct *curr)
907 raw_spin_unlock_irq(&curr->pi_lock); 911 raw_spin_unlock_irq(&curr->pi_lock);
908 912
909 spin_lock(&hb->lock); 913 spin_lock(&hb->lock);
910 914 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
911 raw_spin_lock_irq(&curr->pi_lock); 915 raw_spin_lock(&curr->pi_lock);
912 /* 916 /*
913 * We dropped the pi-lock, so re-check whether this 917 * We dropped the pi-lock, so re-check whether this
914 * task still owns the PI-state: 918 * task still owns the PI-state:
915 */ 919 */
916 if (head->next != next) { 920 if (head->next != next) {
921 raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
917 spin_unlock(&hb->lock); 922 spin_unlock(&hb->lock);
918 continue; 923 continue;
919 } 924 }
@@ -922,9 +927,10 @@ void exit_pi_state_list(struct task_struct *curr)
922 WARN_ON(list_empty(&pi_state->list)); 927 WARN_ON(list_empty(&pi_state->list));
923 list_del_init(&pi_state->list); 928 list_del_init(&pi_state->list);
924 pi_state->owner = NULL; 929 pi_state->owner = NULL;
925 raw_spin_unlock_irq(&curr->pi_lock); 930 raw_spin_unlock(&curr->pi_lock);
926 931
927 get_pi_state(pi_state); 932 get_pi_state(pi_state);
933 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
928 spin_unlock(&hb->lock); 934 spin_unlock(&hb->lock);
929 935
930 rt_mutex_futex_unlock(&pi_state->pi_mutex); 936 rt_mutex_futex_unlock(&pi_state->pi_mutex);
@@ -1208,6 +1214,10 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
1208 1214
1209 WARN_ON(!list_empty(&pi_state->list)); 1215 WARN_ON(!list_empty(&pi_state->list));
1210 list_add(&pi_state->list, &p->pi_state_list); 1216 list_add(&pi_state->list, &p->pi_state_list);
1217 /*
1218 * Assignment without holding pi_state->pi_mutex.wait_lock is safe
1219 * because there is no concurrency as the object is not published yet.
1220 */
1211 pi_state->owner = p; 1221 pi_state->owner = p;
1212 raw_spin_unlock_irq(&p->pi_lock); 1222 raw_spin_unlock_irq(&p->pi_lock);
1213 1223
@@ -2878,6 +2888,7 @@ retry:
2878 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); 2888 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
2879 spin_unlock(&hb->lock); 2889 spin_unlock(&hb->lock);
2880 2890
2891 /* drops pi_state->pi_mutex.wait_lock */
2881 ret = wake_futex_pi(uaddr, uval, pi_state); 2892 ret = wake_futex_pi(uaddr, uval, pi_state);
2882 2893
2883 put_pi_state(pi_state); 2894 put_pi_state(pi_state);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index f51b7b6d2451..6fc89fd93824 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -202,7 +202,7 @@ __irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force)
202 202
203 irqd_clr_managed_shutdown(d); 203 irqd_clr_managed_shutdown(d);
204 204
205 if (cpumask_any_and(aff, cpu_online_mask) > nr_cpu_ids) { 205 if (cpumask_any_and(aff, cpu_online_mask) >= nr_cpu_ids) {
206 /* 206 /*
207 * Catch code which fiddles with enable_irq() on a managed 207 * Catch code which fiddles with enable_irq() on a managed
208 * and potentially shutdown IRQ. Chained interrupt 208 * and potentially shutdown IRQ. Chained interrupt
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index f7086b78ad6e..5270a54b9fa4 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -322,7 +322,6 @@ int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
322 /* Calc pointer to the next generic chip */ 322 /* Calc pointer to the next generic chip */
323 tmp += sizeof(*gc) + num_ct * sizeof(struct irq_chip_type); 323 tmp += sizeof(*gc) + num_ct * sizeof(struct irq_chip_type);
324 } 324 }
325 d->name = name;
326 return 0; 325 return 0;
327} 326}
328EXPORT_SYMBOL_GPL(__irq_alloc_domain_generic_chips); 327EXPORT_SYMBOL_GPL(__irq_alloc_domain_generic_chips);
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index e84b7056bb08..ac4644e92b49 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -945,7 +945,7 @@ static int virq_debug_show(struct seq_file *m, void *private)
945 struct irq_desc *desc; 945 struct irq_desc *desc;
946 struct irq_domain *domain; 946 struct irq_domain *domain;
947 struct radix_tree_iter iter; 947 struct radix_tree_iter iter;
948 void **slot; 948 void __rcu **slot;
949 int i; 949 int i;
950 950
951 seq_printf(m, " %-16s %-6s %-10s %-10s %s\n", 951 seq_printf(m, " %-16s %-6s %-10s %-10s %s\n",
@@ -1453,7 +1453,7 @@ out_free_desc:
1453/* The irq_data was moved, fix the revmap to refer to the new location */ 1453/* The irq_data was moved, fix the revmap to refer to the new location */
1454static void irq_domain_fix_revmap(struct irq_data *d) 1454static void irq_domain_fix_revmap(struct irq_data *d)
1455{ 1455{
1456 void **slot; 1456 void __rcu **slot;
1457 1457
1458 if (d->hwirq < d->domain->revmap_size) 1458 if (d->hwirq < d->domain->revmap_size)
1459 return; /* Not using radix tree. */ 1459 return; /* Not using radix tree. */
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 573dc52b0806..d00132b5c325 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1643,6 +1643,10 @@ const void *free_irq(unsigned int irq, void *dev_id)
1643#endif 1643#endif
1644 1644
1645 action = __free_irq(irq, dev_id); 1645 action = __free_irq(irq, dev_id);
1646
1647 if (!action)
1648 return NULL;
1649
1646 devname = action->name; 1650 devname = action->name;
1647 kfree(action); 1651 kfree(action);
1648 return devname; 1652 return devname;
diff --git a/kernel/kcmp.c b/kernel/kcmp.c
index ea34ed8bb952..055bb2962a0b 100644
--- a/kernel/kcmp.c
+++ b/kernel/kcmp.c
@@ -131,7 +131,7 @@ static int kcmp_epoll_target(struct task_struct *task1,
131 if (filp_epoll) { 131 if (filp_epoll) {
132 filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff); 132 filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff);
133 fput(filp_epoll); 133 fput(filp_epoll);
134 } else 134 }
135 135
136 if (IS_ERR(filp_tgt)) 136 if (IS_ERR(filp_tgt))
137 return PTR_ERR(filp_tgt); 137 return PTR_ERR(filp_tgt);
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 02f660666ab8..1fefe6dcafd7 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -613,6 +613,33 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
613 DEFINE_WAKE_Q(wake_q); 613 DEFINE_WAKE_Q(wake_q);
614 614
615 /* 615 /*
616 * __rwsem_down_write_failed_common(sem)
617 * rwsem_optimistic_spin(sem)
618 * osq_unlock(sem->osq)
619 * ...
620 * atomic_long_add_return(&sem->count)
621 *
622 * - VS -
623 *
624 * __up_write()
625 * if (atomic_long_sub_return_release(&sem->count) < 0)
626 * rwsem_wake(sem)
627 * osq_is_locked(&sem->osq)
628 *
629 * And __up_write() must observe !osq_is_locked() when it observes the
630 * atomic_long_add_return() in order to not miss a wakeup.
631 *
632 * This boils down to:
633 *
634 * [S.rel] X = 1 [RmW] r0 = (Y += 0)
635 * MB RMB
636 * [RmW] Y += 1 [L] r1 = X
637 *
638 * exists (r0=1 /\ r1=0)
639 */
640 smp_rmb();
641
642 /*
616 * If a spinner is present, it is not necessary to do the wakeup. 643 * If a spinner is present, it is not necessary to do the wakeup.
617 * Try to do wakeup only if the trylock succeeds to minimize 644 * Try to do wakeup only if the trylock succeeds to minimize
618 * spinlock contention which may introduce too much delay in the 645 * spinlock contention which may introduce too much delay in the
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 6bcbfbf1a8fd..403ab9cdb949 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -350,7 +350,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
350 pgprot_t pgprot = PAGE_KERNEL; 350 pgprot_t pgprot = PAGE_KERNEL;
351 struct dev_pagemap *pgmap; 351 struct dev_pagemap *pgmap;
352 struct page_map *page_map; 352 struct page_map *page_map;
353 int error, nid, is_ram; 353 int error, nid, is_ram, i = 0;
354 354
355 align_start = res->start & ~(SECTION_SIZE - 1); 355 align_start = res->start & ~(SECTION_SIZE - 1);
356 align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) 356 align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
@@ -448,6 +448,8 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
448 list_del(&page->lru); 448 list_del(&page->lru);
449 page->pgmap = pgmap; 449 page->pgmap = pgmap;
450 percpu_ref_get(ref); 450 percpu_ref_get(ref);
451 if (!(++i % 1024))
452 cond_resched();
451 } 453 }
452 devres_add(dev, page_map); 454 devres_add(dev, page_map);
453 return __va(res->start); 455 return __va(res->start);
diff --git a/kernel/params.c b/kernel/params.c
index 60b2d8101355..cc9108c2a1fd 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -224,7 +224,7 @@ char *parse_args(const char *doing,
224 } \ 224 } \
225 int param_get_##name(char *buffer, const struct kernel_param *kp) \ 225 int param_get_##name(char *buffer, const struct kernel_param *kp) \
226 { \ 226 { \
227 return scnprintf(buffer, PAGE_SIZE, format, \ 227 return scnprintf(buffer, PAGE_SIZE, format "\n", \
228 *((type *)kp->arg)); \ 228 *((type *)kp->arg)); \
229 } \ 229 } \
230 const struct kernel_param_ops param_ops_##name = { \ 230 const struct kernel_param_ops param_ops_##name = { \
@@ -236,14 +236,14 @@ char *parse_args(const char *doing,
236 EXPORT_SYMBOL(param_ops_##name) 236 EXPORT_SYMBOL(param_ops_##name)
237 237
238 238
239STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", kstrtou8); 239STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", kstrtou8);
240STANDARD_PARAM_DEF(short, short, "%hi", kstrtos16); 240STANDARD_PARAM_DEF(short, short, "%hi", kstrtos16);
241STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", kstrtou16); 241STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", kstrtou16);
242STANDARD_PARAM_DEF(int, int, "%i", kstrtoint); 242STANDARD_PARAM_DEF(int, int, "%i", kstrtoint);
243STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint); 243STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint);
244STANDARD_PARAM_DEF(long, long, "%li", kstrtol); 244STANDARD_PARAM_DEF(long, long, "%li", kstrtol);
245STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul); 245STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul);
246STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull); 246STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull);
247 247
248int param_set_charp(const char *val, const struct kernel_param *kp) 248int param_set_charp(const char *val, const struct kernel_param *kp)
249{ 249{
@@ -270,7 +270,7 @@ EXPORT_SYMBOL(param_set_charp);
270 270
271int param_get_charp(char *buffer, const struct kernel_param *kp) 271int param_get_charp(char *buffer, const struct kernel_param *kp)
272{ 272{
273 return scnprintf(buffer, PAGE_SIZE, "%s", *((char **)kp->arg)); 273 return scnprintf(buffer, PAGE_SIZE, "%s\n", *((char **)kp->arg));
274} 274}
275EXPORT_SYMBOL(param_get_charp); 275EXPORT_SYMBOL(param_get_charp);
276 276
@@ -301,7 +301,7 @@ EXPORT_SYMBOL(param_set_bool);
301int param_get_bool(char *buffer, const struct kernel_param *kp) 301int param_get_bool(char *buffer, const struct kernel_param *kp)
302{ 302{
303 /* Y and N chosen as being relatively non-coder friendly */ 303 /* Y and N chosen as being relatively non-coder friendly */
304 return sprintf(buffer, "%c", *(bool *)kp->arg ? 'Y' : 'N'); 304 return sprintf(buffer, "%c\n", *(bool *)kp->arg ? 'Y' : 'N');
305} 305}
306EXPORT_SYMBOL(param_get_bool); 306EXPORT_SYMBOL(param_get_bool);
307 307
@@ -360,7 +360,7 @@ EXPORT_SYMBOL(param_set_invbool);
360 360
361int param_get_invbool(char *buffer, const struct kernel_param *kp) 361int param_get_invbool(char *buffer, const struct kernel_param *kp)
362{ 362{
363 return sprintf(buffer, "%c", (*(bool *)kp->arg) ? 'N' : 'Y'); 363 return sprintf(buffer, "%c\n", (*(bool *)kp->arg) ? 'N' : 'Y');
364} 364}
365EXPORT_SYMBOL(param_get_invbool); 365EXPORT_SYMBOL(param_get_invbool);
366 366
@@ -460,8 +460,9 @@ static int param_array_get(char *buffer, const struct kernel_param *kp)
460 struct kernel_param p = *kp; 460 struct kernel_param p = *kp;
461 461
462 for (i = off = 0; i < (arr->num ? *arr->num : arr->max); i++) { 462 for (i = off = 0; i < (arr->num ? *arr->num : arr->max); i++) {
463 /* Replace \n with comma */
463 if (i) 464 if (i)
464 buffer[off++] = ','; 465 buffer[off - 1] = ',';
465 p.arg = arr->elem + arr->elemsize * i; 466 p.arg = arr->elem + arr->elemsize * i;
466 check_kparam_locked(p.mod); 467 check_kparam_locked(p.mod);
467 ret = arr->ops->get(buffer + off, &p); 468 ret = arr->ops->get(buffer + off, &p);
@@ -507,7 +508,7 @@ EXPORT_SYMBOL(param_set_copystring);
507int param_get_string(char *buffer, const struct kernel_param *kp) 508int param_get_string(char *buffer, const struct kernel_param *kp)
508{ 509{
509 const struct kparam_string *kps = kp->str; 510 const struct kparam_string *kps = kp->str;
510 return strlcpy(buffer, kps->string, kps->maxlen); 511 return scnprintf(buffer, PAGE_SIZE, "%s\n", kps->string);
511} 512}
512EXPORT_SYMBOL(param_get_string); 513EXPORT_SYMBOL(param_get_string);
513 514
@@ -549,10 +550,6 @@ static ssize_t param_attr_show(struct module_attribute *mattr,
549 kernel_param_lock(mk->mod); 550 kernel_param_lock(mk->mod);
550 count = attribute->param->ops->get(buf, attribute->param); 551 count = attribute->param->ops->get(buf, attribute->param);
551 kernel_param_unlock(mk->mod); 552 kernel_param_unlock(mk->mod);
552 if (count > 0) {
553 strcat(buf, "\n");
554 ++count;
555 }
556 return count; 553 return count;
557} 554}
558 555
@@ -600,7 +597,7 @@ EXPORT_SYMBOL(kernel_param_unlock);
600/* 597/*
601 * add_sysfs_param - add a parameter to sysfs 598 * add_sysfs_param - add a parameter to sysfs
602 * @mk: struct module_kobject 599 * @mk: struct module_kobject
603 * @kparam: the actual parameter definition to add to sysfs 600 * @kp: the actual parameter definition to add to sysfs
604 * @name: name of parameter 601 * @name: name of parameter
605 * 602 *
 606 * Create a kobject for a (per-module) parameter if mp is NULL, and 603 * Create a kobject for a (per-module) parameter if mp is NULL, and
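The net effect of the scnprintf()/sprintf() changes in this file is that the per-type getters now emit the trailing newline themselves and the sysfs wrapper no longer appends one. A hypothetical out-of-tree module sketch (the param_demo name and both parameters are made up) showing what a read of the corresponding sysfs files looks like after the change:

// SPDX-License-Identifier: GPL-2.0
/* Hypothetical module, only meant to illustrate the getter behaviour above. */
#include <linux/init.h>
#include <linux/module.h>
#include <linux/moduleparam.h>

static int demo_value = 42;
module_param(demo_value, int, 0644);
MODULE_PARM_DESC(demo_value, "demo integer parameter");

static char *demo_name = "staging";
module_param(demo_name, charp, 0444);
MODULE_PARM_DESC(demo_name, "demo string parameter");

static int __init param_demo_init(void)
{
        /*
         * With this patch applied:
         *   cat /sys/module/param_demo/parameters/demo_value  ->  "42\n"
         *   cat /sys/module/param_demo/parameters/demo_name   ->  "staging\n"
         * The '\n' now comes from param_get_int()/param_get_charp() rather
         * than from a strcat() in param_attr_show().
         */
        pr_info("param_demo loaded\n");
        return 0;
}

static void __exit param_demo_exit(void)
{
        pr_info("param_demo unloaded\n");
}

module_init(param_demo_init);
module_exit(param_demo_exit);
MODULE_LICENSE("GPL");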
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 3e2b4f519009..ccd2d20e6b06 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -120,22 +120,26 @@ static void s2idle_loop(void)
120 * frozen processes + suspended devices + idle processors. 120 * frozen processes + suspended devices + idle processors.
121 * Thus s2idle_enter() should be called right after 121 * Thus s2idle_enter() should be called right after
122 * all devices have been suspended. 122 * all devices have been suspended.
123 *
124 * Wakeups during the noirq suspend of devices may be spurious,
125 * so prevent them from terminating the loop right away.
123 */ 126 */
124 error = dpm_noirq_suspend_devices(PMSG_SUSPEND); 127 error = dpm_noirq_suspend_devices(PMSG_SUSPEND);
125 if (!error) 128 if (!error)
126 s2idle_enter(); 129 s2idle_enter();
130 else if (error == -EBUSY && pm_wakeup_pending())
131 error = 0;
127 132
128 dpm_noirq_resume_devices(PMSG_RESUME); 133 if (!error && s2idle_ops && s2idle_ops->wake)
129 if (error && (error != -EBUSY || !pm_wakeup_pending())) {
130 dpm_noirq_end();
131 break;
132 }
133
134 if (s2idle_ops && s2idle_ops->wake)
135 s2idle_ops->wake(); 134 s2idle_ops->wake();
136 135
136 dpm_noirq_resume_devices(PMSG_RESUME);
137
137 dpm_noirq_end(); 138 dpm_noirq_end();
138 139
140 if (error)
141 break;
142
139 if (s2idle_ops && s2idle_ops->sync) 143 if (s2idle_ops && s2idle_ops->sync)
140 s2idle_ops->sync(); 144 s2idle_ops->sync();
141 145
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 1250e4bd4b85..b0ad62b0e7b8 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -882,6 +882,11 @@ void rcu_irq_exit(void)
882 882
883 RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_exit() invoked with irqs enabled!!!"); 883 RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_exit() invoked with irqs enabled!!!");
884 rdtp = this_cpu_ptr(&rcu_dynticks); 884 rdtp = this_cpu_ptr(&rcu_dynticks);
885
886 /* Page faults can happen in NMI handlers, so check... */
887 if (rdtp->dynticks_nmi_nesting)
888 return;
889
885 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && 890 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
886 rdtp->dynticks_nesting < 1); 891 rdtp->dynticks_nesting < 1);
887 if (rdtp->dynticks_nesting <= 1) { 892 if (rdtp->dynticks_nesting <= 1) {
@@ -1015,6 +1020,11 @@ void rcu_irq_enter(void)
1015 1020
1016 RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_enter() invoked with irqs enabled!!!"); 1021 RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_enter() invoked with irqs enabled!!!");
1017 rdtp = this_cpu_ptr(&rcu_dynticks); 1022 rdtp = this_cpu_ptr(&rcu_dynticks);
1023
1024 /* Page faults can happen in NMI handlers, so check... */
1025 if (rdtp->dynticks_nmi_nesting)
1026 return;
1027
1018 oldval = rdtp->dynticks_nesting; 1028 oldval = rdtp->dynticks_nesting;
1019 rdtp->dynticks_nesting++; 1029 rdtp->dynticks_nesting++;
1020 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && 1030 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 18a6966567da..d17c5da523a0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5166,6 +5166,28 @@ void sched_show_task(struct task_struct *p)
5166 put_task_stack(p); 5166 put_task_stack(p);
5167} 5167}
5168 5168
5169static inline bool
5170state_filter_match(unsigned long state_filter, struct task_struct *p)
5171{
5172 /* no filter, everything matches */
5173 if (!state_filter)
5174 return true;
5175
5176 /* filter, but doesn't match */
5177 if (!(p->state & state_filter))
5178 return false;
5179
5180 /*
5181 * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows
5182 * TASK_KILLABLE).
5183 */
5184 if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE)
5185 return false;
5186
5187 return true;
5188}
5189
5190
5169void show_state_filter(unsigned long state_filter) 5191void show_state_filter(unsigned long state_filter)
5170{ 5192{
5171 struct task_struct *g, *p; 5193 struct task_struct *g, *p;
@@ -5188,7 +5210,7 @@ void show_state_filter(unsigned long state_filter)
5188 */ 5210 */
5189 touch_nmi_watchdog(); 5211 touch_nmi_watchdog();
5190 touch_all_softlockup_watchdogs(); 5212 touch_all_softlockup_watchdogs();
5191 if (!state_filter || (p->state & state_filter)) 5213 if (state_filter_match(state_filter, p))
5192 sched_show_task(p); 5214 sched_show_task(p);
5193 } 5215 }
5194 5216
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 01217fb5a5de..2f93e4a2d9f6 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -466,8 +466,6 @@ static char *task_group_path(struct task_group *tg)
466} 466}
467#endif 467#endif
468 468
469static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
470
471static void 469static void
472print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) 470print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
473{ 471{
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 98b59b5db90b..bb3a38005b9c 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -17,11 +17,13 @@
17#include <linux/audit.h> 17#include <linux/audit.h>
18#include <linux/compat.h> 18#include <linux/compat.h>
19#include <linux/coredump.h> 19#include <linux/coredump.h>
20#include <linux/kmemleak.h>
20#include <linux/sched.h> 21#include <linux/sched.h>
21#include <linux/sched/task_stack.h> 22#include <linux/sched/task_stack.h>
22#include <linux/seccomp.h> 23#include <linux/seccomp.h>
23#include <linux/slab.h> 24#include <linux/slab.h>
24#include <linux/syscalls.h> 25#include <linux/syscalls.h>
26#include <linux/sysctl.h>
25 27
26#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER 28#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
27#include <asm/syscall.h> 29#include <asm/syscall.h>
@@ -42,6 +44,7 @@
42 * get/put helpers should be used when accessing an instance 44 * get/put helpers should be used when accessing an instance
43 * outside of a lifetime-guarded section. In general, this 45 * outside of a lifetime-guarded section. In general, this
44 * is only needed for handling filters shared across tasks. 46 * is only needed for handling filters shared across tasks.
47 * @log: true if all actions except for SECCOMP_RET_ALLOW should be logged
45 * @prev: points to a previously installed, or inherited, filter 48 * @prev: points to a previously installed, or inherited, filter
46 * @prog: the BPF program to evaluate 49 * @prog: the BPF program to evaluate
47 * 50 *
@@ -57,6 +60,7 @@
57 */ 60 */
58struct seccomp_filter { 61struct seccomp_filter {
59 refcount_t usage; 62 refcount_t usage;
63 bool log;
60 struct seccomp_filter *prev; 64 struct seccomp_filter *prev;
61 struct bpf_prog *prog; 65 struct bpf_prog *prog;
62}; 66};
@@ -171,10 +175,15 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
171/** 175/**
172 * seccomp_run_filters - evaluates all seccomp filters against @sd 176 * seccomp_run_filters - evaluates all seccomp filters against @sd
173 * @sd: optional seccomp data to be passed to filters 177 * @sd: optional seccomp data to be passed to filters
178 * @match: stores struct seccomp_filter that resulted in the return value,
179 * unless filter returned SECCOMP_RET_ALLOW, in which case it will
180 * be unchanged.
174 * 181 *
175 * Returns valid seccomp BPF response codes. 182 * Returns valid seccomp BPF response codes.
176 */ 183 */
177static u32 seccomp_run_filters(const struct seccomp_data *sd) 184#define ACTION_ONLY(ret) ((s32)((ret) & (SECCOMP_RET_ACTION_FULL)))
185static u32 seccomp_run_filters(const struct seccomp_data *sd,
186 struct seccomp_filter **match)
178{ 187{
179 struct seccomp_data sd_local; 188 struct seccomp_data sd_local;
180 u32 ret = SECCOMP_RET_ALLOW; 189 u32 ret = SECCOMP_RET_ALLOW;
@@ -184,7 +193,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd)
184 193
185 /* Ensure unexpected behavior doesn't result in failing open. */ 194 /* Ensure unexpected behavior doesn't result in failing open. */
186 if (unlikely(WARN_ON(f == NULL))) 195 if (unlikely(WARN_ON(f == NULL)))
187 return SECCOMP_RET_KILL; 196 return SECCOMP_RET_KILL_PROCESS;
188 197
189 if (!sd) { 198 if (!sd) {
190 populate_seccomp_data(&sd_local); 199 populate_seccomp_data(&sd_local);
@@ -198,8 +207,10 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd)
198 for (; f; f = f->prev) { 207 for (; f; f = f->prev) {
199 u32 cur_ret = BPF_PROG_RUN(f->prog, sd); 208 u32 cur_ret = BPF_PROG_RUN(f->prog, sd);
200 209
201 if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) 210 if (ACTION_ONLY(cur_ret) < ACTION_ONLY(ret)) {
202 ret = cur_ret; 211 ret = cur_ret;
212 *match = f;
213 }
203 } 214 }
204 return ret; 215 return ret;
205} 216}
@@ -444,6 +455,10 @@ static long seccomp_attach_filter(unsigned int flags,
444 return ret; 455 return ret;
445 } 456 }
446 457
458 /* Set log flag, if present. */
459 if (flags & SECCOMP_FILTER_FLAG_LOG)
460 filter->log = true;
461
447 /* 462 /*
448 * If there is an existing filter, make it the prev and don't drop its 463 * If there is an existing filter, make it the prev and don't drop its
449 * task reference. 464 * task reference.
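A rough userspace sketch of how the new flag is meant to be used, assuming the matching 4.14-rc uapi headers (for SECCOMP_FILTER_FLAG_LOG and SECCOMP_RET_LOG) and a libc that defines SYS_seccomp; the always-RET_LOG filter is purely illustrative:

#include <stdio.h>
#include <unistd.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <linux/filter.h>
#include <linux/seccomp.h>

int main(void)
{
        /* Simplest possible filter: log (and allow) every syscall. */
        struct sock_filter insns[] = {
                BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_LOG),
        };
        struct sock_fprog prog = {
                .len    = sizeof(insns) / sizeof(insns[0]),
                .filter = insns,
        };

        /* Required so the filter can be installed without CAP_SYS_ADMIN. */
        if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
                return 1;

        /*
         * SECCOMP_FILTER_FLAG_LOG is what sets filter->log above; it matters
         * for RET_ERRNO/RET_TRAP/RET_TRACE, while RET_LOG and RET_KILL_* are
         * logged regardless of the flag.
         */
        if (syscall(SYS_seccomp, SECCOMP_SET_MODE_FILTER,
                    SECCOMP_FILTER_FLAG_LOG, &prog))
                return 1;

        puts("filter installed; subsequent syscalls are logged by seccomp");
        return 0;
}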
@@ -458,14 +473,19 @@ static long seccomp_attach_filter(unsigned int flags,
458 return 0; 473 return 0;
459} 474}
460 475
476void __get_seccomp_filter(struct seccomp_filter *filter)
477{
478 /* Reference count is bounded by the number of total processes. */
479 refcount_inc(&filter->usage);
480}
481
461/* get_seccomp_filter - increments the reference count of the filter on @tsk */ 482/* get_seccomp_filter - increments the reference count of the filter on @tsk */
462void get_seccomp_filter(struct task_struct *tsk) 483void get_seccomp_filter(struct task_struct *tsk)
463{ 484{
464 struct seccomp_filter *orig = tsk->seccomp.filter; 485 struct seccomp_filter *orig = tsk->seccomp.filter;
465 if (!orig) 486 if (!orig)
466 return; 487 return;
467 /* Reference count is bounded by the number of total processes. */ 488 __get_seccomp_filter(orig);
468 refcount_inc(&orig->usage);
469} 489}
470 490
471static inline void seccomp_filter_free(struct seccomp_filter *filter) 491static inline void seccomp_filter_free(struct seccomp_filter *filter)
@@ -476,10 +496,8 @@ static inline void seccomp_filter_free(struct seccomp_filter *filter)
476 } 496 }
477} 497}
478 498
479/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */ 499static void __put_seccomp_filter(struct seccomp_filter *orig)
480void put_seccomp_filter(struct task_struct *tsk)
481{ 500{
482 struct seccomp_filter *orig = tsk->seccomp.filter;
483 /* Clean up single-reference branches iteratively. */ 501 /* Clean up single-reference branches iteratively. */
484 while (orig && refcount_dec_and_test(&orig->usage)) { 502 while (orig && refcount_dec_and_test(&orig->usage)) {
485 struct seccomp_filter *freeme = orig; 503 struct seccomp_filter *freeme = orig;
@@ -488,6 +506,12 @@ void put_seccomp_filter(struct task_struct *tsk)
488 } 506 }
489} 507}
490 508
509/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */
510void put_seccomp_filter(struct task_struct *tsk)
511{
512 __put_seccomp_filter(tsk->seccomp.filter);
513}
514
491static void seccomp_init_siginfo(siginfo_t *info, int syscall, int reason) 515static void seccomp_init_siginfo(siginfo_t *info, int syscall, int reason)
492{ 516{
493 memset(info, 0, sizeof(*info)); 517 memset(info, 0, sizeof(*info));
@@ -514,6 +538,65 @@ static void seccomp_send_sigsys(int syscall, int reason)
514} 538}
515#endif /* CONFIG_SECCOMP_FILTER */ 539#endif /* CONFIG_SECCOMP_FILTER */
516 540
541/* For use with seccomp_actions_logged */
542#define SECCOMP_LOG_KILL_PROCESS (1 << 0)
543#define SECCOMP_LOG_KILL_THREAD (1 << 1)
544#define SECCOMP_LOG_TRAP (1 << 2)
545#define SECCOMP_LOG_ERRNO (1 << 3)
546#define SECCOMP_LOG_TRACE (1 << 4)
547#define SECCOMP_LOG_LOG (1 << 5)
548#define SECCOMP_LOG_ALLOW (1 << 6)
549
550static u32 seccomp_actions_logged = SECCOMP_LOG_KILL_PROCESS |
551 SECCOMP_LOG_KILL_THREAD |
552 SECCOMP_LOG_TRAP |
553 SECCOMP_LOG_ERRNO |
554 SECCOMP_LOG_TRACE |
555 SECCOMP_LOG_LOG;
556
557static inline void seccomp_log(unsigned long syscall, long signr, u32 action,
558 bool requested)
559{
560 bool log = false;
561
562 switch (action) {
563 case SECCOMP_RET_ALLOW:
564 break;
565 case SECCOMP_RET_TRAP:
566 log = requested && seccomp_actions_logged & SECCOMP_LOG_TRAP;
567 break;
568 case SECCOMP_RET_ERRNO:
569 log = requested && seccomp_actions_logged & SECCOMP_LOG_ERRNO;
570 break;
571 case SECCOMP_RET_TRACE:
572 log = requested && seccomp_actions_logged & SECCOMP_LOG_TRACE;
573 break;
574 case SECCOMP_RET_LOG:
575 log = seccomp_actions_logged & SECCOMP_LOG_LOG;
576 break;
577 case SECCOMP_RET_KILL_THREAD:
578 log = seccomp_actions_logged & SECCOMP_LOG_KILL_THREAD;
579 break;
580 case SECCOMP_RET_KILL_PROCESS:
581 default:
582 log = seccomp_actions_logged & SECCOMP_LOG_KILL_PROCESS;
583 }
584
585 /*
586 * Force an audit message to be emitted when the action is RET_KILL_*,
587 * RET_LOG, or the FILTER_FLAG_LOG bit was set and the action is
588 * allowed to be logged by the admin.
589 */
590 if (log)
591 return __audit_seccomp(syscall, signr, action);
592
593 /*
594 * Let the audit subsystem decide if the action should be audited based
595 * on whether the current task itself is being audited.
596 */
597 return audit_seccomp(syscall, signr, action);
598}
599
517/* 600/*
518 * Secure computing mode 1 allows only read/write/exit/sigreturn. 601 * Secure computing mode 1 allows only read/write/exit/sigreturn.
519 * To be fully secure this must be combined with rlimit 602 * To be fully secure this must be combined with rlimit
@@ -539,7 +622,7 @@ static void __secure_computing_strict(int this_syscall)
539#ifdef SECCOMP_DEBUG 622#ifdef SECCOMP_DEBUG
540 dump_stack(); 623 dump_stack();
541#endif 624#endif
542 audit_seccomp(this_syscall, SIGKILL, SECCOMP_RET_KILL); 625 seccomp_log(this_syscall, SIGKILL, SECCOMP_RET_KILL_THREAD, true);
543 do_exit(SIGKILL); 626 do_exit(SIGKILL);
544} 627}
545 628
@@ -566,6 +649,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
566 const bool recheck_after_trace) 649 const bool recheck_after_trace)
567{ 650{
568 u32 filter_ret, action; 651 u32 filter_ret, action;
652 struct seccomp_filter *match = NULL;
569 int data; 653 int data;
570 654
571 /* 655 /*
@@ -574,9 +658,9 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
574 */ 658 */
575 rmb(); 659 rmb();
576 660
577 filter_ret = seccomp_run_filters(sd); 661 filter_ret = seccomp_run_filters(sd, &match);
578 data = filter_ret & SECCOMP_RET_DATA; 662 data = filter_ret & SECCOMP_RET_DATA;
579 action = filter_ret & SECCOMP_RET_ACTION; 663 action = filter_ret & SECCOMP_RET_ACTION_FULL;
580 664
581 switch (action) { 665 switch (action) {
582 case SECCOMP_RET_ERRNO: 666 case SECCOMP_RET_ERRNO:
@@ -637,14 +721,25 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
637 721
638 return 0; 722 return 0;
639 723
724 case SECCOMP_RET_LOG:
725 seccomp_log(this_syscall, 0, action, true);
726 return 0;
727
640 case SECCOMP_RET_ALLOW: 728 case SECCOMP_RET_ALLOW:
729 /*
730 * Note that the "match" filter will always be NULL for
731 * this action since SECCOMP_RET_ALLOW is the starting
732 * state in seccomp_run_filters().
733 */
641 return 0; 734 return 0;
642 735
643 case SECCOMP_RET_KILL: 736 case SECCOMP_RET_KILL_THREAD:
737 case SECCOMP_RET_KILL_PROCESS:
644 default: 738 default:
645 audit_seccomp(this_syscall, SIGSYS, action); 739 seccomp_log(this_syscall, SIGSYS, action, true);
646 /* Dump core only if this is the last remaining thread. */ 740 /* Dump core only if this is the last remaining thread. */
647 if (get_nr_threads(current) == 1) { 741 if (action == SECCOMP_RET_KILL_PROCESS ||
742 get_nr_threads(current) == 1) {
648 siginfo_t info; 743 siginfo_t info;
649 744
650 /* Show the original registers in the dump. */ 745 /* Show the original registers in the dump. */
@@ -653,13 +748,16 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
653 seccomp_init_siginfo(&info, this_syscall, data); 748 seccomp_init_siginfo(&info, this_syscall, data);
654 do_coredump(&info); 749 do_coredump(&info);
655 } 750 }
656 do_exit(SIGSYS); 751 if (action == SECCOMP_RET_KILL_PROCESS)
752 do_group_exit(SIGSYS);
753 else
754 do_exit(SIGSYS);
657 } 755 }
658 756
659 unreachable(); 757 unreachable();
660 758
661skip: 759skip:
662 audit_seccomp(this_syscall, 0, action); 760 seccomp_log(this_syscall, 0, action, match ? match->log : false);
663 return -1; 761 return -1;
664} 762}
665#else 763#else
@@ -794,6 +892,29 @@ static inline long seccomp_set_mode_filter(unsigned int flags,
794} 892}
795#endif 893#endif
796 894
895static long seccomp_get_action_avail(const char __user *uaction)
896{
897 u32 action;
898
899 if (copy_from_user(&action, uaction, sizeof(action)))
900 return -EFAULT;
901
902 switch (action) {
903 case SECCOMP_RET_KILL_PROCESS:
904 case SECCOMP_RET_KILL_THREAD:
905 case SECCOMP_RET_TRAP:
906 case SECCOMP_RET_ERRNO:
907 case SECCOMP_RET_TRACE:
908 case SECCOMP_RET_LOG:
909 case SECCOMP_RET_ALLOW:
910 break;
911 default:
912 return -EOPNOTSUPP;
913 }
914
915 return 0;
916}
917
797/* Common entry point for both prctl and syscall. */ 918/* Common entry point for both prctl and syscall. */
798static long do_seccomp(unsigned int op, unsigned int flags, 919static long do_seccomp(unsigned int op, unsigned int flags,
799 const char __user *uargs) 920 const char __user *uargs)
@@ -805,6 +926,11 @@ static long do_seccomp(unsigned int op, unsigned int flags,
805 return seccomp_set_mode_strict(); 926 return seccomp_set_mode_strict();
806 case SECCOMP_SET_MODE_FILTER: 927 case SECCOMP_SET_MODE_FILTER:
807 return seccomp_set_mode_filter(flags, uargs); 928 return seccomp_set_mode_filter(flags, uargs);
929 case SECCOMP_GET_ACTION_AVAIL:
930 if (flags != 0)
931 return -EINVAL;
932
933 return seccomp_get_action_avail(uargs);
808 default: 934 default:
809 return -EINVAL; 935 return -EINVAL;
810 } 936 }
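The new operation lets userspace probe whether the running kernel knows a given return action before depending on it. A minimal sketch, again assuming the updated <linux/seccomp.h> and a libc that defines SYS_seccomp:

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/seccomp.h>

int main(void)
{
        unsigned int action = SECCOMP_RET_LOG;

        /* flags must be 0; returns 0 if the action is known, -EOPNOTSUPP otherwise */
        if (syscall(SYS_seccomp, SECCOMP_GET_ACTION_AVAIL, 0, &action) == 0)
                puts("SECCOMP_RET_LOG is supported by this kernel");
        else
                perror("SECCOMP_GET_ACTION_AVAIL");
        return 0;
}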
@@ -908,13 +1034,13 @@ long seccomp_get_filter(struct task_struct *task, unsigned long filter_off,
908 if (!data) 1034 if (!data)
909 goto out; 1035 goto out;
910 1036
911 get_seccomp_filter(task); 1037 __get_seccomp_filter(filter);
912 spin_unlock_irq(&task->sighand->siglock); 1038 spin_unlock_irq(&task->sighand->siglock);
913 1039
914 if (copy_to_user(data, fprog->filter, bpf_classic_proglen(fprog))) 1040 if (copy_to_user(data, fprog->filter, bpf_classic_proglen(fprog)))
915 ret = -EFAULT; 1041 ret = -EFAULT;
916 1042
917 put_seccomp_filter(task); 1043 __put_seccomp_filter(filter);
918 return ret; 1044 return ret;
919 1045
920out: 1046out:
@@ -922,3 +1048,185 @@ out:
922 return ret; 1048 return ret;
923} 1049}
924#endif 1050#endif
1051
1052#ifdef CONFIG_SYSCTL
1053
1054/* Human readable action names for friendly sysctl interaction */
1055#define SECCOMP_RET_KILL_PROCESS_NAME "kill_process"
1056#define SECCOMP_RET_KILL_THREAD_NAME "kill_thread"
1057#define SECCOMP_RET_TRAP_NAME "trap"
1058#define SECCOMP_RET_ERRNO_NAME "errno"
1059#define SECCOMP_RET_TRACE_NAME "trace"
1060#define SECCOMP_RET_LOG_NAME "log"
1061#define SECCOMP_RET_ALLOW_NAME "allow"
1062
1063static const char seccomp_actions_avail[] =
1064 SECCOMP_RET_KILL_PROCESS_NAME " "
1065 SECCOMP_RET_KILL_THREAD_NAME " "
1066 SECCOMP_RET_TRAP_NAME " "
1067 SECCOMP_RET_ERRNO_NAME " "
1068 SECCOMP_RET_TRACE_NAME " "
1069 SECCOMP_RET_LOG_NAME " "
1070 SECCOMP_RET_ALLOW_NAME;
1071
1072struct seccomp_log_name {
1073 u32 log;
1074 const char *name;
1075};
1076
1077static const struct seccomp_log_name seccomp_log_names[] = {
1078 { SECCOMP_LOG_KILL_PROCESS, SECCOMP_RET_KILL_PROCESS_NAME },
1079 { SECCOMP_LOG_KILL_THREAD, SECCOMP_RET_KILL_THREAD_NAME },
1080 { SECCOMP_LOG_TRAP, SECCOMP_RET_TRAP_NAME },
1081 { SECCOMP_LOG_ERRNO, SECCOMP_RET_ERRNO_NAME },
1082 { SECCOMP_LOG_TRACE, SECCOMP_RET_TRACE_NAME },
1083 { SECCOMP_LOG_LOG, SECCOMP_RET_LOG_NAME },
1084 { SECCOMP_LOG_ALLOW, SECCOMP_RET_ALLOW_NAME },
1085 { }
1086};
1087
1088static bool seccomp_names_from_actions_logged(char *names, size_t size,
1089 u32 actions_logged)
1090{
1091 const struct seccomp_log_name *cur;
1092 bool append_space = false;
1093
1094 for (cur = seccomp_log_names; cur->name && size; cur++) {
1095 ssize_t ret;
1096
1097 if (!(actions_logged & cur->log))
1098 continue;
1099
1100 if (append_space) {
1101 ret = strscpy(names, " ", size);
1102 if (ret < 0)
1103 return false;
1104
1105 names += ret;
1106 size -= ret;
1107 } else
1108 append_space = true;
1109
1110 ret = strscpy(names, cur->name, size);
1111 if (ret < 0)
1112 return false;
1113
1114 names += ret;
1115 size -= ret;
1116 }
1117
1118 return true;
1119}
1120
1121static bool seccomp_action_logged_from_name(u32 *action_logged,
1122 const char *name)
1123{
1124 const struct seccomp_log_name *cur;
1125
1126 for (cur = seccomp_log_names; cur->name; cur++) {
1127 if (!strcmp(cur->name, name)) {
1128 *action_logged = cur->log;
1129 return true;
1130 }
1131 }
1132
1133 return false;
1134}
1135
1136static bool seccomp_actions_logged_from_names(u32 *actions_logged, char *names)
1137{
1138 char *name;
1139
1140 *actions_logged = 0;
1141 while ((name = strsep(&names, " ")) && *name) {
1142 u32 action_logged = 0;
1143
1144 if (!seccomp_action_logged_from_name(&action_logged, name))
1145 return false;
1146
1147 *actions_logged |= action_logged;
1148 }
1149
1150 return true;
1151}
1152
1153static int seccomp_actions_logged_handler(struct ctl_table *ro_table, int write,
1154 void __user *buffer, size_t *lenp,
1155 loff_t *ppos)
1156{
1157 char names[sizeof(seccomp_actions_avail)];
1158 struct ctl_table table;
1159 int ret;
1160
1161 if (write && !capable(CAP_SYS_ADMIN))
1162 return -EPERM;
1163
1164 memset(names, 0, sizeof(names));
1165
1166 if (!write) {
1167 if (!seccomp_names_from_actions_logged(names, sizeof(names),
1168 seccomp_actions_logged))
1169 return -EINVAL;
1170 }
1171
1172 table = *ro_table;
1173 table.data = names;
1174 table.maxlen = sizeof(names);
1175 ret = proc_dostring(&table, write, buffer, lenp, ppos);
1176 if (ret)
1177 return ret;
1178
1179 if (write) {
1180 u32 actions_logged;
1181
1182 if (!seccomp_actions_logged_from_names(&actions_logged,
1183 table.data))
1184 return -EINVAL;
1185
1186 if (actions_logged & SECCOMP_LOG_ALLOW)
1187 return -EINVAL;
1188
1189 seccomp_actions_logged = actions_logged;
1190 }
1191
1192 return 0;
1193}
1194
1195static struct ctl_path seccomp_sysctl_path[] = {
1196 { .procname = "kernel", },
1197 { .procname = "seccomp", },
1198 { }
1199};
1200
1201static struct ctl_table seccomp_sysctl_table[] = {
1202 {
1203 .procname = "actions_avail",
1204 .data = (void *) &seccomp_actions_avail,
1205 .maxlen = sizeof(seccomp_actions_avail),
1206 .mode = 0444,
1207 .proc_handler = proc_dostring,
1208 },
1209 {
1210 .procname = "actions_logged",
1211 .mode = 0644,
1212 .proc_handler = seccomp_actions_logged_handler,
1213 },
1214 { }
1215};
1216
1217static int __init seccomp_sysctl_init(void)
1218{
1219 struct ctl_table_header *hdr;
1220
1221 hdr = register_sysctl_paths(seccomp_sysctl_path, seccomp_sysctl_table);
1222 if (!hdr)
1223 pr_warn("seccomp: sysctl registration failed\n");
1224 else
1225 kmemleak_not_leak(hdr);
1226
1227 return 0;
1228}
1229
1230device_initcall(seccomp_sysctl_init)
1231
1232#endif /* CONFIG_SYSCTL */
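The table above surfaces as kernel.seccomp.actions_avail (0444) and kernel.seccomp.actions_logged (0644, CAP_SYS_ADMIN required to write). A small read-only sketch; the /proc paths follow from seccomp_sysctl_path, the rest is illustrative:

#include <stdio.h>

static void dump(const char *path)
{
        char line[256];
        FILE *f = fopen(path, "r");

        if (!f) {
                perror(path);
                return;
        }
        if (fgets(line, sizeof(line), f))
                printf("%s: %s", path, line);   /* the line already ends in '\n' */
        fclose(f);
}

int main(void)
{
        dump("/proc/sys/kernel/seccomp/actions_avail");     /* space-separated action names */
        dump("/proc/sys/kernel/seccomp/actions_logged");    /* subset currently being logged */
        return 0;
}

Writing a space-separated subset such as "kill_process kill_thread errno" to actions_logged narrows what seccomp_log() reports; as the handler above shows, a set containing "allow" is rejected with -EINVAL.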
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 1d71c051a951..5043e7433f4b 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -344,39 +344,30 @@ EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread);
344 * by the client, but only by calling this function. 344 * by the client, but only by calling this function.
345 * This function can only be called on a registered smp_hotplug_thread. 345 * This function can only be called on a registered smp_hotplug_thread.
346 */ 346 */
347int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread, 347void smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
348 const struct cpumask *new) 348 const struct cpumask *new)
349{ 349{
350 struct cpumask *old = plug_thread->cpumask; 350 struct cpumask *old = plug_thread->cpumask;
351 cpumask_var_t tmp; 351 static struct cpumask tmp;
352 unsigned int cpu; 352 unsigned int cpu;
353 353
354 if (!alloc_cpumask_var(&tmp, GFP_KERNEL)) 354 lockdep_assert_cpus_held();
355 return -ENOMEM;
356
357 get_online_cpus();
358 mutex_lock(&smpboot_threads_lock); 355 mutex_lock(&smpboot_threads_lock);
359 356
360 /* Park threads that were exclusively enabled on the old mask. */ 357 /* Park threads that were exclusively enabled on the old mask. */
361 cpumask_andnot(tmp, old, new); 358 cpumask_andnot(&tmp, old, new);
362 for_each_cpu_and(cpu, tmp, cpu_online_mask) 359 for_each_cpu_and(cpu, &tmp, cpu_online_mask)
363 smpboot_park_thread(plug_thread, cpu); 360 smpboot_park_thread(plug_thread, cpu);
364 361
365 /* Unpark threads that are exclusively enabled on the new mask. */ 362 /* Unpark threads that are exclusively enabled on the new mask. */
366 cpumask_andnot(tmp, new, old); 363 cpumask_andnot(&tmp, new, old);
367 for_each_cpu_and(cpu, tmp, cpu_online_mask) 364 for_each_cpu_and(cpu, &tmp, cpu_online_mask)
368 smpboot_unpark_thread(plug_thread, cpu); 365 smpboot_unpark_thread(plug_thread, cpu);
369 366
370 cpumask_copy(old, new); 367 cpumask_copy(old, new);
371 368
372 mutex_unlock(&smpboot_threads_lock); 369 mutex_unlock(&smpboot_threads_lock);
373 put_online_cpus();
374
375 free_cpumask_var(tmp);
376
377 return 0;
378} 370}
379EXPORT_SYMBOL_GPL(smpboot_update_cpumask_percpu_thread);
380 371
381static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD); 372static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD);
382 373
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 6648fbbb8157..d9c31bc2eaea 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -367,7 +367,8 @@ static struct ctl_table kern_table[] = {
367 .data = &sysctl_sched_time_avg, 367 .data = &sysctl_sched_time_avg,
368 .maxlen = sizeof(unsigned int), 368 .maxlen = sizeof(unsigned int),
369 .mode = 0644, 369 .mode = 0644,
370 .proc_handler = proc_dointvec, 370 .proc_handler = proc_dointvec_minmax,
371 .extra1 = &one,
371 }, 372 },
372#ifdef CONFIG_SCHEDSTATS 373#ifdef CONFIG_SCHEDSTATS
373 { 374 {
@@ -871,9 +872,9 @@ static struct ctl_table kern_table[] = {
871#if defined(CONFIG_LOCKUP_DETECTOR) 872#if defined(CONFIG_LOCKUP_DETECTOR)
872 { 873 {
873 .procname = "watchdog", 874 .procname = "watchdog",
874 .data = &watchdog_user_enabled, 875 .data = &watchdog_user_enabled,
875 .maxlen = sizeof (int), 876 .maxlen = sizeof(int),
876 .mode = 0644, 877 .mode = 0644,
877 .proc_handler = proc_watchdog, 878 .proc_handler = proc_watchdog,
878 .extra1 = &zero, 879 .extra1 = &zero,
879 .extra2 = &one, 880 .extra2 = &one,
@@ -889,16 +890,12 @@ static struct ctl_table kern_table[] = {
889 }, 890 },
890 { 891 {
891 .procname = "nmi_watchdog", 892 .procname = "nmi_watchdog",
892 .data = &nmi_watchdog_enabled, 893 .data = &nmi_watchdog_user_enabled,
893 .maxlen = sizeof (int), 894 .maxlen = sizeof(int),
894 .mode = 0644, 895 .mode = NMI_WATCHDOG_SYSCTL_PERM,
895 .proc_handler = proc_nmi_watchdog, 896 .proc_handler = proc_nmi_watchdog,
896 .extra1 = &zero, 897 .extra1 = &zero,
897#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR)
898 .extra2 = &one, 898 .extra2 = &one,
899#else
900 .extra2 = &zero,
901#endif
902 }, 899 },
903 { 900 {
904 .procname = "watchdog_cpumask", 901 .procname = "watchdog_cpumask",
@@ -910,9 +907,9 @@ static struct ctl_table kern_table[] = {
910#ifdef CONFIG_SOFTLOCKUP_DETECTOR 907#ifdef CONFIG_SOFTLOCKUP_DETECTOR
911 { 908 {
912 .procname = "soft_watchdog", 909 .procname = "soft_watchdog",
913 .data = &soft_watchdog_enabled, 910 .data = &soft_watchdog_user_enabled,
914 .maxlen = sizeof (int), 911 .maxlen = sizeof(int),
915 .mode = 0644, 912 .mode = 0644,
916 .proc_handler = proc_soft_watchdog, 913 .proc_handler = proc_soft_watchdog,
917 .extra1 = &zero, 914 .extra1 = &zero,
918 .extra2 = &one, 915 .extra2 = &one,
@@ -2187,8 +2184,6 @@ static int do_proc_douintvec_conv(unsigned long *lvalp,
2187 if (write) { 2184 if (write) {
2188 if (*lvalp > UINT_MAX) 2185 if (*lvalp > UINT_MAX)
2189 return -EINVAL; 2186 return -EINVAL;
2190 if (*lvalp > UINT_MAX)
2191 return -EINVAL;
2192 *valp = *lvalp; 2187 *valp = *lvalp;
2193 } else { 2188 } else {
2194 unsigned int val = *valp; 2189 unsigned int val = *valp;
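From userspace, the watchdog entries above keep their paths; only the backing variables change to the *_user_enabled ones. A throwaway read-only sketch of the files involved (writes need root, and nmi_watchdog may end up 0444 depending on how NMI_WATCHDOG_SYSCTL_PERM resolves):

#include <stdio.h>

static void show(const char *path)
{
        char buf[32];
        FILE *f = fopen(path, "r");

        if (!f) {
                perror(path);
                return;
        }
        if (fgets(buf, sizeof(buf), f))
                printf("%s = %s", path, buf);
        fclose(f);
}

int main(void)
{
        show("/proc/sys/kernel/watchdog");       /* overall on/off switch, 0644 */
        show("/proc/sys/kernel/nmi_watchdog");   /* hard lockup detector */
        show("/proc/sys/kernel/soft_watchdog");  /* soft lockup detector, 0644 */
        return 0;
}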
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 2a685b45b73b..45a3928544ce 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -648,6 +648,12 @@ int blk_trace_startstop(struct request_queue *q, int start)
648} 648}
649EXPORT_SYMBOL_GPL(blk_trace_startstop); 649EXPORT_SYMBOL_GPL(blk_trace_startstop);
650 650
651/*
652 * When reading or writing the blktrace sysfs files, the references to the
653 * opened sysfs or device files should prevent the underlying block device
654 * from being removed. So no further delete protection is really needed.
655 */
656
651/** 657/**
652 * blk_trace_ioctl: - handle the ioctls associated with tracing 658 * blk_trace_ioctl: - handle the ioctls associated with tracing
653 * @bdev: the block device 659 * @bdev: the block device
@@ -665,7 +671,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
665 if (!q) 671 if (!q)
666 return -ENXIO; 672 return -ENXIO;
667 673
668 mutex_lock(&bdev->bd_mutex); 674 mutex_lock(&q->blk_trace_mutex);
669 675
670 switch (cmd) { 676 switch (cmd) {
671 case BLKTRACESETUP: 677 case BLKTRACESETUP:
@@ -691,7 +697,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
691 break; 697 break;
692 } 698 }
693 699
694 mutex_unlock(&bdev->bd_mutex); 700 mutex_unlock(&q->blk_trace_mutex);
695 return ret; 701 return ret;
696} 702}
697 703
@@ -1727,7 +1733,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
1727 if (q == NULL) 1733 if (q == NULL)
1728 goto out_bdput; 1734 goto out_bdput;
1729 1735
1730 mutex_lock(&bdev->bd_mutex); 1736 mutex_lock(&q->blk_trace_mutex);
1731 1737
1732 if (attr == &dev_attr_enable) { 1738 if (attr == &dev_attr_enable) {
1733 ret = sprintf(buf, "%u\n", !!q->blk_trace); 1739 ret = sprintf(buf, "%u\n", !!q->blk_trace);
@@ -1746,7 +1752,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
1746 ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba); 1752 ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba);
1747 1753
1748out_unlock_bdev: 1754out_unlock_bdev:
1749 mutex_unlock(&bdev->bd_mutex); 1755 mutex_unlock(&q->blk_trace_mutex);
1750out_bdput: 1756out_bdput:
1751 bdput(bdev); 1757 bdput(bdev);
1752out: 1758out:
@@ -1788,7 +1794,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
1788 if (q == NULL) 1794 if (q == NULL)
1789 goto out_bdput; 1795 goto out_bdput;
1790 1796
1791 mutex_lock(&bdev->bd_mutex); 1797 mutex_lock(&q->blk_trace_mutex);
1792 1798
1793 if (attr == &dev_attr_enable) { 1799 if (attr == &dev_attr_enable) {
1794 if (value) 1800 if (value)
@@ -1814,7 +1820,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
1814 } 1820 }
1815 1821
1816out_unlock_bdev: 1822out_unlock_bdev:
1817 mutex_unlock(&bdev->bd_mutex); 1823 mutex_unlock(&q->blk_trace_mutex);
1818out_bdput: 1824out_bdput:
1819 bdput(bdev); 1825 bdput(bdev);
1820out: 1826out:
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 6abfafd7f173..8319e09e15b9 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -4954,9 +4954,6 @@ static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
4954static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata; 4954static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata;
4955static int ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer); 4955static int ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer);
4956 4956
4957static unsigned long save_global_trampoline;
4958static unsigned long save_global_flags;
4959
4960static int __init set_graph_function(char *str) 4957static int __init set_graph_function(char *str)
4961{ 4958{
4962 strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); 4959 strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE);
@@ -6808,17 +6805,6 @@ void unregister_ftrace_graph(void)
6808 unregister_pm_notifier(&ftrace_suspend_notifier); 6805 unregister_pm_notifier(&ftrace_suspend_notifier);
6809 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); 6806 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
6810 6807
6811#ifdef CONFIG_DYNAMIC_FTRACE
6812 /*
6813 * Function graph does not allocate the trampoline, but
6814 * other global_ops do. We need to reset the ALLOC_TRAMP flag
6815 * if one was used.
6816 */
6817 global_ops.trampoline = save_global_trampoline;
6818 if (save_global_flags & FTRACE_OPS_FL_ALLOC_TRAMP)
6819 global_ops.flags |= FTRACE_OPS_FL_ALLOC_TRAMP;
6820#endif
6821
6822 out: 6808 out:
6823 mutex_unlock(&ftrace_lock); 6809 mutex_unlock(&ftrace_lock);
6824} 6810}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 5360b7aec57a..752e5daf0896 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4020,11 +4020,17 @@ static int tracing_open(struct inode *inode, struct file *file)
4020 /* If this file was open for write, then erase contents */ 4020 /* If this file was open for write, then erase contents */
4021 if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { 4021 if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
4022 int cpu = tracing_get_cpu(inode); 4022 int cpu = tracing_get_cpu(inode);
4023 struct trace_buffer *trace_buf = &tr->trace_buffer;
4024
4025#ifdef CONFIG_TRACER_MAX_TRACE
4026 if (tr->current_trace->print_max)
4027 trace_buf = &tr->max_buffer;
4028#endif
4023 4029
4024 if (cpu == RING_BUFFER_ALL_CPUS) 4030 if (cpu == RING_BUFFER_ALL_CPUS)
4025 tracing_reset_online_cpus(&tr->trace_buffer); 4031 tracing_reset_online_cpus(trace_buf);
4026 else 4032 else
4027 tracing_reset(&tr->trace_buffer, cpu); 4033 tracing_reset(trace_buf, cpu);
4028 } 4034 }
4029 4035
4030 if (file->f_mode & FMODE_READ) { 4036 if (file->f_mode & FMODE_READ) {
@@ -5358,6 +5364,13 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf)
5358 if (t == tr->current_trace) 5364 if (t == tr->current_trace)
5359 goto out; 5365 goto out;
5360 5366
5367 /* Some tracers won't work on kernel command line */
5368 if (system_state < SYSTEM_RUNNING && t->noboot) {
5369 pr_warn("Tracer '%s' is not allowed on command line, ignored\n",
5370 t->name);
5371 goto out;
5372 }
5373
5361 /* Some tracers are only allowed for the top level buffer */ 5374 /* Some tracers are only allowed for the top level buffer */
5362 if (!trace_ok_for_array(t, tr)) { 5375 if (!trace_ok_for_array(t, tr)) {
5363 ret = -EINVAL; 5376 ret = -EINVAL;
@@ -5667,7 +5680,7 @@ static int tracing_wait_pipe(struct file *filp)
5667 * 5680 *
5668 * iter->pos will be 0 if we haven't read anything. 5681 * iter->pos will be 0 if we haven't read anything.
5669 */ 5682 */
5670 if (!tracing_is_on() && iter->pos) 5683 if (!tracer_tracing_is_on(iter->tr) && iter->pos)
5671 break; 5684 break;
5672 5685
5673 mutex_unlock(&iter->mutex); 5686 mutex_unlock(&iter->mutex);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index fb5d54d0d1b3..652c682707cd 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -444,6 +444,8 @@ struct tracer {
444#ifdef CONFIG_TRACER_MAX_TRACE 444#ifdef CONFIG_TRACER_MAX_TRACE
445 bool use_max_tr; 445 bool use_max_tr;
446#endif 446#endif
447 /* True if tracer cannot be enabled in kernel param */
448 bool noboot;
447}; 449};
448 450
449 451
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index cd7480d0a201..dca78fc48439 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -282,6 +282,7 @@ static struct tracer mmio_tracer __read_mostly =
282 .close = mmio_close, 282 .close = mmio_close,
283 .read = mmio_read, 283 .read = mmio_read,
284 .print_line = mmio_print_line, 284 .print_line = mmio_print_line,
285 .noboot = true,
285}; 286};
286 287
287__init static int init_mmio_trace(void) 288__init static int init_mmio_trace(void)
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index bac629af2285..c738e764e2a5 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -656,15 +656,6 @@ int trace_print_lat_context(struct trace_iterator *iter)
656 return !trace_seq_has_overflowed(s); 656 return !trace_seq_has_overflowed(s);
657} 657}
658 658
659static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
660
661static int task_state_char(unsigned long state)
662{
663 int bit = state ? __ffs(state) + 1 : 0;
664
665 return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?';
666}
667
668/** 659/**
669 * ftrace_find_event - find a registered event 660 * ftrace_find_event - find a registered event
670 * @type: the type of event to look for 661 * @type: the type of event to look for
@@ -930,8 +921,8 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
930 921
931 trace_assign_type(field, iter->ent); 922 trace_assign_type(field, iter->ent);
932 923
933 T = task_state_char(field->next_state); 924 T = __task_state_to_char(field->next_state);
934 S = task_state_char(field->prev_state); 925 S = __task_state_to_char(field->prev_state);
935 trace_find_cmdline(field->next_pid, comm); 926 trace_find_cmdline(field->next_pid, comm);
936 trace_seq_printf(&iter->seq, 927 trace_seq_printf(&iter->seq,
937 " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", 928 " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
@@ -966,8 +957,8 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S)
966 trace_assign_type(field, iter->ent); 957 trace_assign_type(field, iter->ent);
967 958
968 if (!S) 959 if (!S)
969 S = task_state_char(field->prev_state); 960 S = __task_state_to_char(field->prev_state);
970 T = task_state_char(field->next_state); 961 T = __task_state_to_char(field->next_state);
971 trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n", 962 trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n",
972 field->prev_pid, 963 field->prev_pid,
973 field->prev_prio, 964 field->prev_prio,
@@ -1002,8 +993,8 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S)
1002 trace_assign_type(field, iter->ent); 993 trace_assign_type(field, iter->ent);
1003 994
1004 if (!S) 995 if (!S)
1005 S = task_state_char(field->prev_state); 996 S = __task_state_to_char(field->prev_state);
1006 T = task_state_char(field->next_state); 997 T = __task_state_to_char(field->next_state);
1007 998
1008 SEQ_PUT_HEX_FIELD(s, field->prev_pid); 999 SEQ_PUT_HEX_FIELD(s, field->prev_pid);
1009 SEQ_PUT_HEX_FIELD(s, field->prev_prio); 1000 SEQ_PUT_HEX_FIELD(s, field->prev_prio);
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index ddec53b67646..0c331978b1a6 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -397,10 +397,10 @@ tracing_sched_switch_trace(struct trace_array *tr,
397 entry = ring_buffer_event_data(event); 397 entry = ring_buffer_event_data(event);
398 entry->prev_pid = prev->pid; 398 entry->prev_pid = prev->pid;
399 entry->prev_prio = prev->prio; 399 entry->prev_prio = prev->prio;
400 entry->prev_state = prev->state; 400 entry->prev_state = __get_task_state(prev);
401 entry->next_pid = next->pid; 401 entry->next_pid = next->pid;
402 entry->next_prio = next->prio; 402 entry->next_prio = next->prio;
403 entry->next_state = next->state; 403 entry->next_state = __get_task_state(next);
404 entry->next_cpu = task_cpu(next); 404 entry->next_cpu = task_cpu(next);
405 405
406 if (!call_filter_check_discard(call, entry, buffer, event)) 406 if (!call_filter_check_discard(call, entry, buffer, event))
@@ -425,10 +425,10 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
425 entry = ring_buffer_event_data(event); 425 entry = ring_buffer_event_data(event);
426 entry->prev_pid = curr->pid; 426 entry->prev_pid = curr->pid;
427 entry->prev_prio = curr->prio; 427 entry->prev_prio = curr->prio;
428 entry->prev_state = curr->state; 428 entry->prev_state = __get_task_state(curr);
429 entry->next_pid = wakee->pid; 429 entry->next_pid = wakee->pid;
430 entry->next_prio = wakee->prio; 430 entry->next_prio = wakee->prio;
431 entry->next_state = wakee->state; 431 entry->next_state = __get_task_state(wakee);
432 entry->next_cpu = task_cpu(wakee); 432 entry->next_cpu = task_cpu(wakee);
433 433
434 if (!call_filter_check_discard(call, entry, buffer, event)) 434 if (!call_filter_check_discard(call, entry, buffer, event))
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index a4df67cbc711..49cb41412eec 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -96,23 +96,9 @@ check_stack(unsigned long ip, unsigned long *stack)
96 if (in_nmi()) 96 if (in_nmi())
97 return; 97 return;
98 98
99 /*
100 * There's a slight chance that we are tracing inside the
101 * RCU infrastructure, and rcu_irq_enter() will not work
102 * as expected.
103 */
104 if (unlikely(rcu_irq_enter_disabled()))
105 return;
106
107 local_irq_save(flags); 99 local_irq_save(flags);
108 arch_spin_lock(&stack_trace_max_lock); 100 arch_spin_lock(&stack_trace_max_lock);
109 101
110 /*
111 * RCU may not be watching, make it see us.
112 * The stack trace code uses rcu_sched.
113 */
114 rcu_irq_enter();
115
116 /* In case another CPU set the tracer_frame on us */ 102 /* In case another CPU set the tracer_frame on us */
117 if (unlikely(!frame_size)) 103 if (unlikely(!frame_size))
118 this_size -= tracer_frame; 104 this_size -= tracer_frame;
@@ -205,7 +191,6 @@ check_stack(unsigned long ip, unsigned long *stack)
205 } 191 }
206 192
207 out: 193 out:
208 rcu_irq_exit();
209 arch_spin_unlock(&stack_trace_max_lock); 194 arch_spin_unlock(&stack_trace_max_lock);
210 local_irq_restore(flags); 195 local_irq_restore(flags);
211} 196}
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index f5d52024f6b7..6bcb854909c0 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -29,20 +29,29 @@
29#include <linux/kvm_para.h> 29#include <linux/kvm_para.h>
30#include <linux/kthread.h> 30#include <linux/kthread.h>
31 31
32/* Watchdog configuration */ 32static DEFINE_MUTEX(watchdog_mutex);
33static DEFINE_MUTEX(watchdog_proc_mutex);
34
35int __read_mostly nmi_watchdog_enabled;
36 33
37#if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG) 34#if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG)
38unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED | 35# define WATCHDOG_DEFAULT (SOFT_WATCHDOG_ENABLED | NMI_WATCHDOG_ENABLED)
39 NMI_WATCHDOG_ENABLED; 36# define NMI_WATCHDOG_DEFAULT 1
40#else 37#else
41unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED; 38# define WATCHDOG_DEFAULT (SOFT_WATCHDOG_ENABLED)
39# define NMI_WATCHDOG_DEFAULT 0
42#endif 40#endif
43 41
42unsigned long __read_mostly watchdog_enabled;
43int __read_mostly watchdog_user_enabled = 1;
44int __read_mostly nmi_watchdog_user_enabled = NMI_WATCHDOG_DEFAULT;
45int __read_mostly soft_watchdog_user_enabled = 1;
46int __read_mostly watchdog_thresh = 10;
47int __read_mostly nmi_watchdog_available;
48
49struct cpumask watchdog_allowed_mask __read_mostly;
50
51struct cpumask watchdog_cpumask __read_mostly;
52unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
53
44#ifdef CONFIG_HARDLOCKUP_DETECTOR 54#ifdef CONFIG_HARDLOCKUP_DETECTOR
45/* boot commands */
46/* 55/*
47 * Should we panic when a soft-lockup or hard-lockup occurs: 56 * Should we panic when a soft-lockup or hard-lockup occurs:
48 */ 57 */
@@ -56,9 +65,9 @@ unsigned int __read_mostly hardlockup_panic =
56 * kernel command line parameters are parsed, because otherwise it is not 65 * kernel command line parameters are parsed, because otherwise it is not
57 * possible to override this in hardlockup_panic_setup(). 66 * possible to override this in hardlockup_panic_setup().
58 */ 67 */
59void hardlockup_detector_disable(void) 68void __init hardlockup_detector_disable(void)
60{ 69{
61 watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; 70 nmi_watchdog_user_enabled = 0;
62} 71}
63 72
64static int __init hardlockup_panic_setup(char *str) 73static int __init hardlockup_panic_setup(char *str)
@@ -68,48 +77,24 @@ static int __init hardlockup_panic_setup(char *str)
68 else if (!strncmp(str, "nopanic", 7)) 77 else if (!strncmp(str, "nopanic", 7))
69 hardlockup_panic = 0; 78 hardlockup_panic = 0;
70 else if (!strncmp(str, "0", 1)) 79 else if (!strncmp(str, "0", 1))
71 watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; 80 nmi_watchdog_user_enabled = 0;
72 else if (!strncmp(str, "1", 1)) 81 else if (!strncmp(str, "1", 1))
73 watchdog_enabled |= NMI_WATCHDOG_ENABLED; 82 nmi_watchdog_user_enabled = 1;
74 return 1; 83 return 1;
75} 84}
76__setup("nmi_watchdog=", hardlockup_panic_setup); 85__setup("nmi_watchdog=", hardlockup_panic_setup);
77 86
78#endif 87# ifdef CONFIG_SMP
79
80#ifdef CONFIG_SOFTLOCKUP_DETECTOR
81int __read_mostly soft_watchdog_enabled;
82#endif
83
84int __read_mostly watchdog_user_enabled;
85int __read_mostly watchdog_thresh = 10;
86
87#ifdef CONFIG_SMP
88int __read_mostly sysctl_softlockup_all_cpu_backtrace;
89int __read_mostly sysctl_hardlockup_all_cpu_backtrace; 88int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
90#endif
91struct cpumask watchdog_cpumask __read_mostly;
92unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
93 89
94/* 90static int __init hardlockup_all_cpu_backtrace_setup(char *str)
95 * The 'watchdog_running' variable is set to 1 when the watchdog threads 91{
96 * are registered/started and is set to 0 when the watchdog threads are 92 sysctl_hardlockup_all_cpu_backtrace = !!simple_strtol(str, NULL, 0);
97 * unregistered/stopped, so it is an indicator whether the threads exist. 93 return 1;
98 */ 94}
99static int __read_mostly watchdog_running; 95__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup);
100/* 96# endif /* CONFIG_SMP */
101 * If a subsystem has a need to deactivate the watchdog temporarily, it 97#endif /* CONFIG_HARDLOCKUP_DETECTOR */
102 * can use the suspend/resume interface to achieve this. The content of
103 * the 'watchdog_suspended' variable reflects this state. Existing threads
104 * are parked/unparked by the lockup_detector_{suspend|resume} functions
105 * (see comment blocks pertaining to those functions for further details).
106 *
107 * 'watchdog_suspended' also prevents threads from being registered/started
108 * or unregistered/stopped via parameters in /proc/sys/kernel, so the state
109 * of 'watchdog_running' cannot change while the watchdog is deactivated
110 * temporarily (see related code in 'proc' handlers).
111 */
112int __read_mostly watchdog_suspended;
113 98
114/* 99/*
115 * These functions can be overridden if an architecture implements its 100 * These functions can be overridden if an architecture implements its
@@ -121,36 +106,68 @@ int __read_mostly watchdog_suspended;
121 */ 106 */
122int __weak watchdog_nmi_enable(unsigned int cpu) 107int __weak watchdog_nmi_enable(unsigned int cpu)
123{ 108{
109 hardlockup_detector_perf_enable();
124 return 0; 110 return 0;
125} 111}
112
126void __weak watchdog_nmi_disable(unsigned int cpu) 113void __weak watchdog_nmi_disable(unsigned int cpu)
127{ 114{
115 hardlockup_detector_perf_disable();
128} 116}
129 117
130/* 118/* Return 0, if a NMI watchdog is available. Error code otherwise */
131 * watchdog_nmi_reconfigure can be implemented to be notified after any 119int __weak __init watchdog_nmi_probe(void)
132 * watchdog configuration change. The arch hardlockup watchdog should 120{
133 * respond to the following variables: 121 return hardlockup_detector_perf_init();
134 * - nmi_watchdog_enabled 122}
123
124/**
125 * watchdog_nmi_stop - Stop the watchdog for reconfiguration
126 *
127 * The reconfiguration steps are:
128 * watchdog_nmi_stop();
129 * update_variables();
130 * watchdog_nmi_start();
131 */
132void __weak watchdog_nmi_stop(void) { }
133
134/**
135 * watchdog_nmi_start - Start the watchdog after reconfiguration
136 *
137 * Counterpart to watchdog_nmi_stop().
138 *
139 * The following variables have been updated in update_variables() and
140 * contain the currently valid configuration:
141 * - watchdog_enabled
135 * - watchdog_thresh 142 * - watchdog_thresh
136 * - watchdog_cpumask 143 * - watchdog_cpumask
137 * - sysctl_hardlockup_all_cpu_backtrace
138 * - hardlockup_panic
139 * - watchdog_suspended
140 */ 144 */
141void __weak watchdog_nmi_reconfigure(void) 145void __weak watchdog_nmi_start(void) { }
146
147/**
148 * lockup_detector_update_enable - Update the sysctl enable bit
149 *
150 * Caller needs to make sure that the NMI/perf watchdogs are off, so this
151 * can't race with watchdog_nmi_disable().
152 */
153static void lockup_detector_update_enable(void)
142{ 154{
155 watchdog_enabled = 0;
156 if (!watchdog_user_enabled)
157 return;
158 if (nmi_watchdog_available && nmi_watchdog_user_enabled)
159 watchdog_enabled |= NMI_WATCHDOG_ENABLED;
160 if (soft_watchdog_user_enabled)
161 watchdog_enabled |= SOFT_WATCHDOG_ENABLED;
143} 162}
144 163
145
146#ifdef CONFIG_SOFTLOCKUP_DETECTOR 164#ifdef CONFIG_SOFTLOCKUP_DETECTOR
147 165
148/* Helper for online, unparked cpus. */ 166/* Global variables, exported for sysctl */
149#define for_each_watchdog_cpu(cpu) \ 167unsigned int __read_mostly softlockup_panic =
150 for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask) 168 CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
151
152atomic_t watchdog_park_in_progress = ATOMIC_INIT(0);
153 169
170static bool softlockup_threads_initialized __read_mostly;
154static u64 __read_mostly sample_period; 171static u64 __read_mostly sample_period;
155 172
156static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); 173static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
@@ -164,50 +181,40 @@ static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved);
164static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); 181static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
165static unsigned long soft_lockup_nmi_warn; 182static unsigned long soft_lockup_nmi_warn;
166 183
167unsigned int __read_mostly softlockup_panic =
168 CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
169
170static int __init softlockup_panic_setup(char *str) 184static int __init softlockup_panic_setup(char *str)
171{ 185{
172 softlockup_panic = simple_strtoul(str, NULL, 0); 186 softlockup_panic = simple_strtoul(str, NULL, 0);
173
174 return 1; 187 return 1;
175} 188}
176__setup("softlockup_panic=", softlockup_panic_setup); 189__setup("softlockup_panic=", softlockup_panic_setup);
177 190
178static int __init nowatchdog_setup(char *str) 191static int __init nowatchdog_setup(char *str)
179{ 192{
180 watchdog_enabled = 0; 193 watchdog_user_enabled = 0;
181 return 1; 194 return 1;
182} 195}
183__setup("nowatchdog", nowatchdog_setup); 196__setup("nowatchdog", nowatchdog_setup);
184 197
185static int __init nosoftlockup_setup(char *str) 198static int __init nosoftlockup_setup(char *str)
186{ 199{
187 watchdog_enabled &= ~SOFT_WATCHDOG_ENABLED; 200 soft_watchdog_user_enabled = 0;
188 return 1; 201 return 1;
189} 202}
190__setup("nosoftlockup", nosoftlockup_setup); 203__setup("nosoftlockup", nosoftlockup_setup);
191 204
192#ifdef CONFIG_SMP 205#ifdef CONFIG_SMP
206int __read_mostly sysctl_softlockup_all_cpu_backtrace;
207
193static int __init softlockup_all_cpu_backtrace_setup(char *str) 208static int __init softlockup_all_cpu_backtrace_setup(char *str)
194{ 209{
195 sysctl_softlockup_all_cpu_backtrace = 210 sysctl_softlockup_all_cpu_backtrace = !!simple_strtol(str, NULL, 0);
196 !!simple_strtol(str, NULL, 0);
197 return 1; 211 return 1;
198} 212}
199__setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup); 213__setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup);
200#ifdef CONFIG_HARDLOCKUP_DETECTOR
201static int __init hardlockup_all_cpu_backtrace_setup(char *str)
202{
203 sysctl_hardlockup_all_cpu_backtrace =
204 !!simple_strtol(str, NULL, 0);
205 return 1;
206}
207__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup);
208#endif
209#endif 214#endif
210 215
216static void __lockup_detector_cleanup(void);
217
211/* 218/*
212 * Hard-lockup warnings should be triggered after just a few seconds. Soft- 219 * Hard-lockup warnings should be triggered after just a few seconds. Soft-
213 * lockups can have false positives under extreme conditions. So we generally 220 * lockups can have false positives under extreme conditions. So we generally
@@ -278,11 +285,15 @@ void touch_all_softlockup_watchdogs(void)
278 int cpu; 285 int cpu;
279 286
280 /* 287 /*
281 * this is done lockless 288 * watchdog_mutex cannot be taken here, as this might be called
282 * do we care if a 0 races with a timestamp? 289 * from (soft)interrupt context, so the access to
283 * all it means is the softlock check starts one cycle later 290 * watchdog_allowed_mask might race with a concurrent update.
291 *
292 * The watchdog time stamp can race against a concurrent real
293 * update as well, the only side effect might be a cycle delay for
294 * the softlockup check.
284 */ 295 */
285 for_each_watchdog_cpu(cpu) 296 for_each_cpu(cpu, &watchdog_allowed_mask)
286 per_cpu(watchdog_touch_ts, cpu) = 0; 297 per_cpu(watchdog_touch_ts, cpu) = 0;
287 wq_watchdog_touch(-1); 298 wq_watchdog_touch(-1);
288} 299}
@@ -322,9 +333,6 @@ static void watchdog_interrupt_count(void)
322 __this_cpu_inc(hrtimer_interrupts); 333 __this_cpu_inc(hrtimer_interrupts);
323} 334}
324 335
325static int watchdog_enable_all_cpus(void);
326static void watchdog_disable_all_cpus(void);
327
328/* watchdog kicker functions */ 336/* watchdog kicker functions */
329static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) 337static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
330{ 338{
@@ -333,7 +341,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
333 int duration; 341 int duration;
334 int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace; 342 int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;
335 343
336 if (atomic_read(&watchdog_park_in_progress) != 0) 344 if (!watchdog_enabled)
337 return HRTIMER_NORESTART; 345 return HRTIMER_NORESTART;
338 346
339 /* kick the hardlockup detector */ 347 /* kick the hardlockup detector */
@@ -447,32 +455,38 @@ static void watchdog_set_prio(unsigned int policy, unsigned int prio)
447 455
448static void watchdog_enable(unsigned int cpu) 456static void watchdog_enable(unsigned int cpu)
449{ 457{
450 struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer); 458 struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);
451 459
452 /* kick off the timer for the hardlockup detector */ 460 /*
461 * Start the timer first to prevent the NMI watchdog triggering
462 * before the timer has a chance to fire.
463 */
453 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 464 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
454 hrtimer->function = watchdog_timer_fn; 465 hrtimer->function = watchdog_timer_fn;
455
456 /* Enable the perf event */
457 watchdog_nmi_enable(cpu);
458
459 /* done here because hrtimer_start can only pin to smp_processor_id() */
460 hrtimer_start(hrtimer, ns_to_ktime(sample_period), 466 hrtimer_start(hrtimer, ns_to_ktime(sample_period),
461 HRTIMER_MODE_REL_PINNED); 467 HRTIMER_MODE_REL_PINNED);
462 468
463 /* initialize timestamp */ 469 /* Initialize timestamp */
464 watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1);
465 __touch_watchdog(); 470 __touch_watchdog();
471 /* Enable the perf event */
472 if (watchdog_enabled & NMI_WATCHDOG_ENABLED)
473 watchdog_nmi_enable(cpu);
474
475 watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1);
466} 476}
467 477
468static void watchdog_disable(unsigned int cpu) 478static void watchdog_disable(unsigned int cpu)
469{ 479{
470 struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer); 480 struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);
471 481
472 watchdog_set_prio(SCHED_NORMAL, 0); 482 watchdog_set_prio(SCHED_NORMAL, 0);
473 hrtimer_cancel(hrtimer); 483 /*
474 /* disable the perf event */ 484 * Disable the perf event first. Otherwise a large delay
485 * between cancelling the timer and disabling the perf event could
486 * let the perf NMI detect a false positive.
487 */
475 watchdog_nmi_disable(cpu); 488 watchdog_nmi_disable(cpu);
489 hrtimer_cancel(hrtimer);
476} 490}
477 491
478static void watchdog_cleanup(unsigned int cpu, bool online) 492static void watchdog_cleanup(unsigned int cpu, bool online)
@@ -499,21 +513,6 @@ static void watchdog(unsigned int cpu)
499 __this_cpu_write(soft_lockup_hrtimer_cnt, 513 __this_cpu_write(soft_lockup_hrtimer_cnt,
500 __this_cpu_read(hrtimer_interrupts)); 514 __this_cpu_read(hrtimer_interrupts));
501 __touch_watchdog(); 515 __touch_watchdog();
502
503 /*
504 * watchdog_nmi_enable() clears the NMI_WATCHDOG_ENABLED bit in the
505 * failure path. Check for failures that can occur asynchronously -
506 * for example, when CPUs are on-lined - and shut down the hardware
507 * perf event on each CPU accordingly.
508 *
509 * The only non-obvious place this bit can be cleared is through
510 * watchdog_nmi_enable(), so a pr_info() is placed there. Placing a
511 * pr_info here would be too noisy as it would result in a message
512 * every few seconds if the hardlockup was disabled but the softlockup
513 * enabled.
514 */
515 if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
516 watchdog_nmi_disable(cpu);
517} 516}
518 517
519static struct smp_hotplug_thread watchdog_threads = { 518static struct smp_hotplug_thread watchdog_threads = {
@@ -527,295 +526,174 @@ static struct smp_hotplug_thread watchdog_threads = {
527 .unpark = watchdog_enable, 526 .unpark = watchdog_enable,
528}; 527};
529 528
530/* 529static void softlockup_update_smpboot_threads(void)
531 * park all watchdog threads that are specified in 'watchdog_cpumask'
532 *
533 * This function returns an error if kthread_park() of a watchdog thread
534 * fails. In this situation, the watchdog threads of some CPUs can already
535 * be parked and the watchdog threads of other CPUs can still be runnable.
536 * Callers are expected to handle this special condition as appropriate in
537 * their context.
538 *
539 * This function may only be called in a context that is protected against
540 * races with CPU hotplug - for example, via get_online_cpus().
541 */
542static int watchdog_park_threads(void)
543{ 530{
544 int cpu, ret = 0; 531 lockdep_assert_held(&watchdog_mutex);
545 532
546 atomic_set(&watchdog_park_in_progress, 1); 533 if (!softlockup_threads_initialized)
534 return;
547 535
548 for_each_watchdog_cpu(cpu) { 536 smpboot_update_cpumask_percpu_thread(&watchdog_threads,
549 ret = kthread_park(per_cpu(softlockup_watchdog, cpu)); 537 &watchdog_allowed_mask);
550 if (ret)
551 break;
552 }
553
554 atomic_set(&watchdog_park_in_progress, 0);
555
556 return ret;
557} 538}
558 539
559/* 540/* Temporarily park all watchdog threads */
560 * unpark all watchdog threads that are specified in 'watchdog_cpumask' 541static void softlockup_park_all_threads(void)
561 *
562 * This function may only be called in a context that is protected against
563 * races with CPU hotplug - for example, via get_online_cpus().
564 */
565static void watchdog_unpark_threads(void)
566{ 542{
567 int cpu; 543 cpumask_clear(&watchdog_allowed_mask);
568 544 softlockup_update_smpboot_threads();
569 for_each_watchdog_cpu(cpu)
570 kthread_unpark(per_cpu(softlockup_watchdog, cpu));
571} 545}
572 546
573static int update_watchdog_all_cpus(void) 547/* Unpark enabled threads */
548static void softlockup_unpark_threads(void)
574{ 549{
575 int ret; 550 cpumask_copy(&watchdog_allowed_mask, &watchdog_cpumask);
576 551 softlockup_update_smpboot_threads();
577 ret = watchdog_park_threads();
578 if (ret)
579 return ret;
580
581 watchdog_unpark_threads();
582
583 return 0;
584} 552}
585 553
586static int watchdog_enable_all_cpus(void) 554static void lockup_detector_reconfigure(void)
587{ 555{
588 int err = 0; 556 cpus_read_lock();
589 557 watchdog_nmi_stop();
590 if (!watchdog_running) { 558 softlockup_park_all_threads();
591 err = smpboot_register_percpu_thread_cpumask(&watchdog_threads, 559 set_sample_period();
592 &watchdog_cpumask); 560 lockup_detector_update_enable();
593 if (err) 561 if (watchdog_enabled && watchdog_thresh)
594 pr_err("Failed to create watchdog threads, disabled\n"); 562 softlockup_unpark_threads();
595 else 563 watchdog_nmi_start();
596 watchdog_running = 1; 564 cpus_read_unlock();
597 } else { 565 /*
598 /* 566 * Must be called outside the cpus locked section to prevent
599 * Enable/disable the lockup detectors or 567 * recursive locking in the perf code.
600 * change the sample period 'on the fly'. 568 */
601 */ 569 __lockup_detector_cleanup();
602 err = update_watchdog_all_cpus();
603
604 if (err) {
605 watchdog_disable_all_cpus();
606 pr_err("Failed to update lockup detectors, disabled\n");
607 }
608 }
609
610 if (err)
611 watchdog_enabled = 0;
612
613 return err;
614} 570}
615 571
616static void watchdog_disable_all_cpus(void) 572/*
573 * Create the watchdog thread infrastructure and configure the detector(s).
574 *
575 * The threads are not unparked as watchdog_allowed_mask is empty. When
576 * the threads are sucessfully initialized, take the proper locks and
577 * unpark the threads in the watchdog_cpumask if the watchdog is enabled.
578 */
579static __init void lockup_detector_setup(void)
617{ 580{
618 if (watchdog_running) { 581 int ret;
619 watchdog_running = 0;
620 smpboot_unregister_percpu_thread(&watchdog_threads);
621 }
622}
623 582
624#ifdef CONFIG_SYSCTL 583 /*
625static int watchdog_update_cpus(void) 584 * If sysctl is off and watchdog got disabled on the command line,
626{ 585 * nothing to do here.
627 return smpboot_update_cpumask_percpu_thread( 586 */
628 &watchdog_threads, &watchdog_cpumask); 587 lockup_detector_update_enable();
629}
630#endif
631 588
632#else /* SOFTLOCKUP */ 589 if (!IS_ENABLED(CONFIG_SYSCTL) &&
633static int watchdog_park_threads(void) 590 !(watchdog_enabled && watchdog_thresh))
634{ 591 return;
635 return 0;
636}
637 592
638static void watchdog_unpark_threads(void) 593 ret = smpboot_register_percpu_thread_cpumask(&watchdog_threads,
639{ 594 &watchdog_allowed_mask);
640} 595 if (ret) {
596 pr_err("Failed to initialize soft lockup detector threads\n");
597 return;
598 }
641 599
642static int watchdog_enable_all_cpus(void) 600 mutex_lock(&watchdog_mutex);
643{ 601 softlockup_threads_initialized = true;
644 return 0; 602 lockup_detector_reconfigure();
603 mutex_unlock(&watchdog_mutex);
645} 604}
646 605
647static void watchdog_disable_all_cpus(void) 606#else /* CONFIG_SOFTLOCKUP_DETECTOR */
607static inline int watchdog_park_threads(void) { return 0; }
608static inline void watchdog_unpark_threads(void) { }
609static inline int watchdog_enable_all_cpus(void) { return 0; }
610static inline void watchdog_disable_all_cpus(void) { }
611static void lockup_detector_reconfigure(void)
648{ 612{
613 cpus_read_lock();
614 watchdog_nmi_stop();
615 lockup_detector_update_enable();
616 watchdog_nmi_start();
617 cpus_read_unlock();
649} 618}
650 619static inline void lockup_detector_setup(void)
651#ifdef CONFIG_SYSCTL
652static int watchdog_update_cpus(void)
653{ 620{
654 return 0; 621 lockup_detector_reconfigure();
655} 622}
656#endif 623#endif /* !CONFIG_SOFTLOCKUP_DETECTOR */
657 624
658static void set_sample_period(void) 625static void __lockup_detector_cleanup(void)
659{ 626{
627 lockdep_assert_held(&watchdog_mutex);
628 hardlockup_detector_perf_cleanup();
660} 629}
661#endif /* SOFTLOCKUP */
662 630
663/* 631/**
664 * Suspend the hard and soft lockup detector by parking the watchdog threads. 632 * lockup_detector_cleanup - Cleanup after cpu hotplug or sysctl changes
633 *
634 * Caller must not hold the cpu hotplug rwsem.
665 */ 635 */
666int lockup_detector_suspend(void) 636void lockup_detector_cleanup(void)
667{ 637{
668 int ret = 0; 638 mutex_lock(&watchdog_mutex);
669 639 __lockup_detector_cleanup();
670 get_online_cpus(); 640 mutex_unlock(&watchdog_mutex);
671 mutex_lock(&watchdog_proc_mutex);
672 /*
673 * Multiple suspend requests can be active in parallel (counted by
674 * the 'watchdog_suspended' variable). If the watchdog threads are
675 * running, the first caller takes care that they will be parked.
676 * The state of 'watchdog_running' cannot change while a suspend
677 * request is active (see related code in 'proc' handlers).
678 */
679 if (watchdog_running && !watchdog_suspended)
680 ret = watchdog_park_threads();
681
682 if (ret == 0)
683 watchdog_suspended++;
684 else {
685 watchdog_disable_all_cpus();
686 pr_err("Failed to suspend lockup detectors, disabled\n");
687 watchdog_enabled = 0;
688 }
689
690 watchdog_nmi_reconfigure();
691
692 mutex_unlock(&watchdog_proc_mutex);
693
694 return ret;
695} 641}
696 642
697/* 643/**
698 * Resume the hard and soft lockup detector by unparking the watchdog threads. 644 * lockup_detector_soft_poweroff - Interface to stop lockup detector(s)
645 *
646 * Special interface for parisc. It prevents lockup detector warnings from
647 * the default pm_poweroff() function which busy loops forever.
699 */ 648 */
700void lockup_detector_resume(void) 649void lockup_detector_soft_poweroff(void)
701{ 650{
702 mutex_lock(&watchdog_proc_mutex); 651 watchdog_enabled = 0;
703
704 watchdog_suspended--;
705 /*
706 * The watchdog threads are unparked if they were previously running
707 * and if there is no more active suspend request.
708 */
709 if (watchdog_running && !watchdog_suspended)
710 watchdog_unpark_threads();
711
712 watchdog_nmi_reconfigure();
713
714 mutex_unlock(&watchdog_proc_mutex);
715 put_online_cpus();
716} 652}
717 653
718#ifdef CONFIG_SYSCTL 654#ifdef CONFIG_SYSCTL
719 655
720/* 656/* Propagate any changes to the watchdog threads */
721 * Update the run state of the lockup detectors. 657static void proc_watchdog_update(void)
722 */
723static int proc_watchdog_update(void)
724{ 658{
725 int err = 0; 659 /* Remove impossible cpus to keep sysctl output clean. */
726 660 cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask);
727 /* 661 lockup_detector_reconfigure();
728 * Watchdog threads won't be started if they are already active.
729 * The 'watchdog_running' variable in watchdog_*_all_cpus() takes
730 * care of this. If those threads are already active, the sample
731 * period will be updated and the lockup detectors will be enabled
732 * or disabled 'on the fly'.
733 */
734 if (watchdog_enabled && watchdog_thresh)
735 err = watchdog_enable_all_cpus();
736 else
737 watchdog_disable_all_cpus();
738
739 watchdog_nmi_reconfigure();
740
741 return err;
742
743} 662}
744 663
745/* 664/*
746 * common function for watchdog, nmi_watchdog and soft_watchdog parameter 665 * common function for watchdog, nmi_watchdog and soft_watchdog parameter
747 * 666 *
748 * caller | table->data points to | 'which' contains the flag(s) 667 * caller | table->data points to | 'which'
749 * -------------------|-----------------------|----------------------------- 668 * -------------------|----------------------------|--------------------------
750 * proc_watchdog | watchdog_user_enabled | NMI_WATCHDOG_ENABLED or'ed 669 * proc_watchdog | watchdog_user_enabled | NMI_WATCHDOG_ENABLED |
751 * | | with SOFT_WATCHDOG_ENABLED 670 * | | SOFT_WATCHDOG_ENABLED
752 * -------------------|-----------------------|----------------------------- 671 * -------------------|----------------------------|--------------------------
753 * proc_nmi_watchdog | nmi_watchdog_enabled | NMI_WATCHDOG_ENABLED 672 * proc_nmi_watchdog | nmi_watchdog_user_enabled | NMI_WATCHDOG_ENABLED
754 * -------------------|-----------------------|----------------------------- 673 * -------------------|----------------------------|--------------------------
755 * proc_soft_watchdog | soft_watchdog_enabled | SOFT_WATCHDOG_ENABLED 674 * proc_soft_watchdog | soft_watchdog_user_enabled | SOFT_WATCHDOG_ENABLED
756 */ 675 */
757static int proc_watchdog_common(int which, struct ctl_table *table, int write, 676static int proc_watchdog_common(int which, struct ctl_table *table, int write,
758 void __user *buffer, size_t *lenp, loff_t *ppos) 677 void __user *buffer, size_t *lenp, loff_t *ppos)
759{ 678{
760 int err, old, new; 679 int err, old, *param = table->data;
761 int *watchdog_param = (int *)table->data;
762 680
763 get_online_cpus(); 681 mutex_lock(&watchdog_mutex);
764 mutex_lock(&watchdog_proc_mutex);
765 682
766 if (watchdog_suspended) {
767 /* no parameter changes allowed while watchdog is suspended */
768 err = -EAGAIN;
769 goto out;
770 }
771
772 /*
773 * If the parameter is being read return the state of the corresponding
774 * bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the
775 * run state of the lockup detectors.
776 */
777 if (!write) { 683 if (!write) {
778 *watchdog_param = (watchdog_enabled & which) != 0; 684 /*
685 * On read synchronize the userspace interface. This is a
686 * racy snapshot.
687 */
688 *param = (watchdog_enabled & which) != 0;
779 err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 689 err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
780 } else { 690 } else {
691 old = READ_ONCE(*param);
781 err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 692 err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
782 if (err) 693 if (!err && old != READ_ONCE(*param))
783 goto out; 694 proc_watchdog_update();
784
785 /*
786 * There is a race window between fetching the current value
787 * from 'watchdog_enabled' and storing the new value. During
788 * this race window, watchdog_nmi_enable() can sneak in and
789 * clear the NMI_WATCHDOG_ENABLED bit in 'watchdog_enabled'.
790 * The 'cmpxchg' detects this race and the loop retries.
791 */
792 do {
793 old = watchdog_enabled;
794 /*
795 * If the parameter value is not zero set the
796 * corresponding bit(s), else clear it(them).
797 */
798 if (*watchdog_param)
799 new = old | which;
800 else
801 new = old & ~which;
802 } while (cmpxchg(&watchdog_enabled, old, new) != old);
803
804 /*
805 * Update the run state of the lockup detectors. There is _no_
806 * need to check the value returned by proc_watchdog_update()
807 * and to restore the previous value of 'watchdog_enabled' as
808 * both lockup detectors are disabled if proc_watchdog_update()
809 * returns an error.
810 */
811 if (old == new)
812 goto out;
813
814 err = proc_watchdog_update();
815 } 695 }
816out: 696 mutex_unlock(&watchdog_mutex);
817 mutex_unlock(&watchdog_proc_mutex);
818 put_online_cpus();
819 return err; 697 return err;
820} 698}
821 699
@@ -835,6 +713,8 @@ int proc_watchdog(struct ctl_table *table, int write,
835int proc_nmi_watchdog(struct ctl_table *table, int write, 713int proc_nmi_watchdog(struct ctl_table *table, int write,
836 void __user *buffer, size_t *lenp, loff_t *ppos) 714 void __user *buffer, size_t *lenp, loff_t *ppos)
837{ 715{
716 if (!nmi_watchdog_available && write)
717 return -ENOTSUPP;
838 return proc_watchdog_common(NMI_WATCHDOG_ENABLED, 718 return proc_watchdog_common(NMI_WATCHDOG_ENABLED,
839 table, write, buffer, lenp, ppos); 719 table, write, buffer, lenp, ppos);
840} 720}
@@ -855,39 +735,17 @@ int proc_soft_watchdog(struct ctl_table *table, int write,
855int proc_watchdog_thresh(struct ctl_table *table, int write, 735int proc_watchdog_thresh(struct ctl_table *table, int write,
856 void __user *buffer, size_t *lenp, loff_t *ppos) 736 void __user *buffer, size_t *lenp, loff_t *ppos)
857{ 737{
858 int err, old, new; 738 int err, old;
859
860 get_online_cpus();
861 mutex_lock(&watchdog_proc_mutex);
862 739
863 if (watchdog_suspended) { 740 mutex_lock(&watchdog_mutex);
864 /* no parameter changes allowed while watchdog is suspended */
865 err = -EAGAIN;
866 goto out;
867 }
868 741
869 old = ACCESS_ONCE(watchdog_thresh); 742 old = READ_ONCE(watchdog_thresh);
870 err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 743 err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
871 744
872 if (err || !write) 745 if (!err && write && old != READ_ONCE(watchdog_thresh))
873 goto out; 746 proc_watchdog_update();
874
875 /*
876 * Update the sample period. Restore on failure.
877 */
878 new = ACCESS_ONCE(watchdog_thresh);
879 if (old == new)
880 goto out;
881 747
882 set_sample_period(); 748 mutex_unlock(&watchdog_mutex);
883 err = proc_watchdog_update();
884 if (err) {
885 watchdog_thresh = old;
886 set_sample_period();
887 }
888out:
889 mutex_unlock(&watchdog_proc_mutex);
890 put_online_cpus();
891 return err; 749 return err;
892} 750}
893 751
@@ -902,45 +760,19 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
902{ 760{
903 int err; 761 int err;
904 762
905 get_online_cpus(); 763 mutex_lock(&watchdog_mutex);
906 mutex_lock(&watchdog_proc_mutex);
907
908 if (watchdog_suspended) {
909 /* no parameter changes allowed while watchdog is suspended */
910 err = -EAGAIN;
911 goto out;
912 }
913 764
914 err = proc_do_large_bitmap(table, write, buffer, lenp, ppos); 765 err = proc_do_large_bitmap(table, write, buffer, lenp, ppos);
915 if (!err && write) { 766 if (!err && write)
916 /* Remove impossible cpus to keep sysctl output cleaner. */ 767 proc_watchdog_update();
917 cpumask_and(&watchdog_cpumask, &watchdog_cpumask,
918 cpu_possible_mask);
919
920 if (watchdog_running) {
921 /*
922 * Failure would be due to being unable to allocate
923 * a temporary cpumask, so we are likely not in a
924 * position to do much else to make things better.
925 */
926 if (watchdog_update_cpus() != 0)
927 pr_err("cpumask update failed\n");
928 }
929 768
930 watchdog_nmi_reconfigure(); 769 mutex_unlock(&watchdog_mutex);
931 }
932out:
933 mutex_unlock(&watchdog_proc_mutex);
934 put_online_cpus();
935 return err; 770 return err;
936} 771}
937
938#endif /* CONFIG_SYSCTL */ 772#endif /* CONFIG_SYSCTL */
939 773
940void __init lockup_detector_init(void) 774void __init lockup_detector_init(void)
941{ 775{
942 set_sample_period();
943
944#ifdef CONFIG_NO_HZ_FULL 776#ifdef CONFIG_NO_HZ_FULL
945 if (tick_nohz_full_enabled()) { 777 if (tick_nohz_full_enabled()) {
946 pr_info("Disabling watchdog on nohz_full cores by default\n"); 778 pr_info("Disabling watchdog on nohz_full cores by default\n");
@@ -951,6 +783,7 @@ void __init lockup_detector_init(void)
951 cpumask_copy(&watchdog_cpumask, cpu_possible_mask); 783 cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
952#endif 784#endif
953 785
954 if (watchdog_enabled) 786 if (!watchdog_nmi_probe())
955 watchdog_enable_all_cpus(); 787 nmi_watchdog_available = true;
788 lockup_detector_setup();
956} 789}
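The sysctl handlers reworked above back the standard files under /proc/sys/kernel (watchdog, nmi_watchdog, soft_watchdog, watchdog_thresh, watchdog_cpumask). Below is a minimal userspace sketch of how they might be exercised, assuming root and the usual proc paths; with the rewritten handlers a write only triggers proc_watchdog_update() when the stored value actually changes, so rewriting the same threshold is a no-op:

/*
 * Sketch only: read the current watchdog sysctls and write the threshold
 * back unchanged (which the new proc_watchdog_thresh() treats as a no-op).
 */
#include <stdio.h>
#include <stdlib.h>

static int read_int(const char *path, int *val)
{
	FILE *f = fopen(path, "r");

	if (!f)
		return -1;
	if (fscanf(f, "%d", val) != 1)
		*val = -1;
	fclose(f);
	return 0;
}

static int write_int(const char *path, int val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fprintf(f, "%d\n", val);
	return fclose(f);
}

int main(void)
{
	int thresh, nmi;

	if (read_int("/proc/sys/kernel/watchdog_thresh", &thresh) ||
	    read_int("/proc/sys/kernel/nmi_watchdog", &nmi)) {
		perror("read watchdog sysctls");
		return EXIT_FAILURE;
	}
	printf("watchdog_thresh=%d nmi_watchdog=%d\n", thresh, nmi);

	/* Writing the same value does not trigger a reconfigure. */
	if (write_int("/proc/sys/kernel/watchdog_thresh", thresh))
		perror("write watchdog_thresh");
	return 0;
}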
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 3a09ea1b1d3d..71a62ceacdc8 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -21,8 +21,10 @@
21static DEFINE_PER_CPU(bool, hard_watchdog_warn); 21static DEFINE_PER_CPU(bool, hard_watchdog_warn);
22static DEFINE_PER_CPU(bool, watchdog_nmi_touch); 22static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
23static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); 23static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
24static struct cpumask dead_events_mask;
24 25
25static unsigned long hardlockup_allcpu_dumped; 26static unsigned long hardlockup_allcpu_dumped;
27static unsigned int watchdog_cpus;
26 28
27void arch_touch_nmi_watchdog(void) 29void arch_touch_nmi_watchdog(void)
28{ 30{
@@ -103,15 +105,12 @@ static struct perf_event_attr wd_hw_attr = {
103 105
104/* Callback function for perf event subsystem */ 106/* Callback function for perf event subsystem */
105static void watchdog_overflow_callback(struct perf_event *event, 107static void watchdog_overflow_callback(struct perf_event *event,
106 struct perf_sample_data *data, 108 struct perf_sample_data *data,
107 struct pt_regs *regs) 109 struct pt_regs *regs)
108{ 110{
109 /* Ensure the watchdog never gets throttled */ 111 /* Ensure the watchdog never gets throttled */
110 event->hw.interrupts = 0; 112 event->hw.interrupts = 0;
111 113
112 if (atomic_read(&watchdog_park_in_progress) != 0)
113 return;
114
115 if (__this_cpu_read(watchdog_nmi_touch) == true) { 114 if (__this_cpu_read(watchdog_nmi_touch) == true) {
116 __this_cpu_write(watchdog_nmi_touch, false); 115 __this_cpu_write(watchdog_nmi_touch, false);
117 return; 116 return;
@@ -160,104 +159,131 @@ static void watchdog_overflow_callback(struct perf_event *event,
160 return; 159 return;
161} 160}
162 161
163/* 162static int hardlockup_detector_event_create(void)
164 * People like the simple clean cpu node info on boot.
165 * Reduce the watchdog noise by only printing messages
166 * that are different from what cpu0 displayed.
167 */
168static unsigned long firstcpu_err;
169static atomic_t watchdog_cpus;
170
171int watchdog_nmi_enable(unsigned int cpu)
172{ 163{
164 unsigned int cpu = smp_processor_id();
173 struct perf_event_attr *wd_attr; 165 struct perf_event_attr *wd_attr;
174 struct perf_event *event = per_cpu(watchdog_ev, cpu); 166 struct perf_event *evt;
175 int firstcpu = 0;
176
177 /* nothing to do if the hard lockup detector is disabled */
178 if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
179 goto out;
180
181 /* is it already setup and enabled? */
182 if (event && event->state > PERF_EVENT_STATE_OFF)
183 goto out;
184
185 /* it is setup but not enabled */
186 if (event != NULL)
187 goto out_enable;
188
189 if (atomic_inc_return(&watchdog_cpus) == 1)
190 firstcpu = 1;
191 167
192 wd_attr = &wd_hw_attr; 168 wd_attr = &wd_hw_attr;
193 wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); 169 wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
194 170
195 /* Try to register using hardware perf events */ 171 /* Try to register using hardware perf events */
196 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); 172 evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL,
173 watchdog_overflow_callback, NULL);
174 if (IS_ERR(evt)) {
175 pr_info("Perf event create on CPU %d failed with %ld\n", cpu,
176 PTR_ERR(evt));
177 return PTR_ERR(evt);
178 }
179 this_cpu_write(watchdog_ev, evt);
180 return 0;
181}
197 182
198 /* save the first cpu's error for future comparision */ 183/**
199 if (firstcpu && IS_ERR(event)) 184 * hardlockup_detector_perf_enable - Enable the local event
200 firstcpu_err = PTR_ERR(event); 185 */
186void hardlockup_detector_perf_enable(void)
187{
188 if (hardlockup_detector_event_create())
189 return;
201 190
202 if (!IS_ERR(event)) { 191 if (!watchdog_cpus++)
203 /* only print for the first cpu initialized */ 192 pr_info("Enabled. Permanently consumes one hw-PMU counter.\n");
204 if (firstcpu || firstcpu_err)
205 pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n");
206 goto out_save;
207 }
208 193
209 /* 194 perf_event_enable(this_cpu_read(watchdog_ev));
210 * Disable the hard lockup detector if _any_ CPU fails to set up
211 * set up the hardware perf event. The watchdog() function checks
212 * the NMI_WATCHDOG_ENABLED bit periodically.
213 *
214 * The barriers are for syncing up watchdog_enabled across all the
215 * cpus, as clear_bit() does not use barriers.
216 */
217 smp_mb__before_atomic();
218 clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled);
219 smp_mb__after_atomic();
220
221 /* skip displaying the same error again */
222 if (!firstcpu && (PTR_ERR(event) == firstcpu_err))
223 return PTR_ERR(event);
224
225 /* vary the KERN level based on the returned errno */
226 if (PTR_ERR(event) == -EOPNOTSUPP)
227 pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
228 else if (PTR_ERR(event) == -ENOENT)
229 pr_warn("disabled (cpu%i): hardware events not enabled\n",
230 cpu);
231 else
232 pr_err("disabled (cpu%i): unable to create perf event: %ld\n",
233 cpu, PTR_ERR(event));
234
235 pr_info("Shutting down hard lockup detector on all cpus\n");
236
237 return PTR_ERR(event);
238
239 /* success path */
240out_save:
241 per_cpu(watchdog_ev, cpu) = event;
242out_enable:
243 perf_event_enable(per_cpu(watchdog_ev, cpu));
244out:
245 return 0;
246} 195}
247 196
248void watchdog_nmi_disable(unsigned int cpu) 197/**
198 * hardlockup_detector_perf_disable - Disable the local event
199 */
200void hardlockup_detector_perf_disable(void)
249{ 201{
250 struct perf_event *event = per_cpu(watchdog_ev, cpu); 202 struct perf_event *event = this_cpu_read(watchdog_ev);
251 203
252 if (event) { 204 if (event) {
253 perf_event_disable(event); 205 perf_event_disable(event);
206 cpumask_set_cpu(smp_processor_id(), &dead_events_mask);
207 watchdog_cpus--;
208 }
209}
210
211/**
212 * hardlockup_detector_perf_cleanup - Cleanup disabled events and destroy them
213 *
214 * Called from lockup_detector_cleanup(). Serialized by the caller.
215 */
216void hardlockup_detector_perf_cleanup(void)
217{
218 int cpu;
219
220 for_each_cpu(cpu, &dead_events_mask) {
221 struct perf_event *event = per_cpu(watchdog_ev, cpu);
222
223 /*
224 * Required because for_each_cpu() unconditionally reports
225 * CPU0 as set on UP kernels. Sigh.
226 */
227 if (event)
228 perf_event_release_kernel(event);
254 per_cpu(watchdog_ev, cpu) = NULL; 229 per_cpu(watchdog_ev, cpu) = NULL;
230 }
231 cpumask_clear(&dead_events_mask);
232}
233
234/**
235 * hardlockup_detector_perf_stop - Globally stop watchdog events
236 *
237 * Special interface for x86 to handle the perf HT bug.
238 */
239void __init hardlockup_detector_perf_stop(void)
240{
241 int cpu;
242
243 lockdep_assert_cpus_held();
244
245 for_each_online_cpu(cpu) {
246 struct perf_event *event = per_cpu(watchdog_ev, cpu);
247
248 if (event)
249 perf_event_disable(event);
250 }
251}
255 252
256 /* should be in cleanup, but blocks oprofile */ 253/**
257 perf_event_release_kernel(event); 254 * hardlockup_detector_perf_restart - Globally restart watchdog events
255 *
256 * Special interface for x86 to handle the perf HT bug.
257 */
258void __init hardlockup_detector_perf_restart(void)
259{
260 int cpu;
261
262 lockdep_assert_cpus_held();
263
264 if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
265 return;
266
267 for_each_online_cpu(cpu) {
268 struct perf_event *event = per_cpu(watchdog_ev, cpu);
269
270 if (event)
271 perf_event_enable(event);
272 }
273}
274
275/**
276 * hardlockup_detector_perf_init - Probe whether NMI event is available at all
277 */
278int __init hardlockup_detector_perf_init(void)
279{
280 int ret = hardlockup_detector_event_create();
258 281
259 /* watchdog_nmi_enable() expects this to be zero initially. */ 282 if (ret) {
260 if (atomic_dec_and_test(&watchdog_cpus)) 283 pr_info("Perf NMI watchdog permanently disabled\n");
261 firstcpu_err = 0; 284 } else {
285 perf_event_release_kernel(this_cpu_read(watchdog_ev));
286 this_cpu_write(watchdog_ev, NULL);
262 } 287 }
288 return ret;
263} 289}
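hardlockup_detector_event_create() above uses perf_event_create_kernel_counter() to bind a hardware cycles event to the local CPU. For readers less familiar with the perf side, the following userspace sketch creates a comparable CPU-bound cycles counter through perf_event_open(2); the sample period and CPU number are arbitrary illustrations (the kernel derives its period from watchdog_thresh via hw_nmi_get_sample_period()), and running it requires sufficient perf privileges:

/*
 * Userspace sketch: a CPU-cycles counter bound to CPU 0, loosely analogous
 * to the per-CPU event set up by hardlockup_detector_event_create().
 */
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>

static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
			    int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.sample_period = 100000000ULL;	/* illustrative period */
	attr.disabled = 1;

	/* pid == -1, cpu == 0: count everything running on CPU 0 */
	fd = perf_event_open(&attr, -1, 0, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}

	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);	/* mirrors perf_event_enable() */
	sleep(1);
	ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);	/* mirrors perf_event_disable() */
	close(fd);
	return 0;
}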