Diffstat (limited to 'kernel')
37 files changed, 1324 insertions, 825 deletions
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 917cc04a0a94..7b62df86be1d 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1022,7 +1022,7 @@ select_insn:
 		struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2;
 		struct bpf_array *array = container_of(map, struct bpf_array, map);
 		struct bpf_prog *prog;
-		u64 index = BPF_R3;
+		u32 index = BPF_R3;
 
 		if (unlikely(index >= array->map.max_entries))
 			goto out;
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 959c9a07f318..e093d9a2c4dd 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -75,8 +75,8 @@ static u64 dev_map_bitmap_size(const union bpf_attr *attr)
 static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 {
 	struct bpf_dtab *dtab;
+	int err = -EINVAL;
 	u64 cost;
-	int err;
 
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
@@ -108,6 +108,8 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 	if (err)
 		goto free_dtab;
 
+	err = -ENOMEM;
+
 	/* A per cpu bitfield with a bit per possible net device */
 	dtab->flush_needed = __alloc_percpu(dev_map_bitmap_size(attr),
 					    __alignof__(unsigned long));
@@ -128,7 +130,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 free_dtab:
 	free_percpu(dtab->flush_needed);
 	kfree(dtab);
-	return ERR_PTR(-ENOMEM);
+	return ERR_PTR(err);
 }
 
 static void dev_map_free(struct bpf_map *map)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index cb17e1cd1d43..25d074920a00 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -186,15 +186,17 @@ static int bpf_map_alloc_id(struct bpf_map *map)
 
 static void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
 {
+	unsigned long flags;
+
 	if (do_idr_lock)
-		spin_lock_bh(&map_idr_lock);
+		spin_lock_irqsave(&map_idr_lock, flags);
 	else
 		__acquire(&map_idr_lock);
 
 	idr_remove(&map_idr, map->id);
 
 	if (do_idr_lock)
-		spin_unlock_bh(&map_idr_lock);
+		spin_unlock_irqrestore(&map_idr_lock, flags);
 	else
 		__release(&map_idr_lock);
 }
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 799b2451ef2d..b914fbe1383e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4205,7 +4205,12 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 		}
 
 		if (insn->imm == BPF_FUNC_redirect_map) {
-			u64 addr = (unsigned long)prog;
+			/* Note, we cannot use prog directly as imm as subsequent
+			 * rewrites would still change the prog pointer. The only
+			 * stable address we can use is aux, which also works with
+			 * prog clones during blinding.
+			 */
+			u64 addr = (unsigned long)prog->aux;
 			struct bpf_insn r4_ld[] = {
 				BPF_LD_IMM64(BPF_REG_4, addr),
 				*insn,
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index d6551cd45238..44857278eb8a 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -2311,6 +2311,14 @@ out_release_tset:
 		list_del_init(&cset->mg_node);
 	}
 	spin_unlock_irq(&css_set_lock);
+
+	/*
+	 * Re-initialize the cgroup_taskset structure in case it is reused
+	 * again in another cgroup_migrate_add_task()/cgroup_migrate_execute()
+	 * iteration.
+	 */
+	tset->nr_tasks = 0;
+	tset->csets = &tset->src_csets;
 	return ret;
 }
 
diff --git a/kernel/cpu.c b/kernel/cpu.c
index acf5308fad51..d851df22f5c5 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -24,6 +24,7 @@
 #include <linux/lockdep.h>
 #include <linux/tick.h>
 #include <linux/irq.h>
+#include <linux/nmi.h>
 #include <linux/smpboot.h>
 #include <linux/relay.h>
 #include <linux/slab.h>
@@ -46,11 +47,13 @@
  * @bringup: Single callback bringup or teardown selector
  * @cb_state: The state for a single callback (install/uninstall)
  * @result: Result of the operation
- * @done: Signal completion to the issuer of the task
+ * @done_up: Signal completion to the issuer of the task for cpu-up
+ * @done_down: Signal completion to the issuer of the task for cpu-down
  */
 struct cpuhp_cpu_state {
 	enum cpuhp_state state;
 	enum cpuhp_state target;
+	enum cpuhp_state fail;
 #ifdef CONFIG_SMP
 	struct task_struct *thread;
 	bool should_run;
@@ -58,18 +61,39 @@ struct cpuhp_cpu_state {
 	bool single;
 	bool bringup;
 	struct hlist_node *node;
+	struct hlist_node *last;
 	enum cpuhp_state cb_state;
 	int result;
-	struct completion done;
+	struct completion done_up;
+	struct completion done_down;
 #endif
 };
 
-static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state);
+static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state) = {
+	.fail = CPUHP_INVALID,
+};
 
 #if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP)
-static struct lock_class_key cpuhp_state_key;
-static struct lockdep_map cpuhp_state_lock_map =
-	STATIC_LOCKDEP_MAP_INIT("cpuhp_state", &cpuhp_state_key);
+static struct lockdep_map cpuhp_state_up_map =
+	STATIC_LOCKDEP_MAP_INIT("cpuhp_state-up", &cpuhp_state_up_map);
+static struct lockdep_map cpuhp_state_down_map =
+	STATIC_LOCKDEP_MAP_INIT("cpuhp_state-down", &cpuhp_state_down_map);
+
+
+static void inline cpuhp_lock_acquire(bool bringup)
+{
+	lock_map_acquire(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map);
+}
+
+static void inline cpuhp_lock_release(bool bringup)
+{
+	lock_map_release(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map);
+}
+#else
+
+static void inline cpuhp_lock_acquire(bool bringup) { }
+static void inline cpuhp_lock_release(bool bringup) { }
+
 #endif
 
 /**
@@ -123,13 +147,16 @@ static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state)
 /**
  * cpuhp_invoke_callback _ Invoke the callbacks for a given state
  * @cpu: The cpu for which the callback should be invoked
- * @step: The step in the state machine
+ * @state: The state to do callbacks for
  * @bringup: True if the bringup callback should be invoked
+ * @node: For multi-instance, do a single entry callback for install/remove
+ * @lastp: For multi-instance rollback, remember how far we got
  *
  * Called from cpu hotplug and from the state register machinery.
  */
 static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
-				 bool bringup, struct hlist_node *node)
+				 bool bringup, struct hlist_node *node,
+				 struct hlist_node **lastp)
 {
 	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
 	struct cpuhp_step *step = cpuhp_get_step(state);
@@ -137,7 +164,17 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
 	int (*cb)(unsigned int cpu);
 	int ret, cnt;
 
+	if (st->fail == state) {
+		st->fail = CPUHP_INVALID;
+
+		if (!(bringup ? step->startup.single : step->teardown.single))
+			return 0;
+
+		return -EAGAIN;
+	}
+
 	if (!step->multi_instance) {
+		WARN_ON_ONCE(lastp && *lastp);
 		cb = bringup ? step->startup.single : step->teardown.single;
 		if (!cb)
 			return 0;
@@ -152,6 +189,7 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
 
 	/* Single invocation for instance add/remove */
 	if (node) {
+		WARN_ON_ONCE(lastp && *lastp);
 		trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
 		ret = cbm(cpu, node);
 		trace_cpuhp_exit(cpu, st->state, state, ret);
@@ -161,13 +199,23 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
 	/* State transition. Invoke on all instances */
 	cnt = 0;
 	hlist_for_each(node, &step->list) {
+		if (lastp && node == *lastp)
+			break;
+
 		trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
 		ret = cbm(cpu, node);
 		trace_cpuhp_exit(cpu, st->state, state, ret);
-		if (ret)
-			goto err;
+		if (ret) {
+			if (!lastp)
+				goto err;
+
+			*lastp = node;
+			return ret;
+		}
 		cnt++;
 	}
+	if (lastp)
+		*lastp = NULL;
 	return 0;
 err:
 	/* Rollback the instances if one failed */
@@ -178,12 +226,39 @@ err:
 	hlist_for_each(node, &step->list) {
 		if (!cnt--)
 			break;
-		cbm(cpu, node);
+
+		trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
+		ret = cbm(cpu, node);
+		trace_cpuhp_exit(cpu, st->state, state, ret);
+		/*
+		 * Rollback must not fail,
+		 */
+		WARN_ON_ONCE(ret);
 	}
 	return ret;
 }
 
 #ifdef CONFIG_SMP
+static inline void wait_for_ap_thread(struct cpuhp_cpu_state *st, bool bringup)
+{
+	struct completion *done = bringup ? &st->done_up : &st->done_down;
+	wait_for_completion(done);
+}
+
+static inline void complete_ap_thread(struct cpuhp_cpu_state *st, bool bringup)
+{
+	struct completion *done = bringup ? &st->done_up : &st->done_down;
+	complete(done);
+}
+
+/*
+ * The former STARTING/DYING states, ran with IRQs disabled and must not fail.
+ */
+static bool cpuhp_is_atomic_state(enum cpuhp_state state)
+{
+	return CPUHP_AP_IDLE_DEAD <= state && state < CPUHP_AP_ONLINE;
+}
+
 /* Serializes the updates to cpu_online_mask, cpu_present_mask */
 static DEFINE_MUTEX(cpu_add_remove_lock);
 bool cpuhp_tasks_frozen;
@@ -271,14 +346,79 @@ void cpu_hotplug_enable(void)
 EXPORT_SYMBOL_GPL(cpu_hotplug_enable);
 #endif /* CONFIG_HOTPLUG_CPU */
 
-static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st);
+static inline enum cpuhp_state
+cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target)
+{
+	enum cpuhp_state prev_state = st->state;
+
+	st->rollback = false;
+	st->last = NULL;
+
+	st->target = target;
+	st->single = false;
+	st->bringup = st->state < target;
+
+	return prev_state;
+}
+
+static inline void
+cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state)
+{
+	st->rollback = true;
+
+	/*
+	 * If we have st->last we need to undo partial multi_instance of this
+	 * state first. Otherwise start undo at the previous state.
+	 */
+	if (!st->last) {
+		if (st->bringup)
+			st->state--;
+		else
+			st->state++;
+	}
+
+	st->target = prev_state;
+	st->bringup = !st->bringup;
+}
+
+/* Regular hotplug invocation of the AP hotplug thread */
+static void __cpuhp_kick_ap(struct cpuhp_cpu_state *st)
+{
+	if (!st->single && st->state == st->target)
+		return;
+
+	st->result = 0;
+	/*
+	 * Make sure the above stores are visible before should_run becomes
+	 * true. Paired with the mb() above in cpuhp_thread_fun()
+	 */
+	smp_mb();
+	st->should_run = true;
+	wake_up_process(st->thread);
+	wait_for_ap_thread(st, st->bringup);
+}
+
+static int cpuhp_kick_ap(struct cpuhp_cpu_state *st, enum cpuhp_state target)
+{
+	enum cpuhp_state prev_state;
+	int ret;
+
+	prev_state = cpuhp_set_state(st, target);
+	__cpuhp_kick_ap(st);
+	if ((ret = st->result)) {
+		cpuhp_reset_state(st, prev_state);
+		__cpuhp_kick_ap(st);
+	}
+
+	return ret;
+}
 
 static int bringup_wait_for_ap(unsigned int cpu)
 {
 	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
 
 	/* Wait for the CPU to reach CPUHP_AP_ONLINE_IDLE */
-	wait_for_completion(&st->done);
+	wait_for_ap_thread(st, true);
 	if (WARN_ON_ONCE((!cpu_online(cpu))))
 		return -ECANCELED;
 
@@ -286,12 +426,10 @@ static int bringup_wait_for_ap(unsigned int cpu)
 	stop_machine_unpark(cpu);
 	kthread_unpark(st->thread);
 
-	/* Should we go further up ? */
-	if (st->target > CPUHP_AP_ONLINE_IDLE) {
-		__cpuhp_kick_ap_work(st);
-		wait_for_completion(&st->done);
-	}
-	return st->result;
+	if (st->target <= CPUHP_AP_ONLINE_IDLE)
+		return 0;
+
+	return cpuhp_kick_ap(st, st->target);
 }
 
 static int bringup_cpu(unsigned int cpu)
@@ -317,32 +455,6 @@ static int bringup_cpu(unsigned int cpu)
 /*
  * Hotplug state machine related functions
  */
-static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st)
-{
-	for (st->state++; st->state < st->target; st->state++) {
-		struct cpuhp_step *step = cpuhp_get_step(st->state);
-
-		if (!step->skip_onerr)
-			cpuhp_invoke_callback(cpu, st->state, true, NULL);
-	}
-}
-
-static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
-				enum cpuhp_state target)
-{
-	enum cpuhp_state prev_state = st->state;
-	int ret = 0;
-
-	for (; st->state > target; st->state--) {
-		ret = cpuhp_invoke_callback(cpu, st->state, false, NULL);
-		if (ret) {
-			st->target = prev_state;
-			undo_cpu_down(cpu, st);
-			break;
-		}
-	}
-	return ret;
-}
 
 static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st)
 {
@@ -350,7 +462,7 @@ static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st)
 		struct cpuhp_step *step = cpuhp_get_step(st->state);
 
 		if (!step->skip_onerr)
-			cpuhp_invoke_callback(cpu, st->state, false, NULL);
+			cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
 	}
 }
 
@@ -362,7 +474,7 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
 
 	while (st->state < target) {
 		st->state++;
-		ret = cpuhp_invoke_callback(cpu, st->state, true, NULL);
+		ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
 		if (ret) {
 			st->target = prev_state;
 			undo_cpu_up(cpu, st);
@@ -379,7 +491,8 @@ static void cpuhp_create(unsigned int cpu)
 {
 	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
 
-	init_completion(&st->done);
+	init_completion(&st->done_up);
+	init_completion(&st->done_down);
 }
 
 static int cpuhp_should_run(unsigned int cpu)
@@ -389,69 +502,90 @@ static int cpuhp_should_run(unsigned int cpu)
 	return st->should_run;
 }
 
-/* Execute the teardown callbacks. Used to be CPU_DOWN_PREPARE */
-static int cpuhp_ap_offline(unsigned int cpu, struct cpuhp_cpu_state *st)
-{
-	enum cpuhp_state target = max((int)st->target, CPUHP_TEARDOWN_CPU);
-
-	return cpuhp_down_callbacks(cpu, st, target);
-}
-
-/* Execute the online startup callbacks. Used to be CPU_ONLINE */
-static int cpuhp_ap_online(unsigned int cpu, struct cpuhp_cpu_state *st)
-{
-	return cpuhp_up_callbacks(cpu, st, st->target);
-}
-
 /*
  * Execute teardown/startup callbacks on the plugged cpu. Also used to invoke
  * callbacks when a state gets [un]installed at runtime.
+ *
+ * Each invocation of this function by the smpboot thread does a single AP
+ * state callback.
+ *
+ * It has 3 modes of operation:
+ *  - single: runs st->cb_state
+ *  - up:     runs ++st->state, while st->state < st->target
+ *  - down:   runs st->state--, while st->state > st->target
+ *
+ * When complete or on error, should_run is cleared and the completion is fired.
  */
 static void cpuhp_thread_fun(unsigned int cpu)
 {
 	struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
-	int ret = 0;
+	bool bringup = st->bringup;
+	enum cpuhp_state state;
 
 	/*
-	 * Paired with the mb() in cpuhp_kick_ap_work and
-	 * cpuhp_invoke_ap_callback, so the work set is consistent visible.
+	 * ACQUIRE for the cpuhp_should_run() load of ->should_run. Ensures
+	 * that if we see ->should_run we also see the rest of the state.
 	 */
 	smp_mb();
-	if (!st->should_run)
+
+	if (WARN_ON_ONCE(!st->should_run))
 		return;
 
-	st->should_run = false;
+	cpuhp_lock_acquire(bringup);
 
-	lock_map_acquire(&cpuhp_state_lock_map);
-	/* Single callback invocation for [un]install ? */
 	if (st->single) {
-		if (st->cb_state < CPUHP_AP_ONLINE) {
-			local_irq_disable();
-			ret = cpuhp_invoke_callback(cpu, st->cb_state,
-						    st->bringup, st->node);
-			local_irq_enable();
+		state = st->cb_state;
+		st->should_run = false;
+	} else {
+		if (bringup) {
+			st->state++;
+			state = st->state;
+			st->should_run = (st->state < st->target);
+			WARN_ON_ONCE(st->state > st->target);
 		} else {
-			ret = cpuhp_invoke_callback(cpu, st->cb_state,
-						    st->bringup, st->node);
+			state = st->state;
+			st->state--;
+			st->should_run = (st->state > st->target);
+			WARN_ON_ONCE(st->state < st->target);
 		}
-	} else if (st->rollback) {
-		BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE);
+	}
+
+	WARN_ON_ONCE(!cpuhp_is_ap_state(state));
+
+	if (st->rollback) {
+		struct cpuhp_step *step = cpuhp_get_step(state);
+		if (step->skip_onerr)
+			goto next;
+	}
+
+	if (cpuhp_is_atomic_state(state)) {
+		local_irq_disable();
+		st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last);
+		local_irq_enable();
 
-		undo_cpu_down(cpu, st);
-		st->rollback = false;
+		/*
+		 * STARTING/DYING must not fail!
+		 */
+		WARN_ON_ONCE(st->result);
 	} else {
-		/* Cannot happen .... */
-		BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE);
+		st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last);
+	}
 
-		/* Regular hotplug work */
-		if (st->state < st->target)
-			ret = cpuhp_ap_online(cpu, st);
-		else if (st->state > st->target)
-			ret = cpuhp_ap_offline(cpu, st);
+	if (st->result) {
+		/*
+		 * If we fail on a rollback, we're up a creek without no
+		 * paddle, no way forward, no way back. We loose, thanks for
+		 * playing.
+		 */
+		WARN_ON_ONCE(st->rollback);
+		st->should_run = false;
 	}
-	lock_map_release(&cpuhp_state_lock_map);
-	st->result = ret;
-	complete(&st->done);
+
+next:
+	cpuhp_lock_release(bringup);
+
+	if (!st->should_run)
+		complete_ap_thread(st, bringup);
 }
 
 /* Invoke a single callback on a remote cpu */
@@ -460,62 +594,64 @@ cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup,
 			 struct hlist_node *node)
 {
 	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+	int ret;
 
 	if (!cpu_online(cpu))
 		return 0;
 
-	lock_map_acquire(&cpuhp_state_lock_map);
-	lock_map_release(&cpuhp_state_lock_map);
+	cpuhp_lock_acquire(false);
+	cpuhp_lock_release(false);
+
+	cpuhp_lock_acquire(true);
+	cpuhp_lock_release(true);
 
 	/*
 	 * If we are up and running, use the hotplug thread. For early calls
 	 * we invoke the thread function directly.
 	 */
 	if (!st->thread)
-		return cpuhp_invoke_callback(cpu, state, bringup, node);
+		return cpuhp_invoke_callback(cpu, state, bringup, node, NULL);
+
+	st->rollback = false;
+	st->last = NULL;
 
+	st->node = node;
+	st->bringup = bringup;
 	st->cb_state = state;
 	st->single = true;
-	st->bringup = bringup;
-	st->node = node;
 
-	/*
-	 * Make sure the above stores are visible before should_run becomes
-	 * true. Paired with the mb() above in cpuhp_thread_fun()
-	 */
-	smp_mb();
-	st->should_run = true;
-	wake_up_process(st->thread);
-	wait_for_completion(&st->done);
-	return st->result;
-}
+	__cpuhp_kick_ap(st);
 
-/* Regular hotplug invocation of the AP hotplug thread */
-static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st)
-{
-	st->result = 0;
-	st->single = false;
 	/*
-	 * Make sure the above stores are visible before should_run becomes
-	 * true. Paired with the mb() above in cpuhp_thread_fun()
+	 * If we failed and did a partial, do a rollback.
 	 */
-	smp_mb();
-	st->should_run = true;
-	wake_up_process(st->thread);
+	if ((ret = st->result) && st->last) {
+		st->rollback = true;
+		st->bringup = !bringup;
+
+		__cpuhp_kick_ap(st);
+	}
+
+	return ret;
 }
 
 static int cpuhp_kick_ap_work(unsigned int cpu)
 {
 	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
-	enum cpuhp_state state = st->state;
+	enum cpuhp_state prev_state = st->state;
+	int ret;
+
+	cpuhp_lock_acquire(false);
+	cpuhp_lock_release(false);
 
-	trace_cpuhp_enter(cpu, st->target, state, cpuhp_kick_ap_work);
-	lock_map_acquire(&cpuhp_state_lock_map);
-	lock_map_release(&cpuhp_state_lock_map);
-	__cpuhp_kick_ap_work(st);
-	wait_for_completion(&st->done);
-	trace_cpuhp_exit(cpu, st->state, state, st->result);
-	return st->result;
+	cpuhp_lock_acquire(true);
+	cpuhp_lock_release(true);
+
+	trace_cpuhp_enter(cpu, st->target, prev_state, cpuhp_kick_ap_work);
+	ret = cpuhp_kick_ap(st, st->target);
+	trace_cpuhp_exit(cpu, st->state, prev_state, ret);
+
+	return ret;
 }
 
 static struct smp_hotplug_thread cpuhp_threads = {
@@ -581,6 +717,7 @@ static int take_cpu_down(void *_param)
 	struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
 	enum cpuhp_state target = max((int)st->target, CPUHP_AP_OFFLINE);
 	int err, cpu = smp_processor_id();
+	int ret;
 
 	/* Ensure this CPU doesn't handle any more interrupts. */
 	err = __cpu_disable();
@@ -594,8 +731,13 @@ static int take_cpu_down(void *_param)
 	WARN_ON(st->state != CPUHP_TEARDOWN_CPU);
 	st->state--;
 	/* Invoke the former CPU_DYING callbacks */
-	for (; st->state > target; st->state--)
-		cpuhp_invoke_callback(cpu, st->state, false, NULL);
+	for (; st->state > target; st->state--) {
+		ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
+		/*
+		 * DYING must not fail!
+		 */
+		WARN_ON_ONCE(ret);
+	}
 
 	/* Give up timekeeping duties */
 	tick_handover_do_timer();
@@ -639,7 +781,7 @@ static int takedown_cpu(unsigned int cpu)
 	 *
 	 * Wait for the stop thread to go away.
 	 */
-	wait_for_completion(&st->done);
+	wait_for_ap_thread(st, false);
 	BUG_ON(st->state != CPUHP_AP_IDLE_DEAD);
 
 	/* Interrupts are moved away from the dying cpu, reenable alloc/free */
@@ -658,7 +800,7 @@ static void cpuhp_complete_idle_dead(void *arg)
 {
 	struct cpuhp_cpu_state *st = arg;
 
-	complete(&st->done);
+	complete_ap_thread(st, false);
 }
 
 void cpuhp_report_idle_dead(void)
@@ -676,11 +818,32 @@ void cpuhp_report_idle_dead(void)
 			 cpuhp_complete_idle_dead, st, 0);
 }
 
-#else
-#define takedown_cpu NULL
-#endif
+static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st)
+{
+	for (st->state++; st->state < st->target; st->state++) {
+		struct cpuhp_step *step = cpuhp_get_step(st->state);
 
-#ifdef CONFIG_HOTPLUG_CPU
+		if (!step->skip_onerr)
+			cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
+	}
+}
+
+static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
+				enum cpuhp_state target)
+{
+	enum cpuhp_state prev_state = st->state;
+	int ret = 0;
+
+	for (; st->state > target; st->state--) {
+		ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
+		if (ret) {
+			st->target = prev_state;
+			undo_cpu_down(cpu, st);
+			break;
+		}
+	}
+	return ret;
+}
 
 /* Requires cpu_add_remove_lock to be held */
 static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
@@ -699,13 +862,13 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
 
 	cpuhp_tasks_frozen = tasks_frozen;
 
-	prev_state = st->state;
-	st->target = target;
+	prev_state = cpuhp_set_state(st, target);
 	/*
 	 * If the current CPU state is in the range of the AP hotplug thread,
 	 * then we need to kick the thread.
 	 */
 	if (st->state > CPUHP_TEARDOWN_CPU) {
+		st->target = max((int)target, CPUHP_TEARDOWN_CPU);
 		ret = cpuhp_kick_ap_work(cpu);
 		/*
 		 * The AP side has done the error rollback already. Just
@@ -720,6 +883,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
 		 */
 		if (st->state > CPUHP_TEARDOWN_CPU)
 			goto out;
+
+		st->target = target;
 	}
 	/*
 	 * The AP brought itself down to CPUHP_TEARDOWN_CPU. So we need
@@ -727,13 +892,17 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
 	 */
 	ret = cpuhp_down_callbacks(cpu, st, target);
 	if (ret && st->state > CPUHP_TEARDOWN_CPU && st->state < prev_state) {
-		st->target = prev_state;
-		st->rollback = true;
-		cpuhp_kick_ap_work(cpu);
+		cpuhp_reset_state(st, prev_state);
+		__cpuhp_kick_ap(st);
 	}
 
 out:
 	cpus_write_unlock();
+	/*
+	 * Do post unplug cleanup. This is still protected against
+	 * concurrent CPU hotplug via cpu_add_remove_lock.
+	 */
+	lockup_detector_cleanup();
 	return ret;
 }
 
@@ -754,11 +923,15 @@ out:
 	cpu_maps_update_done();
 	return err;
 }
+
 int cpu_down(unsigned int cpu)
 {
 	return do_cpu_down(cpu, CPUHP_OFFLINE);
 }
 EXPORT_SYMBOL(cpu_down);
+
+#else
+#define takedown_cpu NULL
 #endif /*CONFIG_HOTPLUG_CPU*/
 
 /**
@@ -772,11 +945,16 @@ void notify_cpu_starting(unsigned int cpu)
 {
 	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
 	enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE);
+	int ret;
 
 	rcu_cpu_starting(cpu);	/* Enables RCU usage on this CPU. */
 	while (st->state < target) {
 		st->state++;
-		cpuhp_invoke_callback(cpu, st->state, true, NULL);
+		ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
+		/*
+		 * STARTING must not fail!
+		 */
+		WARN_ON_ONCE(ret);
 	}
 }
 
@@ -794,7 +972,7 @@ void cpuhp_online_idle(enum cpuhp_state state)
 		return;
 
 	st->state = CPUHP_AP_ONLINE_IDLE;
-	complete(&st->done);
+	complete_ap_thread(st, true);
 }
 
 /* Requires cpu_add_remove_lock to be held */
@@ -829,7 +1007,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
 
 	cpuhp_tasks_frozen = tasks_frozen;
 
-	st->target = target;
+	cpuhp_set_state(st, target);
 	/*
 	 * If the current CPU state is in the range of the AP hotplug thread,
 	 * then we need to kick the thread once more.
@@ -1296,6 +1474,10 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup,
 	struct cpuhp_step *sp = cpuhp_get_step(state);
 	int ret;
 
+	/*
+	 * If there's nothing to do, we done.
+	 * Relies on the union for multi_instance.
+	 */
 	if ((bringup && !sp->startup.single) ||
 	    (!bringup && !sp->teardown.single))
 		return 0;
@@ -1307,9 +1489,9 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup,
 	if (cpuhp_is_ap_state(state))
 		ret = cpuhp_invoke_ap_callback(cpu, state, bringup, node);
 	else
-		ret = cpuhp_invoke_callback(cpu, state, bringup, node);
+		ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL);
 #else
-	ret = cpuhp_invoke_callback(cpu, state, bringup, node);
+	ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL);
 #endif
 	BUG_ON(ret && !bringup);
 	return ret;
@@ -1641,9 +1823,55 @@ static ssize_t show_cpuhp_target(struct device *dev,
 }
 static DEVICE_ATTR(target, 0644, show_cpuhp_target, write_cpuhp_target);
 
+
+static ssize_t write_cpuhp_fail(struct device *dev,
+				struct device_attribute *attr,
+				const char *buf, size_t count)
+{
+	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
+	struct cpuhp_step *sp;
+	int fail, ret;
+
+	ret = kstrtoint(buf, 10, &fail);
+	if (ret)
+		return ret;
+
+	/*
+	 * Cannot fail STARTING/DYING callbacks.
+	 */
+	if (cpuhp_is_atomic_state(fail))
+		return -EINVAL;
+
+	/*
+	 * Cannot fail anything that doesn't have callbacks.
+	 */
+	mutex_lock(&cpuhp_state_mutex);
+	sp = cpuhp_get_step(fail);
+	if (!sp->startup.single && !sp->teardown.single)
+		ret = -EINVAL;
+	mutex_unlock(&cpuhp_state_mutex);
+	if (ret)
+		return ret;
+
+	st->fail = fail;
+
+	return count;
+}
+
+static ssize_t show_cpuhp_fail(struct device *dev,
+			       struct device_attribute *attr, char *buf)
+{
+	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
+
+	return sprintf(buf, "%d\n", st->fail);
+}
+
+static DEVICE_ATTR(fail, 0644, show_cpuhp_fail, write_cpuhp_fail);
+
 static struct attribute *cpuhp_cpu_attrs[] = {
 	&dev_attr_state.attr,
 	&dev_attr_target.attr,
+	&dev_attr_fail.attr,
 	NULL
 };
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 3e691b75b2db..6bc21e202ae4 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8171,6 +8171,7 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
 		}
 	}
 	event->tp_event->prog = prog;
+	event->tp_event->bpf_prog_owner = event;
 
 	return 0;
 }
@@ -8185,7 +8186,7 @@ static void perf_event_free_bpf_prog(struct perf_event *event)
 		return;
 
 	prog = event->tp_event->prog;
-	if (prog) {
+	if (prog && event->tp_event->bpf_prog_owner == event) {
 		event->tp_event->prog = NULL;
 		bpf_prog_put(prog);
 	}
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index af71a84e12ee..f684d8e5fa2b 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -412,6 +412,19 @@ err:
 	return NULL;
 }
 
+static bool __always_inline rb_need_aux_wakeup(struct ring_buffer *rb)
+{
+	if (rb->aux_overwrite)
+		return false;
+
+	if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) {
+		rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark);
+		return true;
+	}
+
+	return false;
+}
+
 /*
  * Commit the data written by hardware into the ring buffer by adjusting
  * aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the
@@ -451,10 +464,8 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
 	}
 
 	rb->user_page->aux_head = rb->aux_head;
-	if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) {
+	if (rb_need_aux_wakeup(rb))
 		wakeup = true;
-		rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark);
-	}
 
 	if (wakeup) {
 		if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED)
@@ -484,9 +495,8 @@ int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size)
 	rb->aux_head += size;
 
 	rb->user_page->aux_head = rb->aux_head;
-	if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) {
+	if (rb_need_aux_wakeup(rb)) {
 		perf_output_wakeup(handle);
-		rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark);
 		handle->wakeup = rb->aux_wakeup + rb->aux_watermark;
 	}
 
diff --git a/kernel/exit.c b/kernel/exit.c
index 3481ababd06a..f2cd53e92147 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1600,12 +1600,10 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
 	struct waitid_info info = {.status = 0};
 	long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL);
 	int signo = 0;
+
 	if (err > 0) {
 		signo = SIGCHLD;
 		err = 0;
-	}
-
-	if (!err) {
 		if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
 			return -EFAULT;
 	}
@@ -1723,16 +1721,15 @@ COMPAT_SYSCALL_DEFINE5(waitid,
 	if (err > 0) {
 		signo = SIGCHLD;
 		err = 0;
-	}
-
-	if (!err && uru) {
-		/* kernel_waitid() overwrites everything in ru */
-		if (COMPAT_USE_64BIT_TIME)
-			err = copy_to_user(uru, &ru, sizeof(ru));
-		else
-			err = put_compat_rusage(&ru, uru);
-		if (err)
-			return -EFAULT;
+		if (uru) {
+			/* kernel_waitid() overwrites everything in ru */
+			if (COMPAT_USE_64BIT_TIME)
+				err = copy_to_user(uru, &ru, sizeof(ru));
+			else
+				err = put_compat_rusage(&ru, uru);
+			if (err)
+				return -EFAULT;
+		}
 	}
 
 	if (!infop)
diff --git a/kernel/extable.c b/kernel/extable.c
index 38c2412401a1..9aa1cc41ecf7 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -102,15 +102,7 @@ int core_kernel_data(unsigned long addr)
 
 int __kernel_text_address(unsigned long addr)
 {
-	if (core_kernel_text(addr))
-		return 1;
-	if (is_module_text_address(addr))
-		return 1;
-	if (is_ftrace_trampoline(addr))
-		return 1;
-	if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr))
-		return 1;
-	if (is_bpf_text_address(addr))
+	if (kernel_text_address(addr))
 		return 1;
 	/*
 	 * There might be init symbols in saved stacktraces.
@@ -127,17 +119,42 @@ int __kernel_text_address(unsigned long addr)
 
 int kernel_text_address(unsigned long addr)
 {
+	bool no_rcu;
+	int ret = 1;
+
 	if (core_kernel_text(addr))
 		return 1;
+
+	/*
+	 * If a stack dump happens while RCU is not watching, then
+	 * RCU needs to be notified that it requires to start
+	 * watching again. This can happen either by tracing that
+	 * triggers a stack trace, or a WARN() that happens during
+	 * coming back from idle, or cpu on or offlining.
+	 *
+	 * is_module_text_address() as well as the kprobe slots
+	 * and is_bpf_text_address() require RCU to be watching.
+	 */
+	no_rcu = !rcu_is_watching();
+
+	/* Treat this like an NMI as it can happen anywhere */
+	if (no_rcu)
+		rcu_nmi_enter();
+
 	if (is_module_text_address(addr))
-		return 1;
+		goto out;
 	if (is_ftrace_trampoline(addr))
-		return 1;
+		goto out;
 	if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr))
-		return 1;
+		goto out;
 	if (is_bpf_text_address(addr))
-		return 1;
-	return 0;
+		goto out;
+	ret = 0;
+out:
+	if (no_rcu)
+		rcu_nmi_exit();
+
+	return ret;
 }
 
 /*
diff --git a/kernel/fork.c b/kernel/fork.c
index 10646182440f..e702cb9ffbd8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -946,6 +946,24 @@ void mmput(struct mm_struct *mm)
 }
 EXPORT_SYMBOL_GPL(mmput);
 
+#ifdef CONFIG_MMU
+static void mmput_async_fn(struct work_struct *work)
+{
+	struct mm_struct *mm = container_of(work, struct mm_struct,
+					    async_put_work);
+
+	__mmput(mm);
+}
+
+void mmput_async(struct mm_struct *mm)
+{
+	if (atomic_dec_and_test(&mm->mm_users)) {
+		INIT_WORK(&mm->async_put_work, mmput_async_fn);
+		schedule_work(&mm->async_put_work);
+	}
+}
+#endif
+
 /**
  * set_mm_exe_file - change a reference to the mm's executable file
  *
diff --git a/kernel/futex.c b/kernel/futex.c
index 3d38eaf05492..0518a0bfc746 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -821,8 +821,6 @@ static void get_pi_state(struct futex_pi_state *pi_state)
 /*
  * Drops a reference to the pi_state object and frees or caches it
  * when the last reference is gone.
- *
- * Must be called with the hb lock held.
  */
 static void put_pi_state(struct futex_pi_state *pi_state)
 {
@@ -837,16 +835,22 @@ static void put_pi_state(struct futex_pi_state *pi_state)
 	 * and has cleaned up the pi_state already
 	 */
 	if (pi_state->owner) {
-		raw_spin_lock_irq(&pi_state->owner->pi_lock);
-		list_del_init(&pi_state->list);
-		raw_spin_unlock_irq(&pi_state->owner->pi_lock);
+		struct task_struct *owner;
 
-		rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
+		raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+		owner = pi_state->owner;
+		if (owner) {
+			raw_spin_lock(&owner->pi_lock);
+			list_del_init(&pi_state->list);
+			raw_spin_unlock(&owner->pi_lock);
+		}
+		rt_mutex_proxy_unlock(&pi_state->pi_mutex, owner);
+		raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
 	}
 
-	if (current->pi_state_cache)
+	if (current->pi_state_cache) {
 		kfree(pi_state);
-	else {
+	} else {
 		/*
 		 * pi_state->list is already empty.
 		 * clear pi_state->owner.
@@ -907,13 +911,14 @@ void exit_pi_state_list(struct task_struct *curr)
 		raw_spin_unlock_irq(&curr->pi_lock);
 
 		spin_lock(&hb->lock);
-
-		raw_spin_lock_irq(&curr->pi_lock);
+		raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+		raw_spin_lock(&curr->pi_lock);
 		/*
 		 * We dropped the pi-lock, so re-check whether this
 		 * task still owns the PI-state:
 		 */
 		if (head->next != next) {
+			raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
 			spin_unlock(&hb->lock);
 			continue;
 		}
@@ -922,9 +927,10 @@ void exit_pi_state_list(struct task_struct *curr)
 		WARN_ON(list_empty(&pi_state->list));
 		list_del_init(&pi_state->list);
 		pi_state->owner = NULL;
-		raw_spin_unlock_irq(&curr->pi_lock);
+		raw_spin_unlock(&curr->pi_lock);
 
 		get_pi_state(pi_state);
+		raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
 		spin_unlock(&hb->lock);
 
 		rt_mutex_futex_unlock(&pi_state->pi_mutex);
@@ -1208,6 +1214,10 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
 
 	WARN_ON(!list_empty(&pi_state->list));
 	list_add(&pi_state->list, &p->pi_state_list);
+	/*
+	 * Assignment without holding pi_state->pi_mutex.wait_lock is safe
+	 * because there is no concurrency as the object is not published yet.
+	 */
 	pi_state->owner = p;
 	raw_spin_unlock_irq(&p->pi_lock);
 
@@ -2878,6 +2888,7 @@ retry:
 	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
 	spin_unlock(&hb->lock);
 
+	/* drops pi_state->pi_mutex.wait_lock */
 	ret = wake_futex_pi(uaddr, uval, pi_state);
 
 	put_pi_state(pi_state);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index f51b7b6d2451..6fc89fd93824 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -202,7 +202,7 @@ __irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force)
 
 	irqd_clr_managed_shutdown(d);
 
-	if (cpumask_any_and(aff, cpu_online_mask) > nr_cpu_ids) {
+	if (cpumask_any_and(aff, cpu_online_mask) >= nr_cpu_ids) {
 		/*
 		 * Catch code which fiddles with enable_irq() on a managed
 		 * and potentially shutdown IRQ. Chained interrupt
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index f7086b78ad6e..5270a54b9fa4 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c | |||
@@ -322,7 +322,6 @@ int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, | |||
322 | /* Calc pointer to the next generic chip */ | 322 | /* Calc pointer to the next generic chip */ |
323 | tmp += sizeof(*gc) + num_ct * sizeof(struct irq_chip_type); | 323 | tmp += sizeof(*gc) + num_ct * sizeof(struct irq_chip_type); |
324 | } | 324 | } |
325 | d->name = name; | ||
326 | return 0; | 325 | return 0; |
327 | } | 326 | } |
328 | EXPORT_SYMBOL_GPL(__irq_alloc_domain_generic_chips); | 327 | EXPORT_SYMBOL_GPL(__irq_alloc_domain_generic_chips); |
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index e84b7056bb08..ac4644e92b49 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c | |||
@@ -945,7 +945,7 @@ static int virq_debug_show(struct seq_file *m, void *private) | |||
945 | struct irq_desc *desc; | 945 | struct irq_desc *desc; |
946 | struct irq_domain *domain; | 946 | struct irq_domain *domain; |
947 | struct radix_tree_iter iter; | 947 | struct radix_tree_iter iter; |
948 | void **slot; | 948 | void __rcu **slot; |
949 | int i; | 949 | int i; |
950 | 950 | ||
951 | seq_printf(m, " %-16s %-6s %-10s %-10s %s\n", | 951 | seq_printf(m, " %-16s %-6s %-10s %-10s %s\n", |
@@ -1453,7 +1453,7 @@ out_free_desc: | |||
1453 | /* The irq_data was moved, fix the revmap to refer to the new location */ | 1453 | /* The irq_data was moved, fix the revmap to refer to the new location */ |
1454 | static void irq_domain_fix_revmap(struct irq_data *d) | 1454 | static void irq_domain_fix_revmap(struct irq_data *d) |
1455 | { | 1455 | { |
1456 | void **slot; | 1456 | void __rcu **slot; |
1457 | 1457 | ||
1458 | if (d->hwirq < d->domain->revmap_size) | 1458 | if (d->hwirq < d->domain->revmap_size) |
1459 | return; /* Not using radix tree. */ | 1459 | return; /* Not using radix tree. */ |
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 573dc52b0806..d00132b5c325 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
@@ -1643,6 +1643,10 @@ const void *free_irq(unsigned int irq, void *dev_id) | |||
1643 | #endif | 1643 | #endif |
1644 | 1644 | ||
1645 | action = __free_irq(irq, dev_id); | 1645 | action = __free_irq(irq, dev_id); |
1646 | |||
1647 | if (!action) | ||
1648 | return NULL; | ||
1649 | |||
1646 | devname = action->name; | 1650 | devname = action->name; |
1647 | kfree(action); | 1651 | kfree(action); |
1648 | return devname; | 1652 | return devname; |
diff --git a/kernel/kcmp.c b/kernel/kcmp.c index ea34ed8bb952..055bb2962a0b 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c | |||
@@ -131,7 +131,7 @@ static int kcmp_epoll_target(struct task_struct *task1, | |||
131 | if (filp_epoll) { | 131 | if (filp_epoll) { |
132 | filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff); | 132 | filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff); |
133 | fput(filp_epoll); | 133 | fput(filp_epoll); |
134 | } else | 134 | } |
135 | 135 | ||
136 | if (IS_ERR(filp_tgt)) | 136 | if (IS_ERR(filp_tgt)) |
137 | return PTR_ERR(filp_tgt); | 137 | return PTR_ERR(filp_tgt); |
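The kcmp hunk drops a stray `else` that silently made the following IS_ERR() check part of the else branch. A self-contained illustration of the same dangling-else shape (the values are made up; modern compilers flag this pattern with -Wmisleading-indentation):

#include <stdio.h>

int main(void)
{
        int have_file = 1;      /* pretend the epoll file was found...    */
        int err = -22;          /* ...but the target lookup failed        */

        if (have_file) {
                /* use the file, drop the reference, ... */
        } else

        /* The blank line hides that this 'if' is the body of the 'else'
         * above, so the error check only runs when have_file is false -
         * the same shape as the stray 'else' removed in the hunk. */
        if (err)
                printf("error path taken\n");

        printf("done, err=%d\n", err);
        return 0;
}

With the stray else in place, nothing is printed for the error even though err is set; deleting the else, as the hunk does, restores the check unconditionally.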
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 02f660666ab8..1fefe6dcafd7 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c | |||
@@ -613,6 +613,33 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) | |||
613 | DEFINE_WAKE_Q(wake_q); | 613 | DEFINE_WAKE_Q(wake_q); |
614 | 614 | ||
615 | /* | 615 | /* |
616 | * __rwsem_down_write_failed_common(sem) | ||
617 | * rwsem_optimistic_spin(sem) | ||
618 | * osq_unlock(sem->osq) | ||
619 | * ... | ||
620 | * atomic_long_add_return(&sem->count) | ||
621 | * | ||
622 | * - VS - | ||
623 | * | ||
624 | * __up_write() | ||
625 | * if (atomic_long_sub_return_release(&sem->count) < 0) | ||
626 | * rwsem_wake(sem) | ||
627 | * osq_is_locked(&sem->osq) | ||
628 | * | ||
629 | * And __up_write() must observe !osq_is_locked() when it observes the | ||
630 | * atomic_long_add_return() in order to not miss a wakeup. | ||
631 | * | ||
632 | * This boils down to: | ||
633 | * | ||
634 | * [S.rel] X = 1 [RmW] r0 = (Y += 0) | ||
635 | * MB RMB | ||
636 | * [RmW] Y += 1 [L] r1 = X | ||
637 | * | ||
638 | * exists (r0=1 /\ r1=0) | ||
639 | */ | ||
640 | smp_rmb(); | ||
641 | |||
642 | /* | ||
616 | * If a spinner is present, it is not necessary to do the wakeup. | 643 | * If a spinner is present, it is not necessary to do the wakeup. |
617 | * Try to do wakeup only if the trylock succeeds to minimize | 644 | * Try to do wakeup only if the trylock succeeds to minimize |
618 | * spinlock contention which may introduce too much delay in the | 645 | * spinlock contention which may introduce too much delay in the |
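The litmus test in the new comment can be transcribed into portable C11 atomics. The sketch below is an approximation, not the rwsem code: atomic_thread_fence(memory_order_acquire) stands in for the added smp_rmb(), and a seq_cst fetch_add for the full-barrier RMW on sem->count. The assert encodes the forbidden outcome (r0 == 1 && r1 == 0), i.e. the missed wakeup. Build with cc -pthread.

#include <assert.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int X;    /* "osq has been unlocked"  */
static atomic_int Y;    /* stand-in for sem->count  */

/* waiter side: osq_unlock() then atomic_long_add_return(&sem->count) */
static void *waiter(void *arg)
{
        atomic_store_explicit(&X, 1, memory_order_release);
        atomic_fetch_add(&Y, 1);        /* seq_cst RMW ~ full barrier */
        return NULL;
}

int main(void)
{
        pthread_t t;
        int r0, r1;

        pthread_create(&t, NULL, waiter, NULL);

        /* __up_write() / rwsem_wake() side */
        r0 = atomic_fetch_add(&Y, 0);                           /* RMW read of count   */
        atomic_thread_fence(memory_order_acquire);              /* the added smp_rmb() */
        r1 = atomic_load_explicit(&X, memory_order_relaxed);    /* osq_is_locked()     */

        /* forbidden outcome of the litmus test: seeing the count update
         * without the osq release would mean a missed wakeup */
        assert(!(r0 == 1 && r1 == 0));

        pthread_join(t, NULL);
        printf("r0=%d r1=%d\n", r0, r1);
        return 0;
}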
diff --git a/kernel/memremap.c b/kernel/memremap.c index 6bcbfbf1a8fd..403ab9cdb949 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c | |||
@@ -350,7 +350,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, | |||
350 | pgprot_t pgprot = PAGE_KERNEL; | 350 | pgprot_t pgprot = PAGE_KERNEL; |
351 | struct dev_pagemap *pgmap; | 351 | struct dev_pagemap *pgmap; |
352 | struct page_map *page_map; | 352 | struct page_map *page_map; |
353 | int error, nid, is_ram; | 353 | int error, nid, is_ram, i = 0; |
354 | 354 | ||
355 | align_start = res->start & ~(SECTION_SIZE - 1); | 355 | align_start = res->start & ~(SECTION_SIZE - 1); |
356 | align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) | 356 | align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) |
@@ -448,6 +448,8 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, | |||
448 | list_del(&page->lru); | 448 | list_del(&page->lru); |
449 | page->pgmap = pgmap; | 449 | page->pgmap = pgmap; |
450 | percpu_ref_get(ref); | 450 | percpu_ref_get(ref); |
451 | if (!(++i % 1024)) | ||
452 | cond_resched(); | ||
451 | } | 453 | } |
452 | devres_add(dev, page_map); | 454 | devres_add(dev, page_map); |
453 | return __va(res->start); | 455 | return __va(res->start); |
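The memremap hunk throttles a potentially very long per-page loop with cond_resched() every 1024 iterations. A trivial userspace sketch of the same throttling pattern, with sched_yield() standing in for cond_resched():

#include <sched.h>
#include <stdio.h>

int main(void)
{
        unsigned long n = 1UL << 20;    /* pretend number of pages   */
        unsigned long i = 0, pfn;

        for (pfn = 0; pfn < n; pfn++) {
                /* ... per-page setup would go here ... */
                if (!(++i % 1024))      /* same throttle as the hunk */
                        sched_yield();  /* userspace cond_resched()  */
        }
        printf("initialized %lu pages, yielded %lu times\n", n, n / 1024);
        return 0;
}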
diff --git a/kernel/params.c b/kernel/params.c index 60b2d8101355..cc9108c2a1fd 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
@@ -224,7 +224,7 @@ char *parse_args(const char *doing, | |||
224 | } \ | 224 | } \ |
225 | int param_get_##name(char *buffer, const struct kernel_param *kp) \ | 225 | int param_get_##name(char *buffer, const struct kernel_param *kp) \ |
226 | { \ | 226 | { \ |
227 | return scnprintf(buffer, PAGE_SIZE, format, \ | 227 | return scnprintf(buffer, PAGE_SIZE, format "\n", \ |
228 | *((type *)kp->arg)); \ | 228 | *((type *)kp->arg)); \ |
229 | } \ | 229 | } \ |
230 | const struct kernel_param_ops param_ops_##name = { \ | 230 | const struct kernel_param_ops param_ops_##name = { \ |
@@ -236,14 +236,14 @@ char *parse_args(const char *doing, | |||
236 | EXPORT_SYMBOL(param_ops_##name) | 236 | EXPORT_SYMBOL(param_ops_##name) |
237 | 237 | ||
238 | 238 | ||
239 | STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", kstrtou8); | 239 | STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", kstrtou8); |
240 | STANDARD_PARAM_DEF(short, short, "%hi", kstrtos16); | 240 | STANDARD_PARAM_DEF(short, short, "%hi", kstrtos16); |
241 | STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", kstrtou16); | 241 | STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", kstrtou16); |
242 | STANDARD_PARAM_DEF(int, int, "%i", kstrtoint); | 242 | STANDARD_PARAM_DEF(int, int, "%i", kstrtoint); |
243 | STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint); | 243 | STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint); |
244 | STANDARD_PARAM_DEF(long, long, "%li", kstrtol); | 244 | STANDARD_PARAM_DEF(long, long, "%li", kstrtol); |
245 | STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul); | 245 | STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul); |
246 | STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull); | 246 | STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull); |
247 | 247 | ||
248 | int param_set_charp(const char *val, const struct kernel_param *kp) | 248 | int param_set_charp(const char *val, const struct kernel_param *kp) |
249 | { | 249 | { |
@@ -270,7 +270,7 @@ EXPORT_SYMBOL(param_set_charp); | |||
270 | 270 | ||
271 | int param_get_charp(char *buffer, const struct kernel_param *kp) | 271 | int param_get_charp(char *buffer, const struct kernel_param *kp) |
272 | { | 272 | { |
273 | return scnprintf(buffer, PAGE_SIZE, "%s", *((char **)kp->arg)); | 273 | return scnprintf(buffer, PAGE_SIZE, "%s\n", *((char **)kp->arg)); |
274 | } | 274 | } |
275 | EXPORT_SYMBOL(param_get_charp); | 275 | EXPORT_SYMBOL(param_get_charp); |
276 | 276 | ||
@@ -301,7 +301,7 @@ EXPORT_SYMBOL(param_set_bool); | |||
301 | int param_get_bool(char *buffer, const struct kernel_param *kp) | 301 | int param_get_bool(char *buffer, const struct kernel_param *kp) |
302 | { | 302 | { |
303 | /* Y and N chosen as being relatively non-coder friendly */ | 303 | /* Y and N chosen as being relatively non-coder friendly */ |
304 | return sprintf(buffer, "%c", *(bool *)kp->arg ? 'Y' : 'N'); | 304 | return sprintf(buffer, "%c\n", *(bool *)kp->arg ? 'Y' : 'N'); |
305 | } | 305 | } |
306 | EXPORT_SYMBOL(param_get_bool); | 306 | EXPORT_SYMBOL(param_get_bool); |
307 | 307 | ||
@@ -360,7 +360,7 @@ EXPORT_SYMBOL(param_set_invbool); | |||
360 | 360 | ||
361 | int param_get_invbool(char *buffer, const struct kernel_param *kp) | 361 | int param_get_invbool(char *buffer, const struct kernel_param *kp) |
362 | { | 362 | { |
363 | return sprintf(buffer, "%c", (*(bool *)kp->arg) ? 'N' : 'Y'); | 363 | return sprintf(buffer, "%c\n", (*(bool *)kp->arg) ? 'N' : 'Y'); |
364 | } | 364 | } |
365 | EXPORT_SYMBOL(param_get_invbool); | 365 | EXPORT_SYMBOL(param_get_invbool); |
366 | 366 | ||
@@ -460,8 +460,9 @@ static int param_array_get(char *buffer, const struct kernel_param *kp) | |||
460 | struct kernel_param p = *kp; | 460 | struct kernel_param p = *kp; |
461 | 461 | ||
462 | for (i = off = 0; i < (arr->num ? *arr->num : arr->max); i++) { | 462 | for (i = off = 0; i < (arr->num ? *arr->num : arr->max); i++) { |
463 | /* Replace \n with comma */ | ||
463 | if (i) | 464 | if (i) |
464 | buffer[off++] = ','; | 465 | buffer[off - 1] = ','; |
465 | p.arg = arr->elem + arr->elemsize * i; | 466 | p.arg = arr->elem + arr->elemsize * i; |
466 | check_kparam_locked(p.mod); | 467 | check_kparam_locked(p.mod); |
467 | ret = arr->ops->get(buffer + off, &p); | 468 | ret = arr->ops->get(buffer + off, &p); |
@@ -507,7 +508,7 @@ EXPORT_SYMBOL(param_set_copystring); | |||
507 | int param_get_string(char *buffer, const struct kernel_param *kp) | 508 | int param_get_string(char *buffer, const struct kernel_param *kp) |
508 | { | 509 | { |
509 | const struct kparam_string *kps = kp->str; | 510 | const struct kparam_string *kps = kp->str; |
510 | return strlcpy(buffer, kps->string, kps->maxlen); | 511 | return scnprintf(buffer, PAGE_SIZE, "%s\n", kps->string); |
511 | } | 512 | } |
512 | EXPORT_SYMBOL(param_get_string); | 513 | EXPORT_SYMBOL(param_get_string); |
513 | 514 | ||
@@ -549,10 +550,6 @@ static ssize_t param_attr_show(struct module_attribute *mattr, | |||
549 | kernel_param_lock(mk->mod); | 550 | kernel_param_lock(mk->mod); |
550 | count = attribute->param->ops->get(buf, attribute->param); | 551 | count = attribute->param->ops->get(buf, attribute->param); |
551 | kernel_param_unlock(mk->mod); | 552 | kernel_param_unlock(mk->mod); |
552 | if (count > 0) { | ||
553 | strcat(buf, "\n"); | ||
554 | ++count; | ||
555 | } | ||
556 | return count; | 553 | return count; |
557 | } | 554 | } |
558 | 555 | ||
@@ -600,7 +597,7 @@ EXPORT_SYMBOL(kernel_param_unlock); | |||
600 | /* | 597 | /* |
601 | * add_sysfs_param - add a parameter to sysfs | 598 | * add_sysfs_param - add a parameter to sysfs |
602 | * @mk: struct module_kobject | 599 | * @mk: struct module_kobject |
603 | * @kparam: the actual parameter definition to add to sysfs | 600 | * @kp: the actual parameter definition to add to sysfs |
604 | * @name: name of parameter | 601 | * @name: name of parameter |
605 | * | 602 | * |
606 | * Create a kobject if for a (per-module) parameter if mp NULL, and | 603 | * Create a kobject if for a (per-module) parameter if mp NULL, and |
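The params.c changes move the trailing newline into the individual param_get_*() helpers instead of appending it in param_attr_show(), so sysfs reads are unchanged for the simple cases. A small reader as a sketch, assuming the printk.time boolean parameter exists on the running kernel:

#include <stdio.h>

int main(void)
{
        /* any boolean module parameter behaves the same way */
        FILE *f = fopen("/sys/module/printk/parameters/time", "r");
        char buf[16];

        if (!f) {
                perror("fopen");
                return 1;
        }
        if (fgets(buf, sizeof(buf), f))
                /* param_get_bool() now emits "Y\n" / "N\n" itself, so the
                 * value already arrives newline-terminated */
                printf("printk.time = %s", buf);
        fclose(f);
        return 0;
}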
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 3e2b4f519009..ccd2d20e6b06 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c | |||
@@ -120,22 +120,26 @@ static void s2idle_loop(void) | |||
120 | * frozen processes + suspended devices + idle processors. | 120 | * frozen processes + suspended devices + idle processors. |
121 | * Thus s2idle_enter() should be called right after | 121 | * Thus s2idle_enter() should be called right after |
122 | * all devices have been suspended. | 122 | * all devices have been suspended. |
123 | * | ||
124 | * Wakeups during the noirq suspend of devices may be spurious, | ||
125 | * so prevent them from terminating the loop right away. | ||
123 | */ | 126 | */ |
124 | error = dpm_noirq_suspend_devices(PMSG_SUSPEND); | 127 | error = dpm_noirq_suspend_devices(PMSG_SUSPEND); |
125 | if (!error) | 128 | if (!error) |
126 | s2idle_enter(); | 129 | s2idle_enter(); |
130 | else if (error == -EBUSY && pm_wakeup_pending()) | ||
131 | error = 0; | ||
127 | 132 | ||
128 | dpm_noirq_resume_devices(PMSG_RESUME); | 133 | if (!error && s2idle_ops && s2idle_ops->wake) |
129 | if (error && (error != -EBUSY || !pm_wakeup_pending())) { | ||
130 | dpm_noirq_end(); | ||
131 | break; | ||
132 | } | ||
133 | |||
134 | if (s2idle_ops && s2idle_ops->wake) | ||
135 | s2idle_ops->wake(); | 134 | s2idle_ops->wake(); |
136 | 135 | ||
136 | dpm_noirq_resume_devices(PMSG_RESUME); | ||
137 | |||
137 | dpm_noirq_end(); | 138 | dpm_noirq_end(); |
138 | 139 | ||
140 | if (error) | ||
141 | break; | ||
142 | |||
139 | if (s2idle_ops && s2idle_ops->sync) | 143 | if (s2idle_ops && s2idle_ops->sync) |
140 | s2idle_ops->sync(); | 144 | s2idle_ops->sync(); |
141 | 145 | ||
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 1250e4bd4b85..b0ad62b0e7b8 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c | |||
@@ -882,6 +882,11 @@ void rcu_irq_exit(void) | |||
882 | 882 | ||
883 | RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_exit() invoked with irqs enabled!!!"); | 883 | RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_exit() invoked with irqs enabled!!!"); |
884 | rdtp = this_cpu_ptr(&rcu_dynticks); | 884 | rdtp = this_cpu_ptr(&rcu_dynticks); |
885 | |||
886 | /* Page faults can happen in NMI handlers, so check... */ | ||
887 | if (rdtp->dynticks_nmi_nesting) | ||
888 | return; | ||
889 | |||
885 | WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && | 890 | WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && |
886 | rdtp->dynticks_nesting < 1); | 891 | rdtp->dynticks_nesting < 1); |
887 | if (rdtp->dynticks_nesting <= 1) { | 892 | if (rdtp->dynticks_nesting <= 1) { |
@@ -1015,6 +1020,11 @@ void rcu_irq_enter(void) | |||
1015 | 1020 | ||
1016 | RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_enter() invoked with irqs enabled!!!"); | 1021 | RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_enter() invoked with irqs enabled!!!"); |
1017 | rdtp = this_cpu_ptr(&rcu_dynticks); | 1022 | rdtp = this_cpu_ptr(&rcu_dynticks); |
1023 | |||
1024 | /* Page faults can happen in NMI handlers, so check... */ | ||
1025 | if (rdtp->dynticks_nmi_nesting) | ||
1026 | return; | ||
1027 | |||
1018 | oldval = rdtp->dynticks_nesting; | 1028 | oldval = rdtp->dynticks_nesting; |
1019 | rdtp->dynticks_nesting++; | 1029 | rdtp->dynticks_nesting++; |
1020 | WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && | 1030 | WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 18a6966567da..d17c5da523a0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -5166,6 +5166,28 @@ void sched_show_task(struct task_struct *p) | |||
5166 | put_task_stack(p); | 5166 | put_task_stack(p); |
5167 | } | 5167 | } |
5168 | 5168 | ||
5169 | static inline bool | ||
5170 | state_filter_match(unsigned long state_filter, struct task_struct *p) | ||
5171 | { | ||
5172 | /* no filter, everything matches */ | ||
5173 | if (!state_filter) | ||
5174 | return true; | ||
5175 | |||
5176 | /* filter, but doesn't match */ | ||
5177 | if (!(p->state & state_filter)) | ||
5178 | return false; | ||
5179 | |||
5180 | /* | ||
5181 | * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows | ||
5182 | * TASK_KILLABLE). | ||
5183 | */ | ||
5184 | if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE) | ||
5185 | return false; | ||
5186 | |||
5187 | return true; | ||
5188 | } | ||
5189 | |||
5190 | |||
5169 | void show_state_filter(unsigned long state_filter) | 5191 | void show_state_filter(unsigned long state_filter) |
5170 | { | 5192 | { |
5171 | struct task_struct *g, *p; | 5193 | struct task_struct *g, *p; |
@@ -5188,7 +5210,7 @@ void show_state_filter(unsigned long state_filter) | |||
5188 | */ | 5210 | */ |
5189 | touch_nmi_watchdog(); | 5211 | touch_nmi_watchdog(); |
5190 | touch_all_softlockup_watchdogs(); | 5212 | touch_all_softlockup_watchdogs(); |
5191 | if (!state_filter || (p->state & state_filter)) | 5213 | if (state_filter_match(state_filter, p)) |
5192 | sched_show_task(p); | 5214 | sched_show_task(p); |
5193 | } | 5215 | } |
5194 | 5216 | ||
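state_filter_match() keeps TASK_IDLE kthreads out of a TASK_UNINTERRUPTIBLE dump, which is what the sysrq 'w' trigger produces. A minimal way to request that dump, assuming root and a kernel.sysrq setting that permits 'w':

#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/proc/sysrq-trigger", "w");

        if (!f) {
                perror("/proc/sysrq-trigger");
                return 1;
        }
        /* 'w' dumps uninterruptible (blocked) tasks; with the hunk above,
         * TASK_IDLE kthreads are filtered out of that dump */
        fputc('w', f);
        fclose(f);
        return 0;
}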
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 01217fb5a5de..2f93e4a2d9f6 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c | |||
@@ -466,8 +466,6 @@ static char *task_group_path(struct task_group *tg) | |||
466 | } | 466 | } |
467 | #endif | 467 | #endif |
468 | 468 | ||
469 | static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; | ||
470 | |||
471 | static void | 469 | static void |
472 | print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) | 470 | print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) |
473 | { | 471 | { |
diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 98b59b5db90b..bb3a38005b9c 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c | |||
@@ -17,11 +17,13 @@ | |||
17 | #include <linux/audit.h> | 17 | #include <linux/audit.h> |
18 | #include <linux/compat.h> | 18 | #include <linux/compat.h> |
19 | #include <linux/coredump.h> | 19 | #include <linux/coredump.h> |
20 | #include <linux/kmemleak.h> | ||
20 | #include <linux/sched.h> | 21 | #include <linux/sched.h> |
21 | #include <linux/sched/task_stack.h> | 22 | #include <linux/sched/task_stack.h> |
22 | #include <linux/seccomp.h> | 23 | #include <linux/seccomp.h> |
23 | #include <linux/slab.h> | 24 | #include <linux/slab.h> |
24 | #include <linux/syscalls.h> | 25 | #include <linux/syscalls.h> |
26 | #include <linux/sysctl.h> | ||
25 | 27 | ||
26 | #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER | 28 | #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER |
27 | #include <asm/syscall.h> | 29 | #include <asm/syscall.h> |
@@ -42,6 +44,7 @@ | |||
42 | * get/put helpers should be used when accessing an instance | 44 | * get/put helpers should be used when accessing an instance |
43 | * outside of a lifetime-guarded section. In general, this | 45 | * outside of a lifetime-guarded section. In general, this |
44 | * is only needed for handling filters shared across tasks. | 46 | * is only needed for handling filters shared across tasks. |
47 | * @log: true if all actions except for SECCOMP_RET_ALLOW should be logged | ||
45 | * @prev: points to a previously installed, or inherited, filter | 48 | * @prev: points to a previously installed, or inherited, filter |
46 | * @prog: the BPF program to evaluate | 49 | * @prog: the BPF program to evaluate |
47 | * | 50 | * |
@@ -57,6 +60,7 @@ | |||
57 | */ | 60 | */ |
58 | struct seccomp_filter { | 61 | struct seccomp_filter { |
59 | refcount_t usage; | 62 | refcount_t usage; |
63 | bool log; | ||
60 | struct seccomp_filter *prev; | 64 | struct seccomp_filter *prev; |
61 | struct bpf_prog *prog; | 65 | struct bpf_prog *prog; |
62 | }; | 66 | }; |
@@ -171,10 +175,15 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) | |||
171 | /** | 175 | /** |
172 | * seccomp_run_filters - evaluates all seccomp filters against @sd | 176 | * seccomp_run_filters - evaluates all seccomp filters against @sd |
173 | * @sd: optional seccomp data to be passed to filters | 177 | * @sd: optional seccomp data to be passed to filters |
178 | * @match: stores struct seccomp_filter that resulted in the return value, | ||
179 | * unless filter returned SECCOMP_RET_ALLOW, in which case it will | ||
180 | * be unchanged. | ||
174 | * | 181 | * |
175 | * Returns valid seccomp BPF response codes. | 182 | * Returns valid seccomp BPF response codes. |
176 | */ | 183 | */ |
177 | static u32 seccomp_run_filters(const struct seccomp_data *sd) | 184 | #define ACTION_ONLY(ret) ((s32)((ret) & (SECCOMP_RET_ACTION_FULL))) |
185 | static u32 seccomp_run_filters(const struct seccomp_data *sd, | ||
186 | struct seccomp_filter **match) | ||
178 | { | 187 | { |
179 | struct seccomp_data sd_local; | 188 | struct seccomp_data sd_local; |
180 | u32 ret = SECCOMP_RET_ALLOW; | 189 | u32 ret = SECCOMP_RET_ALLOW; |
@@ -184,7 +193,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd) | |||
184 | 193 | ||
185 | /* Ensure unexpected behavior doesn't result in failing open. */ | 194 | /* Ensure unexpected behavior doesn't result in failing open. */ |
186 | if (unlikely(WARN_ON(f == NULL))) | 195 | if (unlikely(WARN_ON(f == NULL))) |
187 | return SECCOMP_RET_KILL; | 196 | return SECCOMP_RET_KILL_PROCESS; |
188 | 197 | ||
189 | if (!sd) { | 198 | if (!sd) { |
190 | populate_seccomp_data(&sd_local); | 199 | populate_seccomp_data(&sd_local); |
@@ -198,8 +207,10 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd) | |||
198 | for (; f; f = f->prev) { | 207 | for (; f; f = f->prev) { |
199 | u32 cur_ret = BPF_PROG_RUN(f->prog, sd); | 208 | u32 cur_ret = BPF_PROG_RUN(f->prog, sd); |
200 | 209 | ||
201 | if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) | 210 | if (ACTION_ONLY(cur_ret) < ACTION_ONLY(ret)) { |
202 | ret = cur_ret; | 211 | ret = cur_ret; |
212 | *match = f; | ||
213 | } | ||
203 | } | 214 | } |
204 | return ret; | 215 | return ret; |
205 | } | 216 | } |
@@ -444,6 +455,10 @@ static long seccomp_attach_filter(unsigned int flags, | |||
444 | return ret; | 455 | return ret; |
445 | } | 456 | } |
446 | 457 | ||
458 | /* Set log flag, if present. */ | ||
459 | if (flags & SECCOMP_FILTER_FLAG_LOG) | ||
460 | filter->log = true; | ||
461 | |||
447 | /* | 462 | /* |
448 | * If there is an existing filter, make it the prev and don't drop its | 463 | * If there is an existing filter, make it the prev and don't drop its |
449 | * task reference. | 464 | * task reference. |
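A userspace sketch of the new SECCOMP_FILTER_FLAG_LOG flag: install a trivial filter that returns SECCOMP_RET_LOG for everything, so syscalls still run but the action is logged (subject to the actions_logged sysctl added later in this file). The #ifndef fallbacks are for uapi headers that predate this series; their values are taken from the series and should be treated as assumptions if your headers differ.

#include <linux/filter.h>
#include <linux/seccomp.h>
#include <stdio.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef SECCOMP_FILTER_FLAG_LOG                 /* fallback for older headers */
#define SECCOMP_FILTER_FLAG_LOG (1UL << 1)
#endif
#ifndef SECCOMP_RET_LOG
#define SECCOMP_RET_LOG 0x7ffc0000U
#endif

int main(void)
{
        /* log-everything filter: every syscall returns SECCOMP_RET_LOG,
         * so execution continues but the action is audited */
        struct sock_filter insns[] = {
                BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_LOG),
        };
        struct sock_fprog prog = {
                .len = sizeof(insns) / sizeof(insns[0]),
                .filter = insns,
        };

        if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
                perror("PR_SET_NO_NEW_PRIVS");
        if (syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER,
                    SECCOMP_FILTER_FLAG_LOG, &prog))
                perror("SECCOMP_SET_MODE_FILTER");

        getpid();       /* should show up as a logged seccomp action */
        return 0;
}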
@@ -458,14 +473,19 @@ static long seccomp_attach_filter(unsigned int flags, | |||
458 | return 0; | 473 | return 0; |
459 | } | 474 | } |
460 | 475 | ||
476 | void __get_seccomp_filter(struct seccomp_filter *filter) | ||
477 | { | ||
478 | /* Reference count is bounded by the number of total processes. */ | ||
479 | refcount_inc(&filter->usage); | ||
480 | } | ||
481 | |||
461 | /* get_seccomp_filter - increments the reference count of the filter on @tsk */ | 482 | /* get_seccomp_filter - increments the reference count of the filter on @tsk */ |
462 | void get_seccomp_filter(struct task_struct *tsk) | 483 | void get_seccomp_filter(struct task_struct *tsk) |
463 | { | 484 | { |
464 | struct seccomp_filter *orig = tsk->seccomp.filter; | 485 | struct seccomp_filter *orig = tsk->seccomp.filter; |
465 | if (!orig) | 486 | if (!orig) |
466 | return; | 487 | return; |
467 | /* Reference count is bounded by the number of total processes. */ | 488 | __get_seccomp_filter(orig); |
468 | refcount_inc(&orig->usage); | ||
469 | } | 489 | } |
470 | 490 | ||
471 | static inline void seccomp_filter_free(struct seccomp_filter *filter) | 491 | static inline void seccomp_filter_free(struct seccomp_filter *filter) |
@@ -476,10 +496,8 @@ static inline void seccomp_filter_free(struct seccomp_filter *filter) | |||
476 | } | 496 | } |
477 | } | 497 | } |
478 | 498 | ||
479 | /* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */ | 499 | static void __put_seccomp_filter(struct seccomp_filter *orig) |
480 | void put_seccomp_filter(struct task_struct *tsk) | ||
481 | { | 500 | { |
482 | struct seccomp_filter *orig = tsk->seccomp.filter; | ||
483 | /* Clean up single-reference branches iteratively. */ | 501 | /* Clean up single-reference branches iteratively. */ |
484 | while (orig && refcount_dec_and_test(&orig->usage)) { | 502 | while (orig && refcount_dec_and_test(&orig->usage)) { |
485 | struct seccomp_filter *freeme = orig; | 503 | struct seccomp_filter *freeme = orig; |
@@ -488,6 +506,12 @@ void put_seccomp_filter(struct task_struct *tsk) | |||
488 | } | 506 | } |
489 | } | 507 | } |
490 | 508 | ||
509 | /* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */ | ||
510 | void put_seccomp_filter(struct task_struct *tsk) | ||
511 | { | ||
512 | __put_seccomp_filter(tsk->seccomp.filter); | ||
513 | } | ||
514 | |||
491 | static void seccomp_init_siginfo(siginfo_t *info, int syscall, int reason) | 515 | static void seccomp_init_siginfo(siginfo_t *info, int syscall, int reason) |
492 | { | 516 | { |
493 | memset(info, 0, sizeof(*info)); | 517 | memset(info, 0, sizeof(*info)); |
@@ -514,6 +538,65 @@ static void seccomp_send_sigsys(int syscall, int reason) | |||
514 | } | 538 | } |
515 | #endif /* CONFIG_SECCOMP_FILTER */ | 539 | #endif /* CONFIG_SECCOMP_FILTER */ |
516 | 540 | ||
541 | /* For use with seccomp_actions_logged */ | ||
542 | #define SECCOMP_LOG_KILL_PROCESS (1 << 0) | ||
543 | #define SECCOMP_LOG_KILL_THREAD (1 << 1) | ||
544 | #define SECCOMP_LOG_TRAP (1 << 2) | ||
545 | #define SECCOMP_LOG_ERRNO (1 << 3) | ||
546 | #define SECCOMP_LOG_TRACE (1 << 4) | ||
547 | #define SECCOMP_LOG_LOG (1 << 5) | ||
548 | #define SECCOMP_LOG_ALLOW (1 << 6) | ||
549 | |||
550 | static u32 seccomp_actions_logged = SECCOMP_LOG_KILL_PROCESS | | ||
551 | SECCOMP_LOG_KILL_THREAD | | ||
552 | SECCOMP_LOG_TRAP | | ||
553 | SECCOMP_LOG_ERRNO | | ||
554 | SECCOMP_LOG_TRACE | | ||
555 | SECCOMP_LOG_LOG; | ||
556 | |||
557 | static inline void seccomp_log(unsigned long syscall, long signr, u32 action, | ||
558 | bool requested) | ||
559 | { | ||
560 | bool log = false; | ||
561 | |||
562 | switch (action) { | ||
563 | case SECCOMP_RET_ALLOW: | ||
564 | break; | ||
565 | case SECCOMP_RET_TRAP: | ||
566 | log = requested && seccomp_actions_logged & SECCOMP_LOG_TRAP; | ||
567 | break; | ||
568 | case SECCOMP_RET_ERRNO: | ||
569 | log = requested && seccomp_actions_logged & SECCOMP_LOG_ERRNO; | ||
570 | break; | ||
571 | case SECCOMP_RET_TRACE: | ||
572 | log = requested && seccomp_actions_logged & SECCOMP_LOG_TRACE; | ||
573 | break; | ||
574 | case SECCOMP_RET_LOG: | ||
575 | log = seccomp_actions_logged & SECCOMP_LOG_LOG; | ||
576 | break; | ||
577 | case SECCOMP_RET_KILL_THREAD: | ||
578 | log = seccomp_actions_logged & SECCOMP_LOG_KILL_THREAD; | ||
579 | break; | ||
580 | case SECCOMP_RET_KILL_PROCESS: | ||
581 | default: | ||
582 | log = seccomp_actions_logged & SECCOMP_LOG_KILL_PROCESS; | ||
583 | } | ||
584 | |||
585 | /* | ||
586 | * Force an audit message to be emitted when the action is RET_KILL_*, | ||
587 | * RET_LOG, or the FILTER_FLAG_LOG bit was set and the action is | ||
588 | * allowed to be logged by the admin. | ||
589 | */ | ||
590 | if (log) | ||
591 | return __audit_seccomp(syscall, signr, action); | ||
592 | |||
593 | /* | ||
594 | * Let the audit subsystem decide if the action should be audited based | ||
595 | * on whether the current task itself is being audited. | ||
596 | */ | ||
597 | return audit_seccomp(syscall, signr, action); | ||
598 | } | ||
599 | |||
517 | /* | 600 | /* |
518 | * Secure computing mode 1 allows only read/write/exit/sigreturn. | 601 | * Secure computing mode 1 allows only read/write/exit/sigreturn. |
519 | * To be fully secure this must be combined with rlimit | 602 | * To be fully secure this must be combined with rlimit |
@@ -539,7 +622,7 @@ static void __secure_computing_strict(int this_syscall) | |||
539 | #ifdef SECCOMP_DEBUG | 622 | #ifdef SECCOMP_DEBUG |
540 | dump_stack(); | 623 | dump_stack(); |
541 | #endif | 624 | #endif |
542 | audit_seccomp(this_syscall, SIGKILL, SECCOMP_RET_KILL); | 625 | seccomp_log(this_syscall, SIGKILL, SECCOMP_RET_KILL_THREAD, true); |
543 | do_exit(SIGKILL); | 626 | do_exit(SIGKILL); |
544 | } | 627 | } |
545 | 628 | ||
@@ -566,6 +649,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, | |||
566 | const bool recheck_after_trace) | 649 | const bool recheck_after_trace) |
567 | { | 650 | { |
568 | u32 filter_ret, action; | 651 | u32 filter_ret, action; |
652 | struct seccomp_filter *match = NULL; | ||
569 | int data; | 653 | int data; |
570 | 654 | ||
571 | /* | 655 | /* |
@@ -574,9 +658,9 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, | |||
574 | */ | 658 | */ |
575 | rmb(); | 659 | rmb(); |
576 | 660 | ||
577 | filter_ret = seccomp_run_filters(sd); | 661 | filter_ret = seccomp_run_filters(sd, &match); |
578 | data = filter_ret & SECCOMP_RET_DATA; | 662 | data = filter_ret & SECCOMP_RET_DATA; |
579 | action = filter_ret & SECCOMP_RET_ACTION; | 663 | action = filter_ret & SECCOMP_RET_ACTION_FULL; |
580 | 664 | ||
581 | switch (action) { | 665 | switch (action) { |
582 | case SECCOMP_RET_ERRNO: | 666 | case SECCOMP_RET_ERRNO: |
@@ -637,14 +721,25 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, | |||
637 | 721 | ||
638 | return 0; | 722 | return 0; |
639 | 723 | ||
724 | case SECCOMP_RET_LOG: | ||
725 | seccomp_log(this_syscall, 0, action, true); | ||
726 | return 0; | ||
727 | |||
640 | case SECCOMP_RET_ALLOW: | 728 | case SECCOMP_RET_ALLOW: |
729 | /* | ||
730 | * Note that the "match" filter will always be NULL for | ||
731 | * this action since SECCOMP_RET_ALLOW is the starting | ||
732 | * state in seccomp_run_filters(). | ||
733 | */ | ||
641 | return 0; | 734 | return 0; |
642 | 735 | ||
643 | case SECCOMP_RET_KILL: | 736 | case SECCOMP_RET_KILL_THREAD: |
737 | case SECCOMP_RET_KILL_PROCESS: | ||
644 | default: | 738 | default: |
645 | audit_seccomp(this_syscall, SIGSYS, action); | 739 | seccomp_log(this_syscall, SIGSYS, action, true); |
646 | /* Dump core only if this is the last remaining thread. */ | 740 | /* Dump core only if this is the last remaining thread. */ |
647 | if (get_nr_threads(current) == 1) { | 741 | if (action == SECCOMP_RET_KILL_PROCESS || |
742 | get_nr_threads(current) == 1) { | ||
648 | siginfo_t info; | 743 | siginfo_t info; |
649 | 744 | ||
650 | /* Show the original registers in the dump. */ | 745 | /* Show the original registers in the dump. */ |
@@ -653,13 +748,16 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, | |||
653 | seccomp_init_siginfo(&info, this_syscall, data); | 748 | seccomp_init_siginfo(&info, this_syscall, data); |
654 | do_coredump(&info); | 749 | do_coredump(&info); |
655 | } | 750 | } |
656 | do_exit(SIGSYS); | 751 | if (action == SECCOMP_RET_KILL_PROCESS) |
752 | do_group_exit(SIGSYS); | ||
753 | else | ||
754 | do_exit(SIGSYS); | ||
657 | } | 755 | } |
658 | 756 | ||
659 | unreachable(); | 757 | unreachable(); |
660 | 758 | ||
661 | skip: | 759 | skip: |
662 | audit_seccomp(this_syscall, 0, action); | 760 | seccomp_log(this_syscall, 0, action, match ? match->log : false); |
663 | return -1; | 761 | return -1; |
664 | } | 762 | } |
665 | #else | 763 | #else |
@@ -794,6 +892,29 @@ static inline long seccomp_set_mode_filter(unsigned int flags, | |||
794 | } | 892 | } |
795 | #endif | 893 | #endif |
796 | 894 | ||
895 | static long seccomp_get_action_avail(const char __user *uaction) | ||
896 | { | ||
897 | u32 action; | ||
898 | |||
899 | if (copy_from_user(&action, uaction, sizeof(action))) | ||
900 | return -EFAULT; | ||
901 | |||
902 | switch (action) { | ||
903 | case SECCOMP_RET_KILL_PROCESS: | ||
904 | case SECCOMP_RET_KILL_THREAD: | ||
905 | case SECCOMP_RET_TRAP: | ||
906 | case SECCOMP_RET_ERRNO: | ||
907 | case SECCOMP_RET_TRACE: | ||
908 | case SECCOMP_RET_LOG: | ||
909 | case SECCOMP_RET_ALLOW: | ||
910 | break; | ||
911 | default: | ||
912 | return -EOPNOTSUPP; | ||
913 | } | ||
914 | |||
915 | return 0; | ||
916 | } | ||
917 | |||
797 | /* Common entry point for both prctl and syscall. */ | 918 | /* Common entry point for both prctl and syscall. */ |
798 | static long do_seccomp(unsigned int op, unsigned int flags, | 919 | static long do_seccomp(unsigned int op, unsigned int flags, |
799 | const char __user *uargs) | 920 | const char __user *uargs) |
@@ -805,6 +926,11 @@ static long do_seccomp(unsigned int op, unsigned int flags, | |||
805 | return seccomp_set_mode_strict(); | 926 | return seccomp_set_mode_strict(); |
806 | case SECCOMP_SET_MODE_FILTER: | 927 | case SECCOMP_SET_MODE_FILTER: |
807 | return seccomp_set_mode_filter(flags, uargs); | 928 | return seccomp_set_mode_filter(flags, uargs); |
929 | case SECCOMP_GET_ACTION_AVAIL: | ||
930 | if (flags != 0) | ||
931 | return -EINVAL; | ||
932 | |||
933 | return seccomp_get_action_avail(uargs); | ||
808 | default: | 934 | default: |
809 | return -EINVAL; | 935 | return -EINVAL; |
810 | } | 936 | } |
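With SECCOMP_GET_ACTION_AVAIL wired into do_seccomp(), userspace can probe whether a return action is known to the running kernel before installing a filter that uses it. A small probe for SECCOMP_RET_LOG; the #ifndef fallbacks are for older uapi headers and assume the values from this series:

#include <linux/seccomp.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef SECCOMP_GET_ACTION_AVAIL                /* fallbacks for older headers */
#define SECCOMP_GET_ACTION_AVAIL 2
#endif
#ifndef SECCOMP_RET_LOG
#define SECCOMP_RET_LOG 0x7ffc0000U
#endif

int main(void)
{
        unsigned int action = SECCOMP_RET_LOG;

        if (syscall(__NR_seccomp, SECCOMP_GET_ACTION_AVAIL, 0, &action) == 0)
                printf("SECCOMP_RET_LOG is available on this kernel\n");
        else
                perror("SECCOMP_GET_ACTION_AVAIL");
        return 0;
}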
@@ -908,13 +1034,13 @@ long seccomp_get_filter(struct task_struct *task, unsigned long filter_off, | |||
908 | if (!data) | 1034 | if (!data) |
909 | goto out; | 1035 | goto out; |
910 | 1036 | ||
911 | get_seccomp_filter(task); | 1037 | __get_seccomp_filter(filter); |
912 | spin_unlock_irq(&task->sighand->siglock); | 1038 | spin_unlock_irq(&task->sighand->siglock); |
913 | 1039 | ||
914 | if (copy_to_user(data, fprog->filter, bpf_classic_proglen(fprog))) | 1040 | if (copy_to_user(data, fprog->filter, bpf_classic_proglen(fprog))) |
915 | ret = -EFAULT; | 1041 | ret = -EFAULT; |
916 | 1042 | ||
917 | put_seccomp_filter(task); | 1043 | __put_seccomp_filter(filter); |
918 | return ret; | 1044 | return ret; |
919 | 1045 | ||
920 | out: | 1046 | out: |
@@ -922,3 +1048,185 @@ out: | |||
922 | return ret; | 1048 | return ret; |
923 | } | 1049 | } |
924 | #endif | 1050 | #endif |
1051 | |||
1052 | #ifdef CONFIG_SYSCTL | ||
1053 | |||
1054 | /* Human readable action names for friendly sysctl interaction */ | ||
1055 | #define SECCOMP_RET_KILL_PROCESS_NAME "kill_process" | ||
1056 | #define SECCOMP_RET_KILL_THREAD_NAME "kill_thread" | ||
1057 | #define SECCOMP_RET_TRAP_NAME "trap" | ||
1058 | #define SECCOMP_RET_ERRNO_NAME "errno" | ||
1059 | #define SECCOMP_RET_TRACE_NAME "trace" | ||
1060 | #define SECCOMP_RET_LOG_NAME "log" | ||
1061 | #define SECCOMP_RET_ALLOW_NAME "allow" | ||
1062 | |||
1063 | static const char seccomp_actions_avail[] = | ||
1064 | SECCOMP_RET_KILL_PROCESS_NAME " " | ||
1065 | SECCOMP_RET_KILL_THREAD_NAME " " | ||
1066 | SECCOMP_RET_TRAP_NAME " " | ||
1067 | SECCOMP_RET_ERRNO_NAME " " | ||
1068 | SECCOMP_RET_TRACE_NAME " " | ||
1069 | SECCOMP_RET_LOG_NAME " " | ||
1070 | SECCOMP_RET_ALLOW_NAME; | ||
1071 | |||
1072 | struct seccomp_log_name { | ||
1073 | u32 log; | ||
1074 | const char *name; | ||
1075 | }; | ||
1076 | |||
1077 | static const struct seccomp_log_name seccomp_log_names[] = { | ||
1078 | { SECCOMP_LOG_KILL_PROCESS, SECCOMP_RET_KILL_PROCESS_NAME }, | ||
1079 | { SECCOMP_LOG_KILL_THREAD, SECCOMP_RET_KILL_THREAD_NAME }, | ||
1080 | { SECCOMP_LOG_TRAP, SECCOMP_RET_TRAP_NAME }, | ||
1081 | { SECCOMP_LOG_ERRNO, SECCOMP_RET_ERRNO_NAME }, | ||
1082 | { SECCOMP_LOG_TRACE, SECCOMP_RET_TRACE_NAME }, | ||
1083 | { SECCOMP_LOG_LOG, SECCOMP_RET_LOG_NAME }, | ||
1084 | { SECCOMP_LOG_ALLOW, SECCOMP_RET_ALLOW_NAME }, | ||
1085 | { } | ||
1086 | }; | ||
1087 | |||
1088 | static bool seccomp_names_from_actions_logged(char *names, size_t size, | ||
1089 | u32 actions_logged) | ||
1090 | { | ||
1091 | const struct seccomp_log_name *cur; | ||
1092 | bool append_space = false; | ||
1093 | |||
1094 | for (cur = seccomp_log_names; cur->name && size; cur++) { | ||
1095 | ssize_t ret; | ||
1096 | |||
1097 | if (!(actions_logged & cur->log)) | ||
1098 | continue; | ||
1099 | |||
1100 | if (append_space) { | ||
1101 | ret = strscpy(names, " ", size); | ||
1102 | if (ret < 0) | ||
1103 | return false; | ||
1104 | |||
1105 | names += ret; | ||
1106 | size -= ret; | ||
1107 | } else | ||
1108 | append_space = true; | ||
1109 | |||
1110 | ret = strscpy(names, cur->name, size); | ||
1111 | if (ret < 0) | ||
1112 | return false; | ||
1113 | |||
1114 | names += ret; | ||
1115 | size -= ret; | ||
1116 | } | ||
1117 | |||
1118 | return true; | ||
1119 | } | ||
1120 | |||
1121 | static bool seccomp_action_logged_from_name(u32 *action_logged, | ||
1122 | const char *name) | ||
1123 | { | ||
1124 | const struct seccomp_log_name *cur; | ||
1125 | |||
1126 | for (cur = seccomp_log_names; cur->name; cur++) { | ||
1127 | if (!strcmp(cur->name, name)) { | ||
1128 | *action_logged = cur->log; | ||
1129 | return true; | ||
1130 | } | ||
1131 | } | ||
1132 | |||
1133 | return false; | ||
1134 | } | ||
1135 | |||
1136 | static bool seccomp_actions_logged_from_names(u32 *actions_logged, char *names) | ||
1137 | { | ||
1138 | char *name; | ||
1139 | |||
1140 | *actions_logged = 0; | ||
1141 | while ((name = strsep(&names, " ")) && *name) { | ||
1142 | u32 action_logged = 0; | ||
1143 | |||
1144 | if (!seccomp_action_logged_from_name(&action_logged, name)) | ||
1145 | return false; | ||
1146 | |||
1147 | *actions_logged |= action_logged; | ||
1148 | } | ||
1149 | |||
1150 | return true; | ||
1151 | } | ||
1152 | |||
1153 | static int seccomp_actions_logged_handler(struct ctl_table *ro_table, int write, | ||
1154 | void __user *buffer, size_t *lenp, | ||
1155 | loff_t *ppos) | ||
1156 | { | ||
1157 | char names[sizeof(seccomp_actions_avail)]; | ||
1158 | struct ctl_table table; | ||
1159 | int ret; | ||
1160 | |||
1161 | if (write && !capable(CAP_SYS_ADMIN)) | ||
1162 | return -EPERM; | ||
1163 | |||
1164 | memset(names, 0, sizeof(names)); | ||
1165 | |||
1166 | if (!write) { | ||
1167 | if (!seccomp_names_from_actions_logged(names, sizeof(names), | ||
1168 | seccomp_actions_logged)) | ||
1169 | return -EINVAL; | ||
1170 | } | ||
1171 | |||
1172 | table = *ro_table; | ||
1173 | table.data = names; | ||
1174 | table.maxlen = sizeof(names); | ||
1175 | ret = proc_dostring(&table, write, buffer, lenp, ppos); | ||
1176 | if (ret) | ||
1177 | return ret; | ||
1178 | |||
1179 | if (write) { | ||
1180 | u32 actions_logged; | ||
1181 | |||
1182 | if (!seccomp_actions_logged_from_names(&actions_logged, | ||
1183 | table.data)) | ||
1184 | return -EINVAL; | ||
1185 | |||
1186 | if (actions_logged & SECCOMP_LOG_ALLOW) | ||
1187 | return -EINVAL; | ||
1188 | |||
1189 | seccomp_actions_logged = actions_logged; | ||
1190 | } | ||
1191 | |||
1192 | return 0; | ||
1193 | } | ||
1194 | |||
1195 | static struct ctl_path seccomp_sysctl_path[] = { | ||
1196 | { .procname = "kernel", }, | ||
1197 | { .procname = "seccomp", }, | ||
1198 | { } | ||
1199 | }; | ||
1200 | |||
1201 | static struct ctl_table seccomp_sysctl_table[] = { | ||
1202 | { | ||
1203 | .procname = "actions_avail", | ||
1204 | .data = (void *) &seccomp_actions_avail, | ||
1205 | .maxlen = sizeof(seccomp_actions_avail), | ||
1206 | .mode = 0444, | ||
1207 | .proc_handler = proc_dostring, | ||
1208 | }, | ||
1209 | { | ||
1210 | .procname = "actions_logged", | ||
1211 | .mode = 0644, | ||
1212 | .proc_handler = seccomp_actions_logged_handler, | ||
1213 | }, | ||
1214 | { } | ||
1215 | }; | ||
1216 | |||
1217 | static int __init seccomp_sysctl_init(void) | ||
1218 | { | ||
1219 | struct ctl_table_header *hdr; | ||
1220 | |||
1221 | hdr = register_sysctl_paths(seccomp_sysctl_path, seccomp_sysctl_table); | ||
1222 | if (!hdr) | ||
1223 | pr_warn("seccomp: sysctl registration failed\n"); | ||
1224 | else | ||
1225 | kmemleak_not_leak(hdr); | ||
1226 | |||
1227 | return 0; | ||
1228 | } | ||
1229 | |||
1230 | device_initcall(seccomp_sysctl_init) | ||
1231 | |||
1232 | #endif /* CONFIG_SYSCTL */ | ||
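The new sysctls registered above land under kernel.seccomp; a quick reader for the two files:

#include <stdio.h>

static void show(const char *path)
{
        char line[128];
        FILE *f = fopen(path, "r");

        if (!f) {
                perror(path);
                return;
        }
        if (fgets(line, sizeof(line), f))
                printf("%s: %s", path, line);
        fclose(f);
}

int main(void)
{
        show("/proc/sys/kernel/seccomp/actions_avail");
        show("/proc/sys/kernel/seccomp/actions_logged");
        return 0;
}

Writing a space-separated subset of the action names back to actions_logged (as root) selects which actions are audited; per the handler above, "allow" is rejected.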
diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 1d71c051a951..5043e7433f4b 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c | |||
@@ -344,39 +344,30 @@ EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread); | |||
344 | * by the client, but only by calling this function. | 344 | * by the client, but only by calling this function. |
345 | * This function can only be called on a registered smp_hotplug_thread. | 345 | * This function can only be called on a registered smp_hotplug_thread. |
346 | */ | 346 | */ |
347 | int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread, | 347 | void smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread, |
348 | const struct cpumask *new) | 348 | const struct cpumask *new) |
349 | { | 349 | { |
350 | struct cpumask *old = plug_thread->cpumask; | 350 | struct cpumask *old = plug_thread->cpumask; |
351 | cpumask_var_t tmp; | 351 | static struct cpumask tmp; |
352 | unsigned int cpu; | 352 | unsigned int cpu; |
353 | 353 | ||
354 | if (!alloc_cpumask_var(&tmp, GFP_KERNEL)) | 354 | lockdep_assert_cpus_held(); |
355 | return -ENOMEM; | ||
356 | |||
357 | get_online_cpus(); | ||
358 | mutex_lock(&smpboot_threads_lock); | 355 | mutex_lock(&smpboot_threads_lock); |
359 | 356 | ||
360 | /* Park threads that were exclusively enabled on the old mask. */ | 357 | /* Park threads that were exclusively enabled on the old mask. */ |
361 | cpumask_andnot(tmp, old, new); | 358 | cpumask_andnot(&tmp, old, new); |
362 | for_each_cpu_and(cpu, tmp, cpu_online_mask) | 359 | for_each_cpu_and(cpu, &tmp, cpu_online_mask) |
363 | smpboot_park_thread(plug_thread, cpu); | 360 | smpboot_park_thread(plug_thread, cpu); |
364 | 361 | ||
365 | /* Unpark threads that are exclusively enabled on the new mask. */ | 362 | /* Unpark threads that are exclusively enabled on the new mask. */ |
366 | cpumask_andnot(tmp, new, old); | 363 | cpumask_andnot(&tmp, new, old); |
367 | for_each_cpu_and(cpu, tmp, cpu_online_mask) | 364 | for_each_cpu_and(cpu, &tmp, cpu_online_mask) |
368 | smpboot_unpark_thread(plug_thread, cpu); | 365 | smpboot_unpark_thread(plug_thread, cpu); |
369 | 366 | ||
370 | cpumask_copy(old, new); | 367 | cpumask_copy(old, new); |
371 | 368 | ||
372 | mutex_unlock(&smpboot_threads_lock); | 369 | mutex_unlock(&smpboot_threads_lock); |
373 | put_online_cpus(); | ||
374 | |||
375 | free_cpumask_var(tmp); | ||
376 | |||
377 | return 0; | ||
378 | } | 370 | } |
379 | EXPORT_SYMBOL_GPL(smpboot_update_cpumask_percpu_thread); | ||
380 | 371 | ||
381 | static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD); | 372 | static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD); |
382 | 373 | ||
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6648fbbb8157..d9c31bc2eaea 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -367,7 +367,8 @@ static struct ctl_table kern_table[] = { | |||
367 | .data = &sysctl_sched_time_avg, | 367 | .data = &sysctl_sched_time_avg, |
368 | .maxlen = sizeof(unsigned int), | 368 | .maxlen = sizeof(unsigned int), |
369 | .mode = 0644, | 369 | .mode = 0644, |
370 | .proc_handler = proc_dointvec, | 370 | .proc_handler = proc_dointvec_minmax, |
371 | .extra1 = &one, | ||
371 | }, | 372 | }, |
372 | #ifdef CONFIG_SCHEDSTATS | 373 | #ifdef CONFIG_SCHEDSTATS |
373 | { | 374 | { |
@@ -871,9 +872,9 @@ static struct ctl_table kern_table[] = { | |||
871 | #if defined(CONFIG_LOCKUP_DETECTOR) | 872 | #if defined(CONFIG_LOCKUP_DETECTOR) |
872 | { | 873 | { |
873 | .procname = "watchdog", | 874 | .procname = "watchdog", |
874 | .data = &watchdog_user_enabled, | 875 | .data = &watchdog_user_enabled, |
875 | .maxlen = sizeof (int), | 876 | .maxlen = sizeof(int), |
876 | .mode = 0644, | 877 | .mode = 0644, |
877 | .proc_handler = proc_watchdog, | 878 | .proc_handler = proc_watchdog, |
878 | .extra1 = &zero, | 879 | .extra1 = &zero, |
879 | .extra2 = &one, | 880 | .extra2 = &one, |
@@ -889,16 +890,12 @@ static struct ctl_table kern_table[] = { | |||
889 | }, | 890 | }, |
890 | { | 891 | { |
891 | .procname = "nmi_watchdog", | 892 | .procname = "nmi_watchdog", |
892 | .data = &nmi_watchdog_enabled, | 893 | .data = &nmi_watchdog_user_enabled, |
893 | .maxlen = sizeof (int), | 894 | .maxlen = sizeof(int), |
894 | .mode = 0644, | 895 | .mode = NMI_WATCHDOG_SYSCTL_PERM, |
895 | .proc_handler = proc_nmi_watchdog, | 896 | .proc_handler = proc_nmi_watchdog, |
896 | .extra1 = &zero, | 897 | .extra1 = &zero, |
897 | #if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR) | ||
898 | .extra2 = &one, | 898 | .extra2 = &one, |
899 | #else | ||
900 | .extra2 = &zero, | ||
901 | #endif | ||
902 | }, | 899 | }, |
903 | { | 900 | { |
904 | .procname = "watchdog_cpumask", | 901 | .procname = "watchdog_cpumask", |
@@ -910,9 +907,9 @@ static struct ctl_table kern_table[] = { | |||
910 | #ifdef CONFIG_SOFTLOCKUP_DETECTOR | 907 | #ifdef CONFIG_SOFTLOCKUP_DETECTOR |
911 | { | 908 | { |
912 | .procname = "soft_watchdog", | 909 | .procname = "soft_watchdog", |
913 | .data = &soft_watchdog_enabled, | 910 | .data = &soft_watchdog_user_enabled, |
914 | .maxlen = sizeof (int), | 911 | .maxlen = sizeof(int), |
915 | .mode = 0644, | 912 | .mode = 0644, |
916 | .proc_handler = proc_soft_watchdog, | 913 | .proc_handler = proc_soft_watchdog, |
917 | .extra1 = &zero, | 914 | .extra1 = &zero, |
918 | .extra2 = &one, | 915 | .extra2 = &one, |
@@ -2187,8 +2184,6 @@ static int do_proc_douintvec_conv(unsigned long *lvalp, | |||
2187 | if (write) { | 2184 | if (write) { |
2188 | if (*lvalp > UINT_MAX) | 2185 | if (*lvalp > UINT_MAX) |
2189 | return -EINVAL; | 2186 | return -EINVAL; |
2190 | if (*lvalp > UINT_MAX) | ||
2191 | return -EINVAL; | ||
2192 | *valp = *lvalp; | 2187 | *valp = *lvalp; |
2193 | } else { | 2188 | } else { |
2194 | unsigned int val = *valp; | 2189 | unsigned int val = *valp; |
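The nmi_watchdog entry now takes its mode from NMI_WATCHDOG_SYSCTL_PERM, so on kernels without a usable hard-lockup detector the file is expected to become read-only rather than accepting writes that cannot take effect (that read-only behaviour is an assumption based on this series; the macro itself is not shown in the hunk). A minimal reader:

#include <stdio.h>

int main(void)
{
        char v[16];
        FILE *f = fopen("/proc/sys/kernel/nmi_watchdog", "r");

        if (!f) {
                perror("/proc/sys/kernel/nmi_watchdog");
                return 1;
        }
        if (fgets(v, sizeof(v), f))
                printf("nmi_watchdog = %s", v);
        fclose(f);
        return 0;
}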
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 2a685b45b73b..45a3928544ce 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c | |||
@@ -648,6 +648,12 @@ int blk_trace_startstop(struct request_queue *q, int start) | |||
648 | } | 648 | } |
649 | EXPORT_SYMBOL_GPL(blk_trace_startstop); | 649 | EXPORT_SYMBOL_GPL(blk_trace_startstop); |
650 | 650 | ||
651 | /* | ||
652 | * When reading or writing the blktrace sysfs files, the references to the | ||
653 | * opened sysfs or device files should prevent the underlying block device | ||
654 | * from being removed. So no further delete protection is really needed. | ||
655 | */ | ||
656 | |||
651 | /** | 657 | /** |
652 | * blk_trace_ioctl: - handle the ioctls associated with tracing | 658 | * blk_trace_ioctl: - handle the ioctls associated with tracing |
653 | * @bdev: the block device | 659 | * @bdev: the block device |
@@ -665,7 +671,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) | |||
665 | if (!q) | 671 | if (!q) |
666 | return -ENXIO; | 672 | return -ENXIO; |
667 | 673 | ||
668 | mutex_lock(&bdev->bd_mutex); | 674 | mutex_lock(&q->blk_trace_mutex); |
669 | 675 | ||
670 | switch (cmd) { | 676 | switch (cmd) { |
671 | case BLKTRACESETUP: | 677 | case BLKTRACESETUP: |
@@ -691,7 +697,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) | |||
691 | break; | 697 | break; |
692 | } | 698 | } |
693 | 699 | ||
694 | mutex_unlock(&bdev->bd_mutex); | 700 | mutex_unlock(&q->blk_trace_mutex); |
695 | return ret; | 701 | return ret; |
696 | } | 702 | } |
697 | 703 | ||
@@ -1727,7 +1733,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, | |||
1727 | if (q == NULL) | 1733 | if (q == NULL) |
1728 | goto out_bdput; | 1734 | goto out_bdput; |
1729 | 1735 | ||
1730 | mutex_lock(&bdev->bd_mutex); | 1736 | mutex_lock(&q->blk_trace_mutex); |
1731 | 1737 | ||
1732 | if (attr == &dev_attr_enable) { | 1738 | if (attr == &dev_attr_enable) { |
1733 | ret = sprintf(buf, "%u\n", !!q->blk_trace); | 1739 | ret = sprintf(buf, "%u\n", !!q->blk_trace); |
@@ -1746,7 +1752,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, | |||
1746 | ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba); | 1752 | ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba); |
1747 | 1753 | ||
1748 | out_unlock_bdev: | 1754 | out_unlock_bdev: |
1749 | mutex_unlock(&bdev->bd_mutex); | 1755 | mutex_unlock(&q->blk_trace_mutex); |
1750 | out_bdput: | 1756 | out_bdput: |
1751 | bdput(bdev); | 1757 | bdput(bdev); |
1752 | out: | 1758 | out: |
@@ -1788,7 +1794,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, | |||
1788 | if (q == NULL) | 1794 | if (q == NULL) |
1789 | goto out_bdput; | 1795 | goto out_bdput; |
1790 | 1796 | ||
1791 | mutex_lock(&bdev->bd_mutex); | 1797 | mutex_lock(&q->blk_trace_mutex); |
1792 | 1798 | ||
1793 | if (attr == &dev_attr_enable) { | 1799 | if (attr == &dev_attr_enable) { |
1794 | if (value) | 1800 | if (value) |
@@ -1814,7 +1820,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, | |||
1814 | } | 1820 | } |
1815 | 1821 | ||
1816 | out_unlock_bdev: | 1822 | out_unlock_bdev: |
1817 | mutex_unlock(&bdev->bd_mutex); | 1823 | mutex_unlock(&q->blk_trace_mutex); |
1818 | out_bdput: | 1824 | out_bdput: |
1819 | bdput(bdev); | 1825 | bdput(bdev); |
1820 | out: | 1826 | out: |
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6abfafd7f173..8319e09e15b9 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
@@ -4954,9 +4954,6 @@ static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; | |||
4954 | static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata; | 4954 | static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata; |
4955 | static int ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer); | 4955 | static int ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer); |
4956 | 4956 | ||
4957 | static unsigned long save_global_trampoline; | ||
4958 | static unsigned long save_global_flags; | ||
4959 | |||
4960 | static int __init set_graph_function(char *str) | 4957 | static int __init set_graph_function(char *str) |
4961 | { | 4958 | { |
4962 | strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); | 4959 | strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); |
@@ -6808,17 +6805,6 @@ void unregister_ftrace_graph(void) | |||
6808 | unregister_pm_notifier(&ftrace_suspend_notifier); | 6805 | unregister_pm_notifier(&ftrace_suspend_notifier); |
6809 | unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); | 6806 | unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); |
6810 | 6807 | ||
6811 | #ifdef CONFIG_DYNAMIC_FTRACE | ||
6812 | /* | ||
6813 | * Function graph does not allocate the trampoline, but | ||
6814 | * other global_ops do. We need to reset the ALLOC_TRAMP flag | ||
6815 | * if one was used. | ||
6816 | */ | ||
6817 | global_ops.trampoline = save_global_trampoline; | ||
6818 | if (save_global_flags & FTRACE_OPS_FL_ALLOC_TRAMP) | ||
6819 | global_ops.flags |= FTRACE_OPS_FL_ALLOC_TRAMP; | ||
6820 | #endif | ||
6821 | |||
6822 | out: | 6808 | out: |
6823 | mutex_unlock(&ftrace_lock); | 6809 | mutex_unlock(&ftrace_lock); |
6824 | } | 6810 | } |
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5360b7aec57a..752e5daf0896 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
@@ -4020,11 +4020,17 @@ static int tracing_open(struct inode *inode, struct file *file) | |||
4020 | /* If this file was open for write, then erase contents */ | 4020 | /* If this file was open for write, then erase contents */ |
4021 | if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { | 4021 | if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { |
4022 | int cpu = tracing_get_cpu(inode); | 4022 | int cpu = tracing_get_cpu(inode); |
4023 | struct trace_buffer *trace_buf = &tr->trace_buffer; | ||
4024 | |||
4025 | #ifdef CONFIG_TRACER_MAX_TRACE | ||
4026 | if (tr->current_trace->print_max) | ||
4027 | trace_buf = &tr->max_buffer; | ||
4028 | #endif | ||
4023 | 4029 | ||
4024 | if (cpu == RING_BUFFER_ALL_CPUS) | 4030 | if (cpu == RING_BUFFER_ALL_CPUS) |
4025 | tracing_reset_online_cpus(&tr->trace_buffer); | 4031 | tracing_reset_online_cpus(trace_buf); |
4026 | else | 4032 | else |
4027 | tracing_reset(&tr->trace_buffer, cpu); | 4033 | tracing_reset(trace_buf, cpu); |
4028 | } | 4034 | } |
4029 | 4035 | ||
4030 | if (file->f_mode & FMODE_READ) { | 4036 | if (file->f_mode & FMODE_READ) { |
@@ -5358,6 +5364,13 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf) | |||
5358 | if (t == tr->current_trace) | 5364 | if (t == tr->current_trace) |
5359 | goto out; | 5365 | goto out; |
5360 | 5366 | ||
5367 | /* Some tracers won't work on kernel command line */ | ||
5368 | if (system_state < SYSTEM_RUNNING && t->noboot) { | ||
5369 | pr_warn("Tracer '%s' is not allowed on command line, ignored\n", | ||
5370 | t->name); | ||
5371 | goto out; | ||
5372 | } | ||
5373 | |||
5361 | /* Some tracers are only allowed for the top level buffer */ | 5374 | /* Some tracers are only allowed for the top level buffer */ |
5362 | if (!trace_ok_for_array(t, tr)) { | 5375 | if (!trace_ok_for_array(t, tr)) { |
5363 | ret = -EINVAL; | 5376 | ret = -EINVAL; |
@@ -5667,7 +5680,7 @@ static int tracing_wait_pipe(struct file *filp) | |||
5667 | * | 5680 | * |
5668 | * iter->pos will be 0 if we haven't read anything. | 5681 | * iter->pos will be 0 if we haven't read anything. |
5669 | */ | 5682 | */ |
5670 | if (!tracing_is_on() && iter->pos) | 5683 | if (!tracer_tracing_is_on(iter->tr) && iter->pos) |
5671 | break; | 5684 | break; |
5672 | 5685 | ||
5673 | mutex_unlock(&iter->mutex); | 5686 | mutex_unlock(&iter->mutex); |
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index fb5d54d0d1b3..652c682707cd 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
@@ -444,6 +444,8 @@ struct tracer { | |||
444 | #ifdef CONFIG_TRACER_MAX_TRACE | 444 | #ifdef CONFIG_TRACER_MAX_TRACE |
445 | bool use_max_tr; | 445 | bool use_max_tr; |
446 | #endif | 446 | #endif |
447 | /* True if tracer cannot be enabled in kernel param */ | ||
448 | bool noboot; | ||
447 | }; | 449 | }; |
448 | 450 | ||
449 | 451 | ||
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index cd7480d0a201..dca78fc48439 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c | |||
@@ -282,6 +282,7 @@ static struct tracer mmio_tracer __read_mostly = | |||
282 | .close = mmio_close, | 282 | .close = mmio_close, |
283 | .read = mmio_read, | 283 | .read = mmio_read, |
284 | .print_line = mmio_print_line, | 284 | .print_line = mmio_print_line, |
285 | .noboot = true, | ||
285 | }; | 286 | }; |
286 | 287 | ||
287 | __init static int init_mmio_trace(void) | 288 | __init static int init_mmio_trace(void) |
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index bac629af2285..c738e764e2a5 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c | |||
@@ -656,15 +656,6 @@ int trace_print_lat_context(struct trace_iterator *iter) | |||
656 | return !trace_seq_has_overflowed(s); | 656 | return !trace_seq_has_overflowed(s); |
657 | } | 657 | } |
658 | 658 | ||
659 | static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; | ||
660 | |||
661 | static int task_state_char(unsigned long state) | ||
662 | { | ||
663 | int bit = state ? __ffs(state) + 1 : 0; | ||
664 | |||
665 | return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?'; | ||
666 | } | ||
667 | |||
668 | /** | 659 | /** |
669 | * ftrace_find_event - find a registered event | 660 | * ftrace_find_event - find a registered event |
670 | * @type: the type of event to look for | 661 | * @type: the type of event to look for |
@@ -930,8 +921,8 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter, | |||
930 | 921 | ||
931 | trace_assign_type(field, iter->ent); | 922 | trace_assign_type(field, iter->ent); |
932 | 923 | ||
933 | T = task_state_char(field->next_state); | 924 | T = __task_state_to_char(field->next_state); |
934 | S = task_state_char(field->prev_state); | 925 | S = __task_state_to_char(field->prev_state); |
935 | trace_find_cmdline(field->next_pid, comm); | 926 | trace_find_cmdline(field->next_pid, comm); |
936 | trace_seq_printf(&iter->seq, | 927 | trace_seq_printf(&iter->seq, |
937 | " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", | 928 | " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", |
@@ -966,8 +957,8 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S) | |||
966 | trace_assign_type(field, iter->ent); | 957 | trace_assign_type(field, iter->ent); |
967 | 958 | ||
968 | if (!S) | 959 | if (!S) |
969 | S = task_state_char(field->prev_state); | 960 | S = __task_state_to_char(field->prev_state); |
970 | T = task_state_char(field->next_state); | 961 | T = __task_state_to_char(field->next_state); |
971 | trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n", | 962 | trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n", |
972 | field->prev_pid, | 963 | field->prev_pid, |
973 | field->prev_prio, | 964 | field->prev_prio, |
@@ -1002,8 +993,8 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S) | |||
1002 | trace_assign_type(field, iter->ent); | 993 | trace_assign_type(field, iter->ent); |
1003 | 994 | ||
1004 | if (!S) | 995 | if (!S) |
1005 | S = task_state_char(field->prev_state); | 996 | S = __task_state_to_char(field->prev_state); |
1006 | T = task_state_char(field->next_state); | 997 | T = __task_state_to_char(field->next_state); |
1007 | 998 | ||
1008 | SEQ_PUT_HEX_FIELD(s, field->prev_pid); | 999 | SEQ_PUT_HEX_FIELD(s, field->prev_pid); |
1009 | SEQ_PUT_HEX_FIELD(s, field->prev_prio); | 1000 | SEQ_PUT_HEX_FIELD(s, field->prev_prio); |
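The trace_output.c hunks above drop the file-local state_to_char[]/task_state_char() helper in favour of the common __task_state_to_char(), and the trace_sched_wakeup.c hunks below record __get_task_state() instead of the raw ->state word, so both sides use one canonical mapping. The removed helper's mapping (lowest set state bit plus one, indexed into a character table) looks like this as standalone C; the table below is a simplified stand-in for TASK_STATE_TO_CHAR_STR, not the kernel string.

#include <stdio.h>

/* Simplified stand-in for TASK_STATE_TO_CHAR_STR; the kernel string is longer. */
static const char state_to_char[] = "RSDTtXZ";

/* Lowest set bit (0-based), like the kernel's __ffs(); caller ensures v != 0. */
static int lowest_bit(unsigned long v)
{
        int bit = 0;

        while (!(v & 1UL)) {
                v >>= 1;
                bit++;
        }
        return bit;
}

/* Same mapping as the removed task_state_char(): 0 means running ('R'),
 * otherwise the lowest set state bit, offset by one, picks the letter. */
static char task_state_char(unsigned long state)
{
        int bit = state ? lowest_bit(state) + 1 : 0;

        return bit < (int)sizeof(state_to_char) - 1 ? state_to_char[bit] : '?';
}

int main(void)
{
        printf("%c %c %c\n",
               task_state_char(0),              /* running             -> 'R' */
               task_state_char(1UL << 0),       /* interruptible sleep -> 'S' */
               task_state_char(1UL << 6));      /* beyond the table    -> '?' */
        return 0;
}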
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index ddec53b67646..0c331978b1a6 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c | |||
@@ -397,10 +397,10 @@ tracing_sched_switch_trace(struct trace_array *tr, | |||
397 | entry = ring_buffer_event_data(event); | 397 | entry = ring_buffer_event_data(event); |
398 | entry->prev_pid = prev->pid; | 398 | entry->prev_pid = prev->pid; |
399 | entry->prev_prio = prev->prio; | 399 | entry->prev_prio = prev->prio; |
400 | entry->prev_state = prev->state; | 400 | entry->prev_state = __get_task_state(prev); |
401 | entry->next_pid = next->pid; | 401 | entry->next_pid = next->pid; |
402 | entry->next_prio = next->prio; | 402 | entry->next_prio = next->prio; |
403 | entry->next_state = next->state; | 403 | entry->next_state = __get_task_state(next); |
404 | entry->next_cpu = task_cpu(next); | 404 | entry->next_cpu = task_cpu(next); |
405 | 405 | ||
406 | if (!call_filter_check_discard(call, entry, buffer, event)) | 406 | if (!call_filter_check_discard(call, entry, buffer, event)) |
@@ -425,10 +425,10 @@ tracing_sched_wakeup_trace(struct trace_array *tr, | |||
425 | entry = ring_buffer_event_data(event); | 425 | entry = ring_buffer_event_data(event); |
426 | entry->prev_pid = curr->pid; | 426 | entry->prev_pid = curr->pid; |
427 | entry->prev_prio = curr->prio; | 427 | entry->prev_prio = curr->prio; |
428 | entry->prev_state = curr->state; | 428 | entry->prev_state = __get_task_state(curr); |
429 | entry->next_pid = wakee->pid; | 429 | entry->next_pid = wakee->pid; |
430 | entry->next_prio = wakee->prio; | 430 | entry->next_prio = wakee->prio; |
431 | entry->next_state = wakee->state; | 431 | entry->next_state = __get_task_state(wakee); |
432 | entry->next_cpu = task_cpu(wakee); | 432 | entry->next_cpu = task_cpu(wakee); |
433 | 433 | ||
434 | if (!call_filter_check_discard(call, entry, buffer, event)) | 434 | if (!call_filter_check_discard(call, entry, buffer, event)) |
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index a4df67cbc711..49cb41412eec 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c | |||
@@ -96,23 +96,9 @@ check_stack(unsigned long ip, unsigned long *stack) | |||
96 | if (in_nmi()) | 96 | if (in_nmi()) |
97 | return; | 97 | return; |
98 | 98 | ||
99 | /* | ||
100 | * There's a slight chance that we are tracing inside the | ||
101 | * RCU infrastructure, and rcu_irq_enter() will not work | ||
102 | * as expected. | ||
103 | */ | ||
104 | if (unlikely(rcu_irq_enter_disabled())) | ||
105 | return; | ||
106 | |||
107 | local_irq_save(flags); | 99 | local_irq_save(flags); |
108 | arch_spin_lock(&stack_trace_max_lock); | 100 | arch_spin_lock(&stack_trace_max_lock); |
109 | 101 | ||
110 | /* | ||
111 | * RCU may not be watching, make it see us. | ||
112 | * The stack trace code uses rcu_sched. | ||
113 | */ | ||
114 | rcu_irq_enter(); | ||
115 | |||
116 | /* In case another CPU set the tracer_frame on us */ | 102 | /* In case another CPU set the tracer_frame on us */ |
117 | if (unlikely(!frame_size)) | 103 | if (unlikely(!frame_size)) |
118 | this_size -= tracer_frame; | 104 | this_size -= tracer_frame; |
@@ -205,7 +191,6 @@ check_stack(unsigned long ip, unsigned long *stack) | |||
205 | } | 191 | } |
206 | 192 | ||
207 | out: | 193 | out: |
208 | rcu_irq_exit(); | ||
209 | arch_spin_unlock(&stack_trace_max_lock); | 194 | arch_spin_unlock(&stack_trace_max_lock); |
210 | local_irq_restore(flags); | 195 | local_irq_restore(flags); |
211 | } | 196 | } |
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index f5d52024f6b7..6bcb854909c0 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
@@ -29,20 +29,29 @@ | |||
29 | #include <linux/kvm_para.h> | 29 | #include <linux/kvm_para.h> |
30 | #include <linux/kthread.h> | 30 | #include <linux/kthread.h> |
31 | 31 | ||
32 | /* Watchdog configuration */ | 32 | static DEFINE_MUTEX(watchdog_mutex); |
33 | static DEFINE_MUTEX(watchdog_proc_mutex); | ||
34 | |||
35 | int __read_mostly nmi_watchdog_enabled; | ||
36 | 33 | ||
37 | #if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG) | 34 | #if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG) |
38 | unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED | | 35 | # define WATCHDOG_DEFAULT (SOFT_WATCHDOG_ENABLED | NMI_WATCHDOG_ENABLED) |
39 | NMI_WATCHDOG_ENABLED; | 36 | # define NMI_WATCHDOG_DEFAULT 1 |
40 | #else | 37 | #else |
41 | unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED; | 38 | # define WATCHDOG_DEFAULT (SOFT_WATCHDOG_ENABLED) |
39 | # define NMI_WATCHDOG_DEFAULT 0 | ||
42 | #endif | 40 | #endif |
43 | 41 | ||
42 | unsigned long __read_mostly watchdog_enabled; | ||
43 | int __read_mostly watchdog_user_enabled = 1; | ||
44 | int __read_mostly nmi_watchdog_user_enabled = NMI_WATCHDOG_DEFAULT; | ||
45 | int __read_mostly soft_watchdog_user_enabled = 1; | ||
46 | int __read_mostly watchdog_thresh = 10; | ||
47 | int __read_mostly nmi_watchdog_available; | ||
48 | |||
49 | struct cpumask watchdog_allowed_mask __read_mostly; | ||
50 | |||
51 | struct cpumask watchdog_cpumask __read_mostly; | ||
52 | unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); | ||
53 | |||
44 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | 54 | #ifdef CONFIG_HARDLOCKUP_DETECTOR |
45 | /* boot commands */ | ||
46 | /* | 55 | /* |
47 | * Should we panic when a soft-lockup or hard-lockup occurs: | 56 | * Should we panic when a soft-lockup or hard-lockup occurs: |
48 | */ | 57 | */ |
@@ -56,9 +65,9 @@ unsigned int __read_mostly hardlockup_panic = | |||
56 | * kernel command line parameters are parsed, because otherwise it is not | 65 | * kernel command line parameters are parsed, because otherwise it is not |
57 | * possible to override this in hardlockup_panic_setup(). | 66 | * possible to override this in hardlockup_panic_setup(). |
58 | */ | 67 | */ |
59 | void hardlockup_detector_disable(void) | 68 | void __init hardlockup_detector_disable(void) |
60 | { | 69 | { |
61 | watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; | 70 | nmi_watchdog_user_enabled = 0; |
62 | } | 71 | } |
63 | 72 | ||
64 | static int __init hardlockup_panic_setup(char *str) | 73 | static int __init hardlockup_panic_setup(char *str) |
@@ -68,48 +77,24 @@ static int __init hardlockup_panic_setup(char *str) | |||
68 | else if (!strncmp(str, "nopanic", 7)) | 77 | else if (!strncmp(str, "nopanic", 7)) |
69 | hardlockup_panic = 0; | 78 | hardlockup_panic = 0; |
70 | else if (!strncmp(str, "0", 1)) | 79 | else if (!strncmp(str, "0", 1)) |
71 | watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; | 80 | nmi_watchdog_user_enabled = 0; |
72 | else if (!strncmp(str, "1", 1)) | 81 | else if (!strncmp(str, "1", 1)) |
73 | watchdog_enabled |= NMI_WATCHDOG_ENABLED; | 82 | nmi_watchdog_user_enabled = 1; |
74 | return 1; | 83 | return 1; |
75 | } | 84 | } |
76 | __setup("nmi_watchdog=", hardlockup_panic_setup); | 85 | __setup("nmi_watchdog=", hardlockup_panic_setup); |
77 | 86 | ||
78 | #endif | 87 | # ifdef CONFIG_SMP |
79 | |||
80 | #ifdef CONFIG_SOFTLOCKUP_DETECTOR | ||
81 | int __read_mostly soft_watchdog_enabled; | ||
82 | #endif | ||
83 | |||
84 | int __read_mostly watchdog_user_enabled; | ||
85 | int __read_mostly watchdog_thresh = 10; | ||
86 | |||
87 | #ifdef CONFIG_SMP | ||
88 | int __read_mostly sysctl_softlockup_all_cpu_backtrace; | ||
89 | int __read_mostly sysctl_hardlockup_all_cpu_backtrace; | 88 | int __read_mostly sysctl_hardlockup_all_cpu_backtrace; |
90 | #endif | ||
91 | struct cpumask watchdog_cpumask __read_mostly; | ||
92 | unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); | ||
93 | 89 | ||
94 | /* | 90 | static int __init hardlockup_all_cpu_backtrace_setup(char *str) |
95 | * The 'watchdog_running' variable is set to 1 when the watchdog threads | 91 | { |
96 | * are registered/started and is set to 0 when the watchdog threads are | 92 | sysctl_hardlockup_all_cpu_backtrace = !!simple_strtol(str, NULL, 0); |
97 | * unregistered/stopped, so it is an indicator whether the threads exist. | 93 | return 1; |
98 | */ | 94 | } |
99 | static int __read_mostly watchdog_running; | 95 | __setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup); |
100 | /* | 96 | # endif /* CONFIG_SMP */ |
101 | * If a subsystem has a need to deactivate the watchdog temporarily, it | 97 | #endif /* CONFIG_HARDLOCKUP_DETECTOR */ |
102 | * can use the suspend/resume interface to achieve this. The content of | ||
103 | * the 'watchdog_suspended' variable reflects this state. Existing threads | ||
104 | * are parked/unparked by the lockup_detector_{suspend|resume} functions | ||
105 | * (see comment blocks pertaining to those functions for further details). | ||
106 | * | ||
107 | * 'watchdog_suspended' also prevents threads from being registered/started | ||
108 | * or unregistered/stopped via parameters in /proc/sys/kernel, so the state | ||
109 | * of 'watchdog_running' cannot change while the watchdog is deactivated | ||
110 | * temporarily (see related code in 'proc' handlers). | ||
111 | */ | ||
112 | int __read_mostly watchdog_suspended; | ||
113 | 98 | ||
114 | /* | 99 | /* |
115 | * These functions can be overridden if an architecture implements its | 100 | * These functions can be overridden if an architecture implements its |
@@ -121,36 +106,68 @@ int __read_mostly watchdog_suspended; | |||
121 | */ | 106 | */ |
122 | int __weak watchdog_nmi_enable(unsigned int cpu) | 107 | int __weak watchdog_nmi_enable(unsigned int cpu) |
123 | { | 108 | { |
109 | hardlockup_detector_perf_enable(); | ||
124 | return 0; | 110 | return 0; |
125 | } | 111 | } |
112 | |||
126 | void __weak watchdog_nmi_disable(unsigned int cpu) | 113 | void __weak watchdog_nmi_disable(unsigned int cpu) |
127 | { | 114 | { |
115 | hardlockup_detector_perf_disable(); | ||
128 | } | 116 | } |
129 | 117 | ||
130 | /* | 118 | /* Return 0 if an NMI watchdog is available, error code otherwise */ |
131 | * watchdog_nmi_reconfigure can be implemented to be notified after any | 119 | int __weak __init watchdog_nmi_probe(void) |
132 | * watchdog configuration change. The arch hardlockup watchdog should | 120 | { |
133 | * respond to the following variables: | 121 | return hardlockup_detector_perf_init(); |
134 | * - nmi_watchdog_enabled | 122 | } |
123 | |||
124 | /** | ||
125 | * watchdog_nmi_stop - Stop the watchdog for reconfiguration | ||
126 | * | ||
127 | * The reconfiguration steps are: | ||
128 | * watchdog_nmi_stop(); | ||
129 | * update_variables(); | ||
130 | * watchdog_nmi_start(); | ||
131 | */ | ||
132 | void __weak watchdog_nmi_stop(void) { } | ||
133 | |||
134 | /** | ||
135 | * watchdog_nmi_start - Start the watchdog after reconfiguration | ||
136 | * | ||
137 | * Counterpart to watchdog_nmi_stop(). | ||
138 | * | ||
139 | * The following variables have been updated in update_variables() and | ||
140 | * contain the currently valid configuration: | ||
141 | * - watchdog_enabled | ||
135 | * - watchdog_thresh | 142 | * - watchdog_thresh |
136 | * - watchdog_cpumask | 143 | * - watchdog_cpumask |
137 | * - sysctl_hardlockup_all_cpu_backtrace | ||
138 | * - hardlockup_panic | ||
139 | * - watchdog_suspended | ||
140 | */ | 144 | */ |
141 | void __weak watchdog_nmi_reconfigure(void) | 145 | void __weak watchdog_nmi_start(void) { } |
146 | |||
147 | /** | ||
148 | * lockup_detector_update_enable - Update the sysctl enable bit | ||
149 | * | ||
150 | * Caller needs to make sure that the NMI/perf watchdogs are off, so this | ||
151 | * can't race with watchdog_nmi_disable(). | ||
152 | */ | ||
153 | static void lockup_detector_update_enable(void) | ||
142 | { | 154 | { |
155 | watchdog_enabled = 0; | ||
156 | if (!watchdog_user_enabled) | ||
157 | return; | ||
158 | if (nmi_watchdog_available && nmi_watchdog_user_enabled) | ||
159 | watchdog_enabled |= NMI_WATCHDOG_ENABLED; | ||
160 | if (soft_watchdog_user_enabled) | ||
161 | watchdog_enabled |= SOFT_WATCHDOG_ENABLED; | ||
143 | } | 162 | } |
144 | 163 | ||
145 | |||
146 | #ifdef CONFIG_SOFTLOCKUP_DETECTOR | 164 | #ifdef CONFIG_SOFTLOCKUP_DETECTOR |
147 | 165 | ||
148 | /* Helper for online, unparked cpus. */ | 166 | /* Global variables, exported for sysctl */ |
149 | #define for_each_watchdog_cpu(cpu) \ | 167 | unsigned int __read_mostly softlockup_panic = |
150 | for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask) | 168 | CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; |
151 | |||
152 | atomic_t watchdog_park_in_progress = ATOMIC_INIT(0); | ||
153 | 169 | ||
170 | static bool softlockup_threads_initialized __read_mostly; | ||
154 | static u64 __read_mostly sample_period; | 171 | static u64 __read_mostly sample_period; |
155 | 172 | ||
156 | static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); | 173 | static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); |
@@ -164,50 +181,40 @@ static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved); | |||
164 | static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); | 181 | static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); |
165 | static unsigned long soft_lockup_nmi_warn; | 182 | static unsigned long soft_lockup_nmi_warn; |
166 | 183 | ||
167 | unsigned int __read_mostly softlockup_panic = | ||
168 | CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; | ||
169 | |||
170 | static int __init softlockup_panic_setup(char *str) | 184 | static int __init softlockup_panic_setup(char *str) |
171 | { | 185 | { |
172 | softlockup_panic = simple_strtoul(str, NULL, 0); | 186 | softlockup_panic = simple_strtoul(str, NULL, 0); |
173 | |||
174 | return 1; | 187 | return 1; |
175 | } | 188 | } |
176 | __setup("softlockup_panic=", softlockup_panic_setup); | 189 | __setup("softlockup_panic=", softlockup_panic_setup); |
177 | 190 | ||
178 | static int __init nowatchdog_setup(char *str) | 191 | static int __init nowatchdog_setup(char *str) |
179 | { | 192 | { |
180 | watchdog_enabled = 0; | 193 | watchdog_user_enabled = 0; |
181 | return 1; | 194 | return 1; |
182 | } | 195 | } |
183 | __setup("nowatchdog", nowatchdog_setup); | 196 | __setup("nowatchdog", nowatchdog_setup); |
184 | 197 | ||
185 | static int __init nosoftlockup_setup(char *str) | 198 | static int __init nosoftlockup_setup(char *str) |
186 | { | 199 | { |
187 | watchdog_enabled &= ~SOFT_WATCHDOG_ENABLED; | 200 | soft_watchdog_user_enabled = 0; |
188 | return 1; | 201 | return 1; |
189 | } | 202 | } |
190 | __setup("nosoftlockup", nosoftlockup_setup); | 203 | __setup("nosoftlockup", nosoftlockup_setup); |
191 | 204 | ||
192 | #ifdef CONFIG_SMP | 205 | #ifdef CONFIG_SMP |
206 | int __read_mostly sysctl_softlockup_all_cpu_backtrace; | ||
207 | |||
193 | static int __init softlockup_all_cpu_backtrace_setup(char *str) | 208 | static int __init softlockup_all_cpu_backtrace_setup(char *str) |
194 | { | 209 | { |
195 | sysctl_softlockup_all_cpu_backtrace = | 210 | sysctl_softlockup_all_cpu_backtrace = !!simple_strtol(str, NULL, 0); |
196 | !!simple_strtol(str, NULL, 0); | ||
197 | return 1; | 211 | return 1; |
198 | } | 212 | } |
199 | __setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup); | 213 | __setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup); |
200 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | ||
201 | static int __init hardlockup_all_cpu_backtrace_setup(char *str) | ||
202 | { | ||
203 | sysctl_hardlockup_all_cpu_backtrace = | ||
204 | !!simple_strtol(str, NULL, 0); | ||
205 | return 1; | ||
206 | } | ||
207 | __setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup); | ||
208 | #endif | ||
209 | #endif | 214 | #endif |
210 | 215 | ||
216 | static void __lockup_detector_cleanup(void); | ||
217 | |||
211 | /* | 218 | /* |
212 | * Hard-lockup warnings should be triggered after just a few seconds. Soft- | 219 | * Hard-lockup warnings should be triggered after just a few seconds. Soft- |
213 | * lockups can have false positives under extreme conditions. So we generally | 220 | * lockups can have false positives under extreme conditions. So we generally |
@@ -278,11 +285,15 @@ void touch_all_softlockup_watchdogs(void) | |||
278 | int cpu; | 285 | int cpu; |
279 | 286 | ||
280 | /* | 287 | /* |
281 | * this is done lockless | 288 | * watchdog_mutex cannot be taken here, as this might be called |
282 | * do we care if a 0 races with a timestamp? | 289 | * from (soft)interrupt context, so the access to |
283 | * all it means is the softlock check starts one cycle later | 290 | * watchdog_allowed_mask might race with a concurrent update. |
291 | * | ||
292 | * The watchdog time stamp can race against a concurrent real | ||
293 | * update as well, the only side effect might be a cycle delay for | ||
294 | * the softlockup check. | ||
284 | */ | 295 | */ |
285 | for_each_watchdog_cpu(cpu) | 296 | for_each_cpu(cpu, &watchdog_allowed_mask) |
286 | per_cpu(watchdog_touch_ts, cpu) = 0; | 297 | per_cpu(watchdog_touch_ts, cpu) = 0; |
287 | wq_watchdog_touch(-1); | 298 | wq_watchdog_touch(-1); |
288 | } | 299 | } |
@@ -322,9 +333,6 @@ static void watchdog_interrupt_count(void) | |||
322 | __this_cpu_inc(hrtimer_interrupts); | 333 | __this_cpu_inc(hrtimer_interrupts); |
323 | } | 334 | } |
324 | 335 | ||
325 | static int watchdog_enable_all_cpus(void); | ||
326 | static void watchdog_disable_all_cpus(void); | ||
327 | |||
328 | /* watchdog kicker functions */ | 336 | /* watchdog kicker functions */ |
329 | static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | 337 | static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) |
330 | { | 338 | { |
@@ -333,7 +341,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | |||
333 | int duration; | 341 | int duration; |
334 | int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace; | 342 | int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace; |
335 | 343 | ||
336 | if (atomic_read(&watchdog_park_in_progress) != 0) | 344 | if (!watchdog_enabled) |
337 | return HRTIMER_NORESTART; | 345 | return HRTIMER_NORESTART; |
338 | 346 | ||
339 | /* kick the hardlockup detector */ | 347 | /* kick the hardlockup detector */ |
@@ -447,32 +455,38 @@ static void watchdog_set_prio(unsigned int policy, unsigned int prio) | |||
447 | 455 | ||
448 | static void watchdog_enable(unsigned int cpu) | 456 | static void watchdog_enable(unsigned int cpu) |
449 | { | 457 | { |
450 | struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer); | 458 | struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer); |
451 | 459 | ||
452 | /* kick off the timer for the hardlockup detector */ | 460 | /* |
461 | * Start the timer first to prevent the NMI watchdog triggering | ||
462 | * before the timer has a chance to fire. | ||
463 | */ | ||
453 | hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | 464 | hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); |
454 | hrtimer->function = watchdog_timer_fn; | 465 | hrtimer->function = watchdog_timer_fn; |
455 | |||
456 | /* Enable the perf event */ | ||
457 | watchdog_nmi_enable(cpu); | ||
458 | |||
459 | /* done here because hrtimer_start can only pin to smp_processor_id() */ | ||
460 | hrtimer_start(hrtimer, ns_to_ktime(sample_period), | 466 | hrtimer_start(hrtimer, ns_to_ktime(sample_period), |
461 | HRTIMER_MODE_REL_PINNED); | 467 | HRTIMER_MODE_REL_PINNED); |
462 | 468 | ||
463 | /* initialize timestamp */ | 469 | /* Initialize timestamp */ |
464 | watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1); | ||
465 | __touch_watchdog(); | 470 | __touch_watchdog(); |
471 | /* Enable the perf event */ | ||
472 | if (watchdog_enabled & NMI_WATCHDOG_ENABLED) | ||
473 | watchdog_nmi_enable(cpu); | ||
474 | |||
475 | watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1); | ||
466 | } | 476 | } |
467 | 477 | ||
468 | static void watchdog_disable(unsigned int cpu) | 478 | static void watchdog_disable(unsigned int cpu) |
469 | { | 479 | { |
470 | struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer); | 480 | struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer); |
471 | 481 | ||
472 | watchdog_set_prio(SCHED_NORMAL, 0); | 482 | watchdog_set_prio(SCHED_NORMAL, 0); |
473 | hrtimer_cancel(hrtimer); | 483 | /* |
474 | /* disable the perf event */ | 484 | * Disable the perf event first. That prevents that a large delay |
485 | * between disabling the timer and disabling the perf event causes | ||
486 | * the perf NMI to detect a false positive. | ||
487 | */ | ||
475 | watchdog_nmi_disable(cpu); | 488 | watchdog_nmi_disable(cpu); |
489 | hrtimer_cancel(hrtimer); | ||
476 | } | 490 | } |
477 | 491 | ||
478 | static void watchdog_cleanup(unsigned int cpu, bool online) | 492 | static void watchdog_cleanup(unsigned int cpu, bool online) |
@@ -499,21 +513,6 @@ static void watchdog(unsigned int cpu) | |||
499 | __this_cpu_write(soft_lockup_hrtimer_cnt, | 513 | __this_cpu_write(soft_lockup_hrtimer_cnt, |
500 | __this_cpu_read(hrtimer_interrupts)); | 514 | __this_cpu_read(hrtimer_interrupts)); |
501 | __touch_watchdog(); | 515 | __touch_watchdog(); |
502 | |||
503 | /* | ||
504 | * watchdog_nmi_enable() clears the NMI_WATCHDOG_ENABLED bit in the | ||
505 | * failure path. Check for failures that can occur asynchronously - | ||
506 | * for example, when CPUs are on-lined - and shut down the hardware | ||
507 | * perf event on each CPU accordingly. | ||
508 | * | ||
509 | * The only non-obvious place this bit can be cleared is through | ||
510 | * watchdog_nmi_enable(), so a pr_info() is placed there. Placing a | ||
511 | * pr_info here would be too noisy as it would result in a message | ||
512 | * every few seconds if the hardlockup was disabled but the softlockup | ||
513 | * enabled. | ||
514 | */ | ||
515 | if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) | ||
516 | watchdog_nmi_disable(cpu); | ||
517 | } | 516 | } |
518 | 517 | ||
519 | static struct smp_hotplug_thread watchdog_threads = { | 518 | static struct smp_hotplug_thread watchdog_threads = { |
@@ -527,295 +526,174 @@ static struct smp_hotplug_thread watchdog_threads = { | |||
527 | .unpark = watchdog_enable, | 526 | .unpark = watchdog_enable, |
528 | }; | 527 | }; |
529 | 528 | ||
530 | /* | 529 | static void softlockup_update_smpboot_threads(void) |
531 | * park all watchdog threads that are specified in 'watchdog_cpumask' | ||
532 | * | ||
533 | * This function returns an error if kthread_park() of a watchdog thread | ||
534 | * fails. In this situation, the watchdog threads of some CPUs can already | ||
535 | * be parked and the watchdog threads of other CPUs can still be runnable. | ||
536 | * Callers are expected to handle this special condition as appropriate in | ||
537 | * their context. | ||
538 | * | ||
539 | * This function may only be called in a context that is protected against | ||
540 | * races with CPU hotplug - for example, via get_online_cpus(). | ||
541 | */ | ||
542 | static int watchdog_park_threads(void) | ||
543 | { | 530 | { |
544 | int cpu, ret = 0; | 531 | lockdep_assert_held(&watchdog_mutex); |
545 | 532 | ||
546 | atomic_set(&watchdog_park_in_progress, 1); | 533 | if (!softlockup_threads_initialized) |
534 | return; | ||
547 | 535 | ||
548 | for_each_watchdog_cpu(cpu) { | 536 | smpboot_update_cpumask_percpu_thread(&watchdog_threads, |
549 | ret = kthread_park(per_cpu(softlockup_watchdog, cpu)); | 537 | &watchdog_allowed_mask); |
550 | if (ret) | ||
551 | break; | ||
552 | } | ||
553 | |||
554 | atomic_set(&watchdog_park_in_progress, 0); | ||
555 | |||
556 | return ret; | ||
557 | } | 538 | } |
558 | 539 | ||
559 | /* | 540 | /* Temporarily park all watchdog threads */ |
560 | * unpark all watchdog threads that are specified in 'watchdog_cpumask' | 541 | static void softlockup_park_all_threads(void) |
561 | * | ||
562 | * This function may only be called in a context that is protected against | ||
563 | * races with CPU hotplug - for example, via get_online_cpus(). | ||
564 | */ | ||
565 | static void watchdog_unpark_threads(void) | ||
566 | { | 542 | { |
567 | int cpu; | 543 | cpumask_clear(&watchdog_allowed_mask); |
568 | 544 | softlockup_update_smpboot_threads(); | |
569 | for_each_watchdog_cpu(cpu) | ||
570 | kthread_unpark(per_cpu(softlockup_watchdog, cpu)); | ||
571 | } | 545 | } |
572 | 546 | ||
573 | static int update_watchdog_all_cpus(void) | 547 | /* Unpark enabled threads */ |
548 | static void softlockup_unpark_threads(void) | ||
574 | { | 549 | { |
575 | int ret; | 550 | cpumask_copy(&watchdog_allowed_mask, &watchdog_cpumask); |
576 | 551 | softlockup_update_smpboot_threads(); | |
577 | ret = watchdog_park_threads(); | ||
578 | if (ret) | ||
579 | return ret; | ||
580 | |||
581 | watchdog_unpark_threads(); | ||
582 | |||
583 | return 0; | ||
584 | } | 552 | } |
585 | 553 | ||
586 | static int watchdog_enable_all_cpus(void) | 554 | static void lockup_detector_reconfigure(void) |
587 | { | 555 | { |
588 | int err = 0; | 556 | cpus_read_lock(); |
589 | 557 | watchdog_nmi_stop(); | |
590 | if (!watchdog_running) { | 558 | softlockup_park_all_threads(); |
591 | err = smpboot_register_percpu_thread_cpumask(&watchdog_threads, | 559 | set_sample_period(); |
592 | &watchdog_cpumask); | 560 | lockup_detector_update_enable(); |
593 | if (err) | 561 | if (watchdog_enabled && watchdog_thresh) |
594 | pr_err("Failed to create watchdog threads, disabled\n"); | 562 | softlockup_unpark_threads(); |
595 | else | 563 | watchdog_nmi_start(); |
596 | watchdog_running = 1; | 564 | cpus_read_unlock(); |
597 | } else { | 565 | /* |
598 | /* | 566 | * Must be called outside the cpus locked section to prevent |
599 | * Enable/disable the lockup detectors or | 567 | * recursive locking in the perf code. |
600 | * change the sample period 'on the fly'. | 568 | */ |
601 | */ | 569 | __lockup_detector_cleanup(); |
602 | err = update_watchdog_all_cpus(); | ||
603 | |||
604 | if (err) { | ||
605 | watchdog_disable_all_cpus(); | ||
606 | pr_err("Failed to update lockup detectors, disabled\n"); | ||
607 | } | ||
608 | } | ||
609 | |||
610 | if (err) | ||
611 | watchdog_enabled = 0; | ||
612 | |||
613 | return err; | ||
614 | } | 570 | } |
615 | 571 | ||
616 | static void watchdog_disable_all_cpus(void) | 572 | /* |
573 | * Create the watchdog thread infrastructure and configure the detector(s). | ||
574 | * | ||
575 | * The threads are not unparked as watchdog_allowed_mask is empty. When | ||
576 | * the threads are successfully initialized, take the proper locks and | ||
577 | * unpark the threads in the watchdog_cpumask if the watchdog is enabled. | ||
578 | */ | ||
579 | static __init void lockup_detector_setup(void) | ||
617 | { | 580 | { |
618 | if (watchdog_running) { | 581 | int ret; |
619 | watchdog_running = 0; | ||
620 | smpboot_unregister_percpu_thread(&watchdog_threads); | ||
621 | } | ||
622 | } | ||
623 | 582 | ||
624 | #ifdef CONFIG_SYSCTL | 583 | /* |
625 | static int watchdog_update_cpus(void) | 584 | * If sysctl is off and watchdog got disabled on the command line, |
626 | { | 585 | * nothing to do here. |
627 | return smpboot_update_cpumask_percpu_thread( | 586 | */ |
628 | &watchdog_threads, &watchdog_cpumask); | 587 | lockup_detector_update_enable(); |
629 | } | ||
630 | #endif | ||
631 | 588 | ||
632 | #else /* SOFTLOCKUP */ | 589 | if (!IS_ENABLED(CONFIG_SYSCTL) && |
633 | static int watchdog_park_threads(void) | 590 | !(watchdog_enabled && watchdog_thresh)) |
634 | { | 591 | return; |
635 | return 0; | ||
636 | } | ||
637 | 592 | ||
638 | static void watchdog_unpark_threads(void) | 593 | ret = smpboot_register_percpu_thread_cpumask(&watchdog_threads, |
639 | { | 594 | &watchdog_allowed_mask); |
640 | } | 595 | if (ret) { |
596 | pr_err("Failed to initialize soft lockup detector threads\n"); | ||
597 | return; | ||
598 | } | ||
641 | 599 | ||
642 | static int watchdog_enable_all_cpus(void) | 600 | mutex_lock(&watchdog_mutex); |
643 | { | 601 | softlockup_threads_initialized = true; |
644 | return 0; | 602 | lockup_detector_reconfigure(); |
603 | mutex_unlock(&watchdog_mutex); | ||
645 | } | 604 | } |
646 | 605 | ||
647 | static void watchdog_disable_all_cpus(void) | 606 | #else /* CONFIG_SOFTLOCKUP_DETECTOR */ |
607 | static inline int watchdog_park_threads(void) { return 0; } | ||
608 | static inline void watchdog_unpark_threads(void) { } | ||
609 | static inline int watchdog_enable_all_cpus(void) { return 0; } | ||
610 | static inline void watchdog_disable_all_cpus(void) { } | ||
611 | static void lockup_detector_reconfigure(void) | ||
648 | { | 612 | { |
613 | cpus_read_lock(); | ||
614 | watchdog_nmi_stop(); | ||
615 | lockup_detector_update_enable(); | ||
616 | watchdog_nmi_start(); | ||
617 | cpus_read_unlock(); | ||
649 | } | 618 | } |
650 | 619 | static inline void lockup_detector_setup(void) | |
651 | #ifdef CONFIG_SYSCTL | ||
652 | static int watchdog_update_cpus(void) | ||
653 | { | 620 | { |
654 | return 0; | 621 | lockup_detector_reconfigure(); |
655 | } | 622 | } |
656 | #endif | 623 | #endif /* !CONFIG_SOFTLOCKUP_DETECTOR */ |
657 | 624 | ||
658 | static void set_sample_period(void) | 625 | static void __lockup_detector_cleanup(void) |
659 | { | 626 | { |
627 | lockdep_assert_held(&watchdog_mutex); | ||
628 | hardlockup_detector_perf_cleanup(); | ||
660 | } | 629 | } |
661 | #endif /* SOFTLOCKUP */ | ||
662 | 630 | ||
663 | /* | 631 | /** |
664 | * Suspend the hard and soft lockup detector by parking the watchdog threads. | 632 | * lockup_detector_cleanup - Cleanup after cpu hotplug or sysctl changes |
633 | * | ||
634 | * Caller must not hold the cpu hotplug rwsem. | ||
665 | */ | 635 | */ |
666 | int lockup_detector_suspend(void) | 636 | void lockup_detector_cleanup(void) |
667 | { | 637 | { |
668 | int ret = 0; | 638 | mutex_lock(&watchdog_mutex); |
669 | 639 | __lockup_detector_cleanup(); | |
670 | get_online_cpus(); | 640 | mutex_unlock(&watchdog_mutex); |
671 | mutex_lock(&watchdog_proc_mutex); | ||
672 | /* | ||
673 | * Multiple suspend requests can be active in parallel (counted by | ||
674 | * the 'watchdog_suspended' variable). If the watchdog threads are | ||
675 | * running, the first caller takes care that they will be parked. | ||
676 | * The state of 'watchdog_running' cannot change while a suspend | ||
677 | * request is active (see related code in 'proc' handlers). | ||
678 | */ | ||
679 | if (watchdog_running && !watchdog_suspended) | ||
680 | ret = watchdog_park_threads(); | ||
681 | |||
682 | if (ret == 0) | ||
683 | watchdog_suspended++; | ||
684 | else { | ||
685 | watchdog_disable_all_cpus(); | ||
686 | pr_err("Failed to suspend lockup detectors, disabled\n"); | ||
687 | watchdog_enabled = 0; | ||
688 | } | ||
689 | |||
690 | watchdog_nmi_reconfigure(); | ||
691 | |||
692 | mutex_unlock(&watchdog_proc_mutex); | ||
693 | |||
694 | return ret; | ||
695 | } | 641 | } |
696 | 642 | ||
697 | /* | 643 | /** |
698 | * Resume the hard and soft lockup detector by unparking the watchdog threads. | 644 | * lockup_detector_soft_poweroff - Interface to stop lockup detector(s) |
645 | * | ||
646 | * Special interface for parisc. It prevents lockup detector warnings from | ||
647 | * the default pm_poweroff() function which busy loops forever. | ||
699 | */ | 648 | */ |
700 | void lockup_detector_resume(void) | 649 | void lockup_detector_soft_poweroff(void) |
701 | { | 650 | { |
702 | mutex_lock(&watchdog_proc_mutex); | 651 | watchdog_enabled = 0; |
703 | |||
704 | watchdog_suspended--; | ||
705 | /* | ||
706 | * The watchdog threads are unparked if they were previously running | ||
707 | * and if there is no more active suspend request. | ||
708 | */ | ||
709 | if (watchdog_running && !watchdog_suspended) | ||
710 | watchdog_unpark_threads(); | ||
711 | |||
712 | watchdog_nmi_reconfigure(); | ||
713 | |||
714 | mutex_unlock(&watchdog_proc_mutex); | ||
715 | put_online_cpus(); | ||
716 | } | 652 | } |
717 | 653 | ||
718 | #ifdef CONFIG_SYSCTL | 654 | #ifdef CONFIG_SYSCTL |
719 | 655 | ||
720 | /* | 656 | /* Propagate any changes to the watchdog threads */ |
721 | * Update the run state of the lockup detectors. | 657 | static void proc_watchdog_update(void) |
722 | */ | ||
723 | static int proc_watchdog_update(void) | ||
724 | { | 658 | { |
725 | int err = 0; | 659 | /* Remove impossible cpus to keep sysctl output clean. */ |
726 | 660 | cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask); | |
727 | /* | 661 | lockup_detector_reconfigure(); |
728 | * Watchdog threads won't be started if they are already active. | ||
729 | * The 'watchdog_running' variable in watchdog_*_all_cpus() takes | ||
730 | * care of this. If those threads are already active, the sample | ||
731 | * period will be updated and the lockup detectors will be enabled | ||
732 | * or disabled 'on the fly'. | ||
733 | */ | ||
734 | if (watchdog_enabled && watchdog_thresh) | ||
735 | err = watchdog_enable_all_cpus(); | ||
736 | else | ||
737 | watchdog_disable_all_cpus(); | ||
738 | |||
739 | watchdog_nmi_reconfigure(); | ||
740 | |||
741 | return err; | ||
742 | |||
743 | } | 662 | } |
744 | 663 | ||
745 | /* | 664 | /* |
746 | * common function for watchdog, nmi_watchdog and soft_watchdog parameter | 665 | * common function for watchdog, nmi_watchdog and soft_watchdog parameter |
747 | * | 666 | * |
748 | * caller | table->data points to | 'which' contains the flag(s) | 667 | * caller | table->data points to | 'which' |
749 | * -------------------|-----------------------|----------------------------- | 668 | * -------------------|----------------------------|-------------------------- |
750 | * proc_watchdog | watchdog_user_enabled | NMI_WATCHDOG_ENABLED or'ed | 669 | * proc_watchdog | watchdog_user_enabled | NMI_WATCHDOG_ENABLED | |
751 | * | | with SOFT_WATCHDOG_ENABLED | 670 | * | | SOFT_WATCHDOG_ENABLED |
752 | * -------------------|-----------------------|----------------------------- | 671 | * -------------------|----------------------------|-------------------------- |
753 | * proc_nmi_watchdog | nmi_watchdog_enabled | NMI_WATCHDOG_ENABLED | 672 | * proc_nmi_watchdog | nmi_watchdog_user_enabled | NMI_WATCHDOG_ENABLED |
754 | * -------------------|-----------------------|----------------------------- | 673 | * -------------------|----------------------------|-------------------------- |
755 | * proc_soft_watchdog | soft_watchdog_enabled | SOFT_WATCHDOG_ENABLED | 674 | * proc_soft_watchdog | soft_watchdog_user_enabled | SOFT_WATCHDOG_ENABLED |
756 | */ | 675 | */ |
757 | static int proc_watchdog_common(int which, struct ctl_table *table, int write, | 676 | static int proc_watchdog_common(int which, struct ctl_table *table, int write, |
758 | void __user *buffer, size_t *lenp, loff_t *ppos) | 677 | void __user *buffer, size_t *lenp, loff_t *ppos) |
759 | { | 678 | { |
760 | int err, old, new; | 679 | int err, old, *param = table->data; |
761 | int *watchdog_param = (int *)table->data; | ||
762 | 680 | ||
763 | get_online_cpus(); | 681 | mutex_lock(&watchdog_mutex); |
764 | mutex_lock(&watchdog_proc_mutex); | ||
765 | 682 | ||
766 | if (watchdog_suspended) { | ||
767 | /* no parameter changes allowed while watchdog is suspended */ | ||
768 | err = -EAGAIN; | ||
769 | goto out; | ||
770 | } | ||
771 | |||
772 | /* | ||
773 | * If the parameter is being read return the state of the corresponding | ||
774 | * bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the | ||
775 | * run state of the lockup detectors. | ||
776 | */ | ||
777 | if (!write) { | 683 | if (!write) { |
778 | *watchdog_param = (watchdog_enabled & which) != 0; | 684 | /* |
685 | * On read synchronize the userspace interface. This is a | ||
686 | * racy snapshot. | ||
687 | */ | ||
688 | *param = (watchdog_enabled & which) != 0; | ||
779 | err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); | 689 | err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); |
780 | } else { | 690 | } else { |
691 | old = READ_ONCE(*param); | ||
781 | err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); | 692 | err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); |
782 | if (err) | 693 | if (!err && old != READ_ONCE(*param)) |
783 | goto out; | 694 | proc_watchdog_update(); |
784 | |||
785 | /* | ||
786 | * There is a race window between fetching the current value | ||
787 | * from 'watchdog_enabled' and storing the new value. During | ||
788 | * this race window, watchdog_nmi_enable() can sneak in and | ||
789 | * clear the NMI_WATCHDOG_ENABLED bit in 'watchdog_enabled'. | ||
790 | * The 'cmpxchg' detects this race and the loop retries. | ||
791 | */ | ||
792 | do { | ||
793 | old = watchdog_enabled; | ||
794 | /* | ||
795 | * If the parameter value is not zero set the | ||
796 | * corresponding bit(s), else clear it(them). | ||
797 | */ | ||
798 | if (*watchdog_param) | ||
799 | new = old | which; | ||
800 | else | ||
801 | new = old & ~which; | ||
802 | } while (cmpxchg(&watchdog_enabled, old, new) != old); | ||
803 | |||
804 | /* | ||
805 | * Update the run state of the lockup detectors. There is _no_ | ||
806 | * need to check the value returned by proc_watchdog_update() | ||
807 | * and to restore the previous value of 'watchdog_enabled' as | ||
808 | * both lockup detectors are disabled if proc_watchdog_update() | ||
809 | * returns an error. | ||
810 | */ | ||
811 | if (old == new) | ||
812 | goto out; | ||
813 | |||
814 | err = proc_watchdog_update(); | ||
815 | } | 695 | } |
816 | out: | 696 | mutex_unlock(&watchdog_mutex); |
817 | mutex_unlock(&watchdog_proc_mutex); | ||
818 | put_online_cpus(); | ||
819 | return err; | 697 | return err; |
820 | } | 698 | } |
821 | 699 | ||
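The rewritten proc_watchdog_common() and proc_watchdog_thresh() above replace the cmpxchg loop and the watchdog_suspended checks with a simpler shape: under watchdog_mutex, snapshot the parameter, let proc_dointvec_minmax() store the new value, and reconfigure only when the value actually changed. A standalone sketch of that shape, with write_param() standing in for proc_dointvec_minmax().

#include <stdio.h>

static int soft_watchdog_user_enabled = 1;

/* Stand-in for proc_dointvec_minmax(): stores the value written by the user. */
static int write_param(int *param, int newval)
{
        *param = newval;
        return 0;
}

static void proc_watchdog_update(void)
{
        printf("reconfiguring lockup detector\n");
}

/* Same shape as the rewritten handlers (which run under watchdog_mutex):
 * snapshot the old value, perform the write, reconfigure only on change. */
static int proc_soft_watchdog_write(int newval)
{
        int err, old;

        old = soft_watchdog_user_enabled;
        err = write_param(&soft_watchdog_user_enabled, newval);
        if (!err && old != soft_watchdog_user_enabled)
                proc_watchdog_update();
        return err;
}

int main(void)
{
        proc_soft_watchdog_write(1);    /* unchanged value: no reconfigure */
        proc_soft_watchdog_write(0);    /* changed value: reconfigures     */
        return 0;
}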
@@ -835,6 +713,8 @@ int proc_watchdog(struct ctl_table *table, int write, | |||
835 | int proc_nmi_watchdog(struct ctl_table *table, int write, | 713 | int proc_nmi_watchdog(struct ctl_table *table, int write, |
836 | void __user *buffer, size_t *lenp, loff_t *ppos) | 714 | void __user *buffer, size_t *lenp, loff_t *ppos) |
837 | { | 715 | { |
716 | if (!nmi_watchdog_available && write) | ||
717 | return -ENOTSUPP; | ||
838 | return proc_watchdog_common(NMI_WATCHDOG_ENABLED, | 718 | return proc_watchdog_common(NMI_WATCHDOG_ENABLED, |
839 | table, write, buffer, lenp, ppos); | 719 | table, write, buffer, lenp, ppos); |
840 | } | 720 | } |
@@ -855,39 +735,17 @@ int proc_soft_watchdog(struct ctl_table *table, int write, | |||
855 | int proc_watchdog_thresh(struct ctl_table *table, int write, | 735 | int proc_watchdog_thresh(struct ctl_table *table, int write, |
856 | void __user *buffer, size_t *lenp, loff_t *ppos) | 736 | void __user *buffer, size_t *lenp, loff_t *ppos) |
857 | { | 737 | { |
858 | int err, old, new; | 738 | int err, old; |
859 | |||
860 | get_online_cpus(); | ||
861 | mutex_lock(&watchdog_proc_mutex); | ||
862 | 739 | ||
863 | if (watchdog_suspended) { | 740 | mutex_lock(&watchdog_mutex); |
864 | /* no parameter changes allowed while watchdog is suspended */ | ||
865 | err = -EAGAIN; | ||
866 | goto out; | ||
867 | } | ||
868 | 741 | ||
869 | old = ACCESS_ONCE(watchdog_thresh); | 742 | old = READ_ONCE(watchdog_thresh); |
870 | err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); | 743 | err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); |
871 | 744 | ||
872 | if (err || !write) | 745 | if (!err && write && old != READ_ONCE(watchdog_thresh)) |
873 | goto out; | 746 | proc_watchdog_update(); |
874 | |||
875 | /* | ||
876 | * Update the sample period. Restore on failure. | ||
877 | */ | ||
878 | new = ACCESS_ONCE(watchdog_thresh); | ||
879 | if (old == new) | ||
880 | goto out; | ||
881 | 747 | ||
882 | set_sample_period(); | 748 | mutex_unlock(&watchdog_mutex); |
883 | err = proc_watchdog_update(); | ||
884 | if (err) { | ||
885 | watchdog_thresh = old; | ||
886 | set_sample_period(); | ||
887 | } | ||
888 | out: | ||
889 | mutex_unlock(&watchdog_proc_mutex); | ||
890 | put_online_cpus(); | ||
891 | return err; | 749 | return err; |
892 | } | 750 | } |
893 | 751 | ||
@@ -902,45 +760,19 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, | |||
902 | { | 760 | { |
903 | int err; | 761 | int err; |
904 | 762 | ||
905 | get_online_cpus(); | 763 | mutex_lock(&watchdog_mutex); |
906 | mutex_lock(&watchdog_proc_mutex); | ||
907 | |||
908 | if (watchdog_suspended) { | ||
909 | /* no parameter changes allowed while watchdog is suspended */ | ||
910 | err = -EAGAIN; | ||
911 | goto out; | ||
912 | } | ||
913 | 764 | ||
914 | err = proc_do_large_bitmap(table, write, buffer, lenp, ppos); | 765 | err = proc_do_large_bitmap(table, write, buffer, lenp, ppos); |
915 | if (!err && write) { | 766 | if (!err && write) |
916 | /* Remove impossible cpus to keep sysctl output cleaner. */ | 767 | proc_watchdog_update(); |
917 | cpumask_and(&watchdog_cpumask, &watchdog_cpumask, | ||
918 | cpu_possible_mask); | ||
919 | |||
920 | if (watchdog_running) { | ||
921 | /* | ||
922 | * Failure would be due to being unable to allocate | ||
923 | * a temporary cpumask, so we are likely not in a | ||
924 | * position to do much else to make things better. | ||
925 | */ | ||
926 | if (watchdog_update_cpus() != 0) | ||
927 | pr_err("cpumask update failed\n"); | ||
928 | } | ||
929 | 768 | ||
930 | watchdog_nmi_reconfigure(); | 769 | mutex_unlock(&watchdog_mutex); |
931 | } | ||
932 | out: | ||
933 | mutex_unlock(&watchdog_proc_mutex); | ||
934 | put_online_cpus(); | ||
935 | return err; | 770 | return err; |
936 | } | 771 | } |
937 | |||
938 | #endif /* CONFIG_SYSCTL */ | 772 | #endif /* CONFIG_SYSCTL */ |
939 | 773 | ||
940 | void __init lockup_detector_init(void) | 774 | void __init lockup_detector_init(void) |
941 | { | 775 | { |
942 | set_sample_period(); | ||
943 | |||
944 | #ifdef CONFIG_NO_HZ_FULL | 776 | #ifdef CONFIG_NO_HZ_FULL |
945 | if (tick_nohz_full_enabled()) { | 777 | if (tick_nohz_full_enabled()) { |
946 | pr_info("Disabling watchdog on nohz_full cores by default\n"); | 778 | pr_info("Disabling watchdog on nohz_full cores by default\n"); |
@@ -951,6 +783,7 @@ void __init lockup_detector_init(void) | |||
951 | cpumask_copy(&watchdog_cpumask, cpu_possible_mask); | 783 | cpumask_copy(&watchdog_cpumask, cpu_possible_mask); |
952 | #endif | 784 | #endif |
953 | 785 | ||
954 | if (watchdog_enabled) | 786 | if (!watchdog_nmi_probe()) |
955 | watchdog_enable_all_cpus(); | 787 | nmi_watchdog_available = true; |
788 | lockup_detector_setup(); | ||
956 | } | 789 | } |
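The watchdog.c rework above funnels every configuration change through lockup_detector_reconfigure(): stop the NMI watchdog, park the softlockup threads, recompute watchdog_enabled from the user-visible knobs in lockup_detector_update_enable(), then unpark and restart. The flag composition is the part that can be shown outside the kernel; the sketch below mirrors its shape (the bit values are chosen for illustration, and the locking, smpboot and perf calls are omitted).

#include <stdio.h>

/* Bit values chosen for illustration; the kernel defines its own. */
#define NMI_WATCHDOG_ENABLED    (1UL << 0)
#define SOFT_WATCHDOG_ENABLED   (1UL << 1)

static unsigned long watchdog_enabled;

static int watchdog_user_enabled = 1;           /* "nowatchdog" clears this      */
static int nmi_watchdog_user_enabled = 1;       /* "nmi_watchdog=0" clears this  */
static int soft_watchdog_user_enabled = 1;      /* "nosoftlockup" clears this    */
static int nmi_watchdog_available = 1;          /* set when the perf probe works */

/* Mirrors the shape of lockup_detector_update_enable() in the hunks above. */
static void lockup_detector_update_enable(void)
{
        watchdog_enabled = 0;
        if (!watchdog_user_enabled)
                return;
        if (nmi_watchdog_available && nmi_watchdog_user_enabled)
                watchdog_enabled |= NMI_WATCHDOG_ENABLED;
        if (soft_watchdog_user_enabled)
                watchdog_enabled |= SOFT_WATCHDOG_ENABLED;
}

int main(void)
{
        lockup_detector_update_enable();
        printf("enabled mask: %#lx\n", watchdog_enabled);       /* 0x3 */

        nmi_watchdog_available = 0;     /* e.g. no hw-PMU counter found */
        lockup_detector_update_enable();
        printf("enabled mask: %#lx\n", watchdog_enabled);       /* 0x2 */
        return 0;
}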
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 3a09ea1b1d3d..71a62ceacdc8 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c | |||
@@ -21,8 +21,10 @@ | |||
21 | static DEFINE_PER_CPU(bool, hard_watchdog_warn); | 21 | static DEFINE_PER_CPU(bool, hard_watchdog_warn); |
22 | static DEFINE_PER_CPU(bool, watchdog_nmi_touch); | 22 | static DEFINE_PER_CPU(bool, watchdog_nmi_touch); |
23 | static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); | 23 | static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); |
24 | static struct cpumask dead_events_mask; | ||
24 | 25 | ||
25 | static unsigned long hardlockup_allcpu_dumped; | 26 | static unsigned long hardlockup_allcpu_dumped; |
27 | static unsigned int watchdog_cpus; | ||
26 | 28 | ||
27 | void arch_touch_nmi_watchdog(void) | 29 | void arch_touch_nmi_watchdog(void) |
28 | { | 30 | { |
@@ -103,15 +105,12 @@ static struct perf_event_attr wd_hw_attr = { | |||
103 | 105 | ||
104 | /* Callback function for perf event subsystem */ | 106 | /* Callback function for perf event subsystem */ |
105 | static void watchdog_overflow_callback(struct perf_event *event, | 107 | static void watchdog_overflow_callback(struct perf_event *event, |
106 | struct perf_sample_data *data, | 108 | struct perf_sample_data *data, |
107 | struct pt_regs *regs) | 109 | struct pt_regs *regs) |
108 | { | 110 | { |
109 | /* Ensure the watchdog never gets throttled */ | 111 | /* Ensure the watchdog never gets throttled */ |
110 | event->hw.interrupts = 0; | 112 | event->hw.interrupts = 0; |
111 | 113 | ||
112 | if (atomic_read(&watchdog_park_in_progress) != 0) | ||
113 | return; | ||
114 | |||
115 | if (__this_cpu_read(watchdog_nmi_touch) == true) { | 114 | if (__this_cpu_read(watchdog_nmi_touch) == true) { |
116 | __this_cpu_write(watchdog_nmi_touch, false); | 115 | __this_cpu_write(watchdog_nmi_touch, false); |
117 | return; | 116 | return; |
@@ -160,104 +159,131 @@ static void watchdog_overflow_callback(struct perf_event *event, | |||
160 | return; | 159 | return; |
161 | } | 160 | } |
162 | 161 | ||
163 | /* | 162 | static int hardlockup_detector_event_create(void) |
164 | * People like the simple clean cpu node info on boot. | ||
165 | * Reduce the watchdog noise by only printing messages | ||
166 | * that are different from what cpu0 displayed. | ||
167 | */ | ||
168 | static unsigned long firstcpu_err; | ||
169 | static atomic_t watchdog_cpus; | ||
170 | |||
171 | int watchdog_nmi_enable(unsigned int cpu) | ||
172 | { | 163 | { |
164 | unsigned int cpu = smp_processor_id(); | ||
173 | struct perf_event_attr *wd_attr; | 165 | struct perf_event_attr *wd_attr; |
174 | struct perf_event *event = per_cpu(watchdog_ev, cpu); | 166 | struct perf_event *evt; |
175 | int firstcpu = 0; | ||
176 | |||
177 | /* nothing to do if the hard lockup detector is disabled */ | ||
178 | if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) | ||
179 | goto out; | ||
180 | |||
181 | /* is it already setup and enabled? */ | ||
182 | if (event && event->state > PERF_EVENT_STATE_OFF) | ||
183 | goto out; | ||
184 | |||
185 | /* it is setup but not enabled */ | ||
186 | if (event != NULL) | ||
187 | goto out_enable; | ||
188 | |||
189 | if (atomic_inc_return(&watchdog_cpus) == 1) | ||
190 | firstcpu = 1; | ||
191 | 167 | ||
192 | wd_attr = &wd_hw_attr; | 168 | wd_attr = &wd_hw_attr; |
193 | wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); | 169 | wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); |
194 | 170 | ||
195 | /* Try to register using hardware perf events */ | 171 | /* Try to register using hardware perf events */ |
196 | event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); | 172 | evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL, |
173 | watchdog_overflow_callback, NULL); | ||
174 | if (IS_ERR(evt)) { | ||
175 | pr_info("Perf event create on CPU %d failed with %ld\n", cpu, | ||
176 | PTR_ERR(evt)); | ||
177 | return PTR_ERR(evt); | ||
178 | } | ||
179 | this_cpu_write(watchdog_ev, evt); | ||
180 | return 0; | ||
181 | } | ||
197 | 182 | ||
198 | /* save the first cpu's error for future comparision */ | 183 | /** |
199 | if (firstcpu && IS_ERR(event)) | 184 | * hardlockup_detector_perf_enable - Enable the local event |
200 | firstcpu_err = PTR_ERR(event); | 185 | */ |
186 | void hardlockup_detector_perf_enable(void) | ||
187 | { | ||
188 | if (hardlockup_detector_event_create()) | ||
189 | return; | ||
201 | 190 | ||
202 | if (!IS_ERR(event)) { | 191 | if (!watchdog_cpus++) |
203 | /* only print for the first cpu initialized */ | 192 | pr_info("Enabled. Permanently consumes one hw-PMU counter.\n"); |
204 | if (firstcpu || firstcpu_err) | ||
205 | pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n"); | ||
206 | goto out_save; | ||
207 | } | ||
208 | 193 | ||
209 | /* | 194 | perf_event_enable(this_cpu_read(watchdog_ev)); |
210 | * Disable the hard lockup detector if _any_ CPU fails to set up | ||
211 | * set up the hardware perf event. The watchdog() function checks | ||
212 | * the NMI_WATCHDOG_ENABLED bit periodically. | ||
213 | * | ||
214 | * The barriers are for syncing up watchdog_enabled across all the | ||
215 | * cpus, as clear_bit() does not use barriers. | ||
216 | */ | ||
217 | smp_mb__before_atomic(); | ||
218 | clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled); | ||
219 | smp_mb__after_atomic(); | ||
220 | |||
221 | /* skip displaying the same error again */ | ||
222 | if (!firstcpu && (PTR_ERR(event) == firstcpu_err)) | ||
223 | return PTR_ERR(event); | ||
224 | |||
225 | /* vary the KERN level based on the returned errno */ | ||
226 | if (PTR_ERR(event) == -EOPNOTSUPP) | ||
227 | pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu); | ||
228 | else if (PTR_ERR(event) == -ENOENT) | ||
229 | pr_warn("disabled (cpu%i): hardware events not enabled\n", | ||
230 | cpu); | ||
231 | else | ||
232 | pr_err("disabled (cpu%i): unable to create perf event: %ld\n", | ||
233 | cpu, PTR_ERR(event)); | ||
234 | |||
235 | pr_info("Shutting down hard lockup detector on all cpus\n"); | ||
236 | |||
237 | return PTR_ERR(event); | ||
238 | |||
239 | /* success path */ | ||
240 | out_save: | ||
241 | per_cpu(watchdog_ev, cpu) = event; | ||
242 | out_enable: | ||
243 | perf_event_enable(per_cpu(watchdog_ev, cpu)); | ||
244 | out: | ||
245 | return 0; | ||
246 | } | 195 | } |
247 | 196 | ||
248 | void watchdog_nmi_disable(unsigned int cpu) | 197 | /** |
198 | * hardlockup_detector_perf_disable - Disable the local event | ||
199 | */ | ||
200 | void hardlockup_detector_perf_disable(void) | ||
249 | { | 201 | { |
250 | struct perf_event *event = per_cpu(watchdog_ev, cpu); | 202 | struct perf_event *event = this_cpu_read(watchdog_ev); |
251 | 203 | ||
252 | if (event) { | 204 | if (event) { |
253 | perf_event_disable(event); | 205 | perf_event_disable(event); |
206 | cpumask_set_cpu(smp_processor_id(), &dead_events_mask); | ||
207 | watchdog_cpus--; | ||
208 | } | ||
209 | } | ||
210 | |||
211 | /** | ||
212 | * hardlockup_detector_perf_cleanup - Cleanup disabled events and destroy them | ||
213 | * | ||
214 | * Called from lockup_detector_cleanup(). Serialized by the caller. | ||
215 | */ | ||
216 | void hardlockup_detector_perf_cleanup(void) | ||
217 | { | ||
218 | int cpu; | ||
219 | |||
220 | for_each_cpu(cpu, &dead_events_mask) { | ||
221 | struct perf_event *event = per_cpu(watchdog_ev, cpu); | ||
222 | |||
223 | /* | ||
224 | * Required because for_each_cpu() reports unconditionally | ||
225 | * CPU0 as set on UP kernels. Sigh. | ||
226 | */ | ||
227 | if (event) | ||
228 | perf_event_release_kernel(event); | ||
254 | per_cpu(watchdog_ev, cpu) = NULL; | 229 | per_cpu(watchdog_ev, cpu) = NULL; |
230 | } | ||
231 | cpumask_clear(&dead_events_mask); | ||
232 | } | ||
233 | |||
234 | /** | ||
235 | * hardlockup_detector_perf_stop - Globally stop watchdog events | ||
236 | * | ||
237 | * Special interface for x86 to handle the perf HT bug. | ||
238 | */ | ||
239 | void __init hardlockup_detector_perf_stop(void) | ||
240 | { | ||
241 | int cpu; | ||
242 | |||
243 | lockdep_assert_cpus_held(); | ||
244 | |||
245 | for_each_online_cpu(cpu) { | ||
246 | struct perf_event *event = per_cpu(watchdog_ev, cpu); | ||
247 | |||
248 | if (event) | ||
249 | perf_event_disable(event); | ||
250 | } | ||
251 | } | ||
255 | 252 | ||
256 | /* should be in cleanup, but blocks oprofile */ | 253 | /** |
257 | perf_event_release_kernel(event); | 254 | * hardlockup_detector_perf_restart - Globally restart watchdog events |
255 | * | ||
256 | * Special interface for x86 to handle the perf HT bug. | ||
257 | */ | ||
258 | void __init hardlockup_detector_perf_restart(void) | ||
259 | { | ||
260 | int cpu; | ||
261 | |||
262 | lockdep_assert_cpus_held(); | ||
263 | |||
264 | if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) | ||
265 | return; | ||
266 | |||
267 | for_each_online_cpu(cpu) { | ||
268 | struct perf_event *event = per_cpu(watchdog_ev, cpu); | ||
269 | |||
270 | if (event) | ||
271 | perf_event_enable(event); | ||
272 | } | ||
273 | } | ||
274 | |||
275 | /** | ||
276 | * hardlockup_detector_perf_init - Probe whether NMI event is available at all | ||
277 | */ | ||
278 | int __init hardlockup_detector_perf_init(void) | ||
279 | { | ||
280 | int ret = hardlockup_detector_event_create(); | ||
258 | 281 | ||
259 | /* watchdog_nmi_enable() expects this to be zero initially. */ | 282 | if (ret) { |
260 | if (atomic_dec_and_test(&watchdog_cpus)) | 283 | pr_info("Perf NMI watchdog permanently disabled\n"); |
261 | firstcpu_err = 0; | 284 | } else { |
285 | perf_event_release_kernel(this_cpu_read(watchdog_ev)); | ||
286 | this_cpu_write(watchdog_ev, NULL); | ||
262 | } | 287 | } |
288 | return ret; | ||
263 | } | 289 | } |
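The watchdog_hld.c changes above split the perf teardown out of the hot path: hardlockup_detector_perf_disable() only disables the event and marks the CPU in dead_events_mask, and perf_event_release_kernel() runs later from hardlockup_detector_perf_cleanup(), serialized by the caller. A portable sketch of that deferred-release pattern; the unsigned long bitmask and struct event here are simplified placeholders for the kernel's cpumask and perf event.

#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS 4

/* Placeholder for a per-CPU perf event. */
struct event { int cpu; };

static struct event *watchdog_ev[NR_CPUS];
static unsigned long dead_events_mask;  /* stand-in for a struct cpumask */

/* Hot path: cheap, just remember that this CPU's event went away. */
static void perf_disable_on(int cpu)
{
        if (watchdog_ev[cpu])
                dead_events_mask |= 1UL << cpu;
}

/* Cold path: release everything that was marked dead, then clear the mask. */
static void perf_cleanup(void)
{
        for (int cpu = 0; cpu < NR_CPUS; cpu++) {
                if (!(dead_events_mask & (1UL << cpu)))
                        continue;
                free(watchdog_ev[cpu]);
                watchdog_ev[cpu] = NULL;
        }
        dead_events_mask = 0;
}

int main(void)
{
        for (int cpu = 0; cpu < NR_CPUS; cpu++) {
                watchdog_ev[cpu] = malloc(sizeof(struct event));
                if (!watchdog_ev[cpu])
                        return 1;
                watchdog_ev[cpu]->cpu = cpu;
        }

        perf_disable_on(1);
        perf_disable_on(3);
        perf_cleanup();

        for (int cpu = 0; cpu < NR_CPUS; cpu++)
                printf("cpu%d event %s\n", cpu,
                       watchdog_ev[cpu] ? "alive" : "released");
        return 0;
}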