Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 4
-rw-r--r--  kernel/acct.c | 2
-rw-r--r--  kernel/bpf/Makefile | 3
-rw-r--r--  kernel/bpf/arraymap.c | 6
-rw-r--r--  kernel/bpf/core.c | 8
-rw-r--r--  kernel/bpf/hashtab.c | 6
-rw-r--r--  kernel/bpf/helpers.c | 30
-rw-r--r--  kernel/bpf/syscall.c | 18
-rw-r--r--  kernel/bpf/test_stub.c | 78
-rw-r--r--  kernel/bpf/verifier.c | 185
-rw-r--r--  kernel/capability.c | 35
-rw-r--r--  kernel/cgroup.c | 14
-rw-r--r--  kernel/context_tracking.c | 59
-rw-r--r--  kernel/cpu.c | 43
-rw-r--r--  kernel/cpuset.c | 31
-rw-r--r--  kernel/cred.c | 3
-rw-r--r--  kernel/events/core.c | 762
-rw-r--r--  kernel/events/hw_breakpoint.c | 8
-rw-r--r--  kernel/events/internal.h | 33
-rw-r--r--  kernel/events/ring_buffer.c | 327
-rw-r--r--  kernel/exec_domain.c | 137
-rw-r--r--  kernel/exit.c | 2
-rw-r--r--  kernel/fork.c | 145
-rw-r--r--  kernel/futex.c | 2
-rw-r--r--  kernel/gcov/base.c | 5
-rw-r--r--  kernel/groups.c | 3
-rw-r--r--  kernel/hung_task.c | 4
-rw-r--r--  kernel/irq/chip.c | 16
-rw-r--r--  kernel/irq/manage.c | 127
-rw-r--r--  kernel/irq/msi.c | 11
-rw-r--r--  kernel/livepatch/core.c | 69
-rw-r--r--  kernel/locking/lockdep.c | 97
-rw-r--r--  kernel/locking/mcs_spinlock.h | 6
-rw-r--r--  kernel/locking/mutex.c | 51
-rw-r--r--  kernel/locking/osq_lock.c | 14
-rw-r--r--  kernel/locking/rtmutex.c | 2
-rw-r--r--  kernel/locking/rwsem-spinlock.c | 7
-rw-r--r--  kernel/locking/rwsem-xadd.c | 98
-rw-r--r--  kernel/locking/rwsem.c | 22
-rw-r--r--  kernel/locking/rwsem.h | 20
-rw-r--r--  kernel/module.c | 30
-rw-r--r--  kernel/pid.c | 15
-rw-r--r--  kernel/power/main.c | 2
-rw-r--r--  kernel/power/snapshot.c | 21
-rw-r--r--  kernel/power/suspend.c | 13
-rw-r--r--  kernel/printk/printk.c | 57
-rw-r--r--  kernel/ptrace.c | 39
-rw-r--r--  kernel/rcu/rcutorture.c | 27
-rw-r--r--  kernel/rcu/srcu.c | 19
-rw-r--r--  kernel/rcu/tiny.c | 14
-rw-r--r--  kernel/rcu/tree.c | 437
-rw-r--r--  kernel/rcu/tree.h | 11
-rw-r--r--  kernel/rcu/tree_plugin.h | 267
-rw-r--r--  kernel/rcu/tree_trace.c | 4
-rw-r--r--  kernel/rcu/update.c | 72
-rw-r--r--  kernel/reboot.c | 53
-rw-r--r--  kernel/resource.c | 32
-rw-r--r--  kernel/sched/core.c | 121
-rw-r--r--  kernel/sched/deadline.c | 77
-rw-r--r--  kernel/sched/debug.c | 12
-rw-r--r--  kernel/sched/fair.c | 437
-rw-r--r--  kernel/sched/features.h | 13
-rw-r--r--  kernel/sched/idle.c | 14
-rw-r--r--  kernel/sched/rt.c | 181
-rw-r--r--  kernel/sched/sched.h | 38
-rw-r--r--  kernel/signal.c | 14
-rw-r--r--  kernel/smp.c | 80
-rw-r--r--  kernel/smpboot.c | 156
-rw-r--r--  kernel/sys.c | 49
-rw-r--r--  kernel/sys_ni.c | 14
-rw-r--r--  kernel/sysctl.c | 69
-rw-r--r--  kernel/time/Kconfig | 6
-rw-r--r--  kernel/time/Makefile | 6
-rw-r--r--  kernel/time/clockevents.c | 229
-rw-r--r--  kernel/time/clocksource.c | 173
-rw-r--r--  kernel/time/hrtimer.c | 9
-rw-r--r--  kernel/time/jiffies.c | 7
-rw-r--r--  kernel/time/ntp.c | 14
-rw-r--r--  kernel/time/sched_clock.c | 236
-rw-r--r--  kernel/time/tick-broadcast-hrtimer.c | 11
-rw-r--r--  kernel/time/tick-broadcast.c | 179
-rw-r--r--  kernel/time/tick-common.c | 82
-rw-r--r--  kernel/time/tick-internal.h | 211
-rw-r--r--  kernel/time/tick-oneshot.c | 6
-rw-r--r--  kernel/time/tick-sched.c | 7
-rw-r--r--  kernel/time/tick-sched.h | 74
-rw-r--r--  kernel/time/timekeeping.c | 490
-rw-r--r--  kernel/time/timekeeping.h | 7
-rw-r--r--  kernel/time/timer.c | 149
-rw-r--r--  kernel/time/timer_list.c | 34
-rw-r--r--  kernel/trace/Kconfig | 36
-rw-r--r--  kernel/trace/Makefile | 1
-rw-r--r--  kernel/trace/bpf_trace.c | 222
-rw-r--r--  kernel/trace/ftrace.c | 44
-rw-r--r--  kernel/trace/ring_buffer.c | 10
-rw-r--r--  kernel/trace/trace.c | 491
-rw-r--r--  kernel/trace/trace.h | 4
-rw-r--r--  kernel/trace/trace_entries.h | 6
-rw-r--r--  kernel/trace/trace_events.c | 153
-rw-r--r--  kernel/trace/trace_export.c | 2
-rw-r--r--  kernel/trace/trace_functions_graph.c | 7
-rw-r--r--  kernel/trace/trace_kprobe.c | 25
-rw-r--r--  kernel/trace/trace_probe.c | 19
-rw-r--r--  kernel/trace/trace_probe.h | 12
-rw-r--r--  kernel/trace/trace_stack.c | 4
-rw-r--r--  kernel/trace/trace_stat.c | 10
-rw-r--r--  kernel/trace/trace_uprobe.c | 15
-rw-r--r--  kernel/watchdog.c | 317
-rw-r--r--  kernel/workqueue.c | 847
109 files changed, 6218 insertions, 2818 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 1408b3353a3c..0f8f8b0bc1bf 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,7 +9,9 @@ obj-y = fork.o exec_domain.o panic.o \
             extable.o params.o \
             kthread.o sys_ni.o nsproxy.o \
             notifier.o ksysfs.o cred.o reboot.o \
-            async.o range.o groups.o smpboot.o
+            async.o range.o smpboot.o
+
+obj-$(CONFIG_MULTIUSER) += groups.o
 
 ifdef CONFIG_FUNCTION_TRACER
 # Do not trace debug files and internal ftrace files
diff --git a/kernel/acct.c b/kernel/acct.c
index e6c10d1a4058..74963d192c5d 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -213,7 +213,7 @@ static int acct_on(struct filename *pathname)
 		return -EACCES;
 	}
 
-	if (!file->f_op->write) {
+	if (!(file->f_mode & FMODE_CAN_WRITE)) {
 		kfree(acct);
 		filp_close(file, NULL);
 		return -EIO;
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index a5ae60f0b0a2..e6983be12bd3 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,5 +1,2 @@
 obj-y := core.o
 obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o arraymap.o helpers.o
-ifdef CONFIG_TEST_BPF
-obj-$(CONFIG_BPF_SYSCALL) += test_stub.o
-endif
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 9eb4d8a7cd87..8a6616583f38 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -134,7 +134,7 @@ static void array_map_free(struct bpf_map *map)
 	kvfree(array);
 }
 
-static struct bpf_map_ops array_ops = {
+static const struct bpf_map_ops array_ops = {
 	.map_alloc = array_map_alloc,
 	.map_free = array_map_free,
 	.map_get_next_key = array_map_get_next_key,
@@ -143,14 +143,14 @@ static struct bpf_map_ops array_ops = {
 	.map_delete_elem = array_map_delete_elem,
 };
 
-static struct bpf_map_type_list tl = {
+static struct bpf_map_type_list array_type __read_mostly = {
 	.ops = &array_ops,
 	.type = BPF_MAP_TYPE_ARRAY,
 };
 
 static int __init register_array_map(void)
 {
-	bpf_register_map_type(&tl);
+	bpf_register_map_type(&array_type);
 	return 0;
 }
 late_initcall(register_array_map);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index a64e7a207d2b..4139a0f8b558 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -656,6 +656,14 @@ void bpf_prog_free(struct bpf_prog *fp)
 }
 EXPORT_SYMBOL_GPL(bpf_prog_free);
 
+/* Weak definitions of helper functions in case we don't have bpf syscall. */
+const struct bpf_func_proto bpf_map_lookup_elem_proto __weak;
+const struct bpf_func_proto bpf_map_update_elem_proto __weak;
+const struct bpf_func_proto bpf_map_delete_elem_proto __weak;
+
+const struct bpf_func_proto bpf_get_prandom_u32_proto __weak;
+const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak;
+
 /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
  * skb_copy_bits(), so provide a weak definition of it for NET-less config.
  */
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index b3ba43674310..83c209d9b17a 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -345,7 +345,7 @@ static void htab_map_free(struct bpf_map *map)
 	kfree(htab);
 }
 
-static struct bpf_map_ops htab_ops = {
+static const struct bpf_map_ops htab_ops = {
 	.map_alloc = htab_map_alloc,
 	.map_free = htab_map_free,
 	.map_get_next_key = htab_map_get_next_key,
@@ -354,14 +354,14 @@ static struct bpf_map_ops htab_ops = {
 	.map_delete_elem = htab_map_delete_elem,
 };
 
-static struct bpf_map_type_list tl = {
+static struct bpf_map_type_list htab_type __read_mostly = {
 	.ops = &htab_ops,
 	.type = BPF_MAP_TYPE_HASH,
 };
 
 static int __init register_htab_map(void)
 {
-	bpf_register_map_type(&tl);
+	bpf_register_map_type(&htab_type);
 	return 0;
 }
 late_initcall(register_htab_map);
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 9e3414d85459..bd7f5988ed9c 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -11,6 +11,8 @@
  */
 #include <linux/bpf.h>
 #include <linux/rcupdate.h>
+#include <linux/random.h>
+#include <linux/smp.h>
 
 /* If kernel subsystem is allowing eBPF programs to call this function,
  * inside its own verifier_ops->get_func_proto() callback it should return
@@ -41,7 +43,7 @@ static u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
 	return (unsigned long) value;
 }
 
-struct bpf_func_proto bpf_map_lookup_elem_proto = {
+const struct bpf_func_proto bpf_map_lookup_elem_proto = {
 	.func = bpf_map_lookup_elem,
 	.gpl_only = false,
 	.ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
@@ -60,7 +62,7 @@ static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
 	return map->ops->map_update_elem(map, key, value, r4);
 }
 
-struct bpf_func_proto bpf_map_update_elem_proto = {
+const struct bpf_func_proto bpf_map_update_elem_proto = {
 	.func = bpf_map_update_elem,
 	.gpl_only = false,
 	.ret_type = RET_INTEGER,
@@ -80,10 +82,32 @@ static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
 	return map->ops->map_delete_elem(map, key);
 }
 
-struct bpf_func_proto bpf_map_delete_elem_proto = {
+const struct bpf_func_proto bpf_map_delete_elem_proto = {
 	.func = bpf_map_delete_elem,
 	.gpl_only = false,
 	.ret_type = RET_INTEGER,
 	.arg1_type = ARG_CONST_MAP_PTR,
 	.arg2_type = ARG_PTR_TO_MAP_KEY,
 };
+
+static u64 bpf_get_prandom_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	return prandom_u32();
+}
+
+const struct bpf_func_proto bpf_get_prandom_u32_proto = {
+	.func = bpf_get_prandom_u32,
+	.gpl_only = false,
+	.ret_type = RET_INTEGER,
+};
+
+static u64 bpf_get_smp_processor_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	return raw_smp_processor_id();
+}
+
+const struct bpf_func_proto bpf_get_smp_processor_id_proto = {
+	.func = bpf_get_smp_processor_id,
+	.gpl_only = false,
+	.ret_type = RET_INTEGER,
+};
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 536edc2be307..3bae6c591914 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -16,6 +16,7 @@
 #include <linux/file.h>
 #include <linux/license.h>
 #include <linux/filter.h>
+#include <linux/version.h>
 
 static LIST_HEAD(bpf_map_types);
 
@@ -354,10 +355,11 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
 	list_for_each_entry(tl, &bpf_prog_types, list_node) {
 		if (tl->type == type) {
 			prog->aux->ops = tl->ops;
-			prog->aux->prog_type = type;
+			prog->type = type;
 			return 0;
 		}
 	}
+
 	return -EINVAL;
 }
 
@@ -418,6 +420,7 @@ void bpf_prog_put(struct bpf_prog *prog)
 		bpf_prog_free(prog);
 	}
 }
+EXPORT_SYMBOL_GPL(bpf_prog_put);
 
 static int bpf_prog_release(struct inode *inode, struct file *filp)
 {
@@ -465,9 +468,10 @@ struct bpf_prog *bpf_prog_get(u32 ufd)
 	fdput(f);
 	return prog;
 }
+EXPORT_SYMBOL_GPL(bpf_prog_get);
 
 /* last field in 'union bpf_attr' used by this command */
-#define BPF_PROG_LOAD_LAST_FIELD log_buf
+#define BPF_PROG_LOAD_LAST_FIELD kern_version
 
 static int bpf_prog_load(union bpf_attr *attr)
 {
@@ -492,6 +496,10 @@ static int bpf_prog_load(union bpf_attr *attr)
 	if (attr->insn_cnt >= BPF_MAXINSNS)
 		return -EINVAL;
 
+	if (type == BPF_PROG_TYPE_KPROBE &&
+	    attr->kern_version != LINUX_VERSION_CODE)
+		return -EINVAL;
+
 	/* plain bpf_prog allocation */
 	prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
 	if (!prog)
@@ -508,7 +516,7 @@ static int bpf_prog_load(union bpf_attr *attr)
 	prog->jited = false;
 
 	atomic_set(&prog->aux->refcnt, 1);
-	prog->aux->is_gpl_compatible = is_gpl;
+	prog->gpl_compatible = is_gpl;
 
 	/* find program type: socket_filter vs tracing_filter */
 	err = find_prog_type(type, prog);
@@ -516,8 +524,7 @@ static int bpf_prog_load(union bpf_attr *attr)
 		goto free_prog;
 
 	/* run eBPF verifier */
-	err = bpf_check(prog, attr);
-
+	err = bpf_check(&prog, attr);
 	if (err < 0)
 		goto free_used_maps;
 
@@ -528,7 +535,6 @@ static int bpf_prog_load(union bpf_attr *attr)
 	bpf_prog_select_runtime(prog);
 
 	err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC);
-
 	if (err < 0)
 		/* failed to allocate fd */
 		goto free_used_maps;
diff --git a/kernel/bpf/test_stub.c b/kernel/bpf/test_stub.c
deleted file mode 100644
index 0ceae1e6e8b5..000000000000
--- a/kernel/bpf/test_stub.c
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- */
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/err.h>
-#include <linux/bpf.h>
-
-/* test stubs for BPF_MAP_TYPE_UNSPEC and for BPF_PROG_TYPE_UNSPEC
- * to be used by user space verifier testsuite
- */
-struct bpf_context {
-	u64 arg1;
-	u64 arg2;
-};
-
-static const struct bpf_func_proto *test_func_proto(enum bpf_func_id func_id)
-{
-	switch (func_id) {
-	case BPF_FUNC_map_lookup_elem:
-		return &bpf_map_lookup_elem_proto;
-	case BPF_FUNC_map_update_elem:
-		return &bpf_map_update_elem_proto;
-	case BPF_FUNC_map_delete_elem:
-		return &bpf_map_delete_elem_proto;
-	default:
-		return NULL;
-	}
-}
-
-static const struct bpf_context_access {
-	int size;
-	enum bpf_access_type type;
-} test_ctx_access[] = {
-	[offsetof(struct bpf_context, arg1)] = {
-		FIELD_SIZEOF(struct bpf_context, arg1),
-		BPF_READ
-	},
-	[offsetof(struct bpf_context, arg2)] = {
-		FIELD_SIZEOF(struct bpf_context, arg2),
-		BPF_READ
-	},
-};
-
-static bool test_is_valid_access(int off, int size, enum bpf_access_type type)
-{
-	const struct bpf_context_access *access;
-
-	if (off < 0 || off >= ARRAY_SIZE(test_ctx_access))
-		return false;
-
-	access = &test_ctx_access[off];
-	if (access->size == size && (access->type & type))
-		return true;
-
-	return false;
-}
-
-static struct bpf_verifier_ops test_ops = {
-	.get_func_proto = test_func_proto,
-	.is_valid_access = test_is_valid_access,
-};
-
-static struct bpf_prog_type_list tl_prog = {
-	.ops = &test_ops,
-	.type = BPF_PROG_TYPE_UNSPEC,
-};
-
-static int __init register_test_ops(void)
-{
-	bpf_register_prog_type(&tl_prog);
-	return 0;
-}
-late_initcall(register_test_ops);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a28e09c7825d..47dcd3aa6e23 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -755,7 +755,7 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
 	enum bpf_reg_type expected_type;
 	int err = 0;
 
-	if (arg_type == ARG_ANYTHING)
+	if (arg_type == ARG_DONTCARE)
 		return 0;
 
 	if (reg->type == NOT_INIT) {
@@ -763,6 +763,9 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
 		return -EACCES;
 	}
 
+	if (arg_type == ARG_ANYTHING)
+		return 0;
+
 	if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY ||
 	    arg_type == ARG_PTR_TO_MAP_VALUE) {
 		expected_type = PTR_TO_STACK;
@@ -770,6 +773,8 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
 		expected_type = CONST_IMM;
 	} else if (arg_type == ARG_CONST_MAP_PTR) {
 		expected_type = CONST_PTR_TO_MAP;
+	} else if (arg_type == ARG_PTR_TO_CTX) {
+		expected_type = PTR_TO_CTX;
 	} else {
 		verbose("unsupported arg_type %d\n", arg_type);
 		return -EFAULT;
@@ -852,7 +857,7 @@ static int check_call(struct verifier_env *env, int func_id)
 	}
 
 	/* eBPF programs must be GPL compatible to use GPL-ed functions */
-	if (!env->prog->aux->is_gpl_compatible && fn->gpl_only) {
+	if (!env->prog->gpl_compatible && fn->gpl_only) {
 		verbose("cannot call GPL only function from proprietary program\n");
 		return -EINVAL;
 	}
@@ -1172,6 +1177,18 @@ static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn)
 	return 0;
 }
 
+static bool may_access_skb(enum bpf_prog_type type)
+{
+	switch (type) {
+	case BPF_PROG_TYPE_SOCKET_FILTER:
+	case BPF_PROG_TYPE_SCHED_CLS:
+	case BPF_PROG_TYPE_SCHED_ACT:
+		return true;
+	default:
+		return false;
+	}
+}
+
 /* verify safety of LD_ABS|LD_IND instructions:
  * - they can only appear in the programs where ctx == skb
  * - since they are wrappers of function calls, they scratch R1-R5 registers,
@@ -1194,8 +1211,8 @@ static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn)
 	struct reg_state *reg;
 	int i, err;
 
-	if (env->prog->aux->prog_type != BPF_PROG_TYPE_SOCKET_FILTER) {
-		verbose("BPF_LD_ABS|IND instructions are only allowed in socket filters\n");
+	if (!may_access_skb(env->prog->type)) {
+		verbose("BPF_LD_ABS|IND instructions not allowed for this program type\n");
 		return -EINVAL;
 	}
 
@@ -1380,7 +1397,8 @@ peek_stack:
 		/* tell verifier to check for equivalent states
 		 * after every call and jump
 		 */
-		env->explored_states[t + 1] = STATE_LIST_MARK;
+		if (t + 1 < insn_cnt)
+			env->explored_states[t + 1] = STATE_LIST_MARK;
 	} else {
 		/* conditional jump with two edges */
 		ret = push_insn(t, t + 1, FALLTHROUGH, env);
@@ -1606,11 +1624,10 @@ static int do_check(struct verifier_env *env)
 			return err;
 
 		} else if (class == BPF_LDX) {
-			if (BPF_MODE(insn->code) != BPF_MEM ||
-			    insn->imm != 0) {
-				verbose("BPF_LDX uses reserved fields\n");
-				return -EINVAL;
-			}
+			enum bpf_reg_type src_reg_type;
+
+			/* check for reserved fields is already done */
+
 			/* check src operand */
 			err = check_reg_arg(regs, insn->src_reg, SRC_OP);
 			if (err)
@@ -1620,6 +1637,8 @@ static int do_check(struct verifier_env *env)
 			if (err)
 				return err;
 
+			src_reg_type = regs[insn->src_reg].type;
+
 			/* check that memory (src_reg + off) is readable,
 			 * the state of dst_reg will be updated by this func
 			 */
@@ -1629,6 +1648,32 @@ static int do_check(struct verifier_env *env)
 			if (err)
 				return err;
 
+			if (BPF_SIZE(insn->code) != BPF_W) {
+				insn_idx++;
+				continue;
+			}
+
+			if (insn->imm == 0) {
+				/* saw a valid insn
+				 * dst_reg = *(u32 *)(src_reg + off)
+				 * use reserved 'imm' field to mark this insn
+				 */
+				insn->imm = src_reg_type;
+
+			} else if (src_reg_type != insn->imm &&
+				   (src_reg_type == PTR_TO_CTX ||
+				    insn->imm == PTR_TO_CTX)) {
+				/* ABuser program is trying to use the same insn
+				 * dst_reg = *(u32*) (src_reg + off)
+				 * with different pointer types:
+				 * src_reg == ctx in one branch and
+				 * src_reg == stack|map in some other branch.
+				 * Reject it.
+				 */
+				verbose("same insn cannot be used with different pointers\n");
+				return -EINVAL;
+			}
+
 		} else if (class == BPF_STX) {
 			if (BPF_MODE(insn->code) == BPF_XADD) {
 				err = check_xadd(env, insn);
@@ -1776,6 +1821,13 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env)
 	int i, j;
 
 	for (i = 0; i < insn_cnt; i++, insn++) {
+		if (BPF_CLASS(insn->code) == BPF_LDX &&
+		    (BPF_MODE(insn->code) != BPF_MEM ||
+		     insn->imm != 0)) {
+			verbose("BPF_LDX uses reserved fields\n");
+			return -EINVAL;
+		}
+
 		if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) {
 			struct bpf_map *map;
 			struct fd f;
@@ -1867,6 +1919,92 @@ static void convert_pseudo_ld_imm64(struct verifier_env *env)
 	insn->src_reg = 0;
 }
 
+static void adjust_branches(struct bpf_prog *prog, int pos, int delta)
+{
+	struct bpf_insn *insn = prog->insnsi;
+	int insn_cnt = prog->len;
+	int i;
+
+	for (i = 0; i < insn_cnt; i++, insn++) {
+		if (BPF_CLASS(insn->code) != BPF_JMP ||
+		    BPF_OP(insn->code) == BPF_CALL ||
+		    BPF_OP(insn->code) == BPF_EXIT)
+			continue;
+
+		/* adjust offset of jmps if necessary */
+		if (i < pos && i + insn->off + 1 > pos)
+			insn->off += delta;
+		else if (i > pos && i + insn->off + 1 < pos)
+			insn->off -= delta;
+	}
+}
+
+/* convert load instructions that access fields of 'struct __sk_buff'
+ * into sequence of instructions that access fields of 'struct sk_buff'
+ */
+static int convert_ctx_accesses(struct verifier_env *env)
+{
+	struct bpf_insn *insn = env->prog->insnsi;
+	int insn_cnt = env->prog->len;
+	struct bpf_insn insn_buf[16];
+	struct bpf_prog *new_prog;
+	u32 cnt;
+	int i;
+
+	if (!env->prog->aux->ops->convert_ctx_access)
+		return 0;
+
+	for (i = 0; i < insn_cnt; i++, insn++) {
+		if (insn->code != (BPF_LDX | BPF_MEM | BPF_W))
+			continue;
+
+		if (insn->imm != PTR_TO_CTX) {
+			/* clear internal mark */
+			insn->imm = 0;
+			continue;
+		}
+
+		cnt = env->prog->aux->ops->
+			convert_ctx_access(insn->dst_reg, insn->src_reg,
+					   insn->off, insn_buf);
+		if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
+			verbose("bpf verifier is misconfigured\n");
+			return -EINVAL;
+		}
+
+		if (cnt == 1) {
+			memcpy(insn, insn_buf, sizeof(*insn));
+			continue;
+		}
+
+		/* several new insns need to be inserted. Make room for them */
+		insn_cnt += cnt - 1;
+		new_prog = bpf_prog_realloc(env->prog,
+					    bpf_prog_size(insn_cnt),
+					    GFP_USER);
+		if (!new_prog)
+			return -ENOMEM;
+
+		new_prog->len = insn_cnt;
+
+		memmove(new_prog->insnsi + i + cnt, new_prog->insns + i + 1,
+			sizeof(*insn) * (insn_cnt - i - cnt));
+
+		/* copy substitute insns in place of load instruction */
+		memcpy(new_prog->insnsi + i, insn_buf, sizeof(*insn) * cnt);
+
+		/* adjust branches in the whole program */
+		adjust_branches(new_prog, i, cnt - 1);
+
+		/* keep walking new program and skip insns we just inserted */
+		env->prog = new_prog;
+		insn = new_prog->insnsi + i + cnt - 1;
+		i += cnt - 1;
+	}
+
+	return 0;
+}
+
 static void free_states(struct verifier_env *env)
 {
 	struct verifier_state_list *sl, *sln;
@@ -1889,13 +2027,13 @@ static void free_states(struct verifier_env *env)
 	kfree(env->explored_states);
 }
 
-int bpf_check(struct bpf_prog *prog, union bpf_attr *attr)
+int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
 {
 	char __user *log_ubuf = NULL;
 	struct verifier_env *env;
 	int ret = -EINVAL;
 
-	if (prog->len <= 0 || prog->len > BPF_MAXINSNS)
+	if ((*prog)->len <= 0 || (*prog)->len > BPF_MAXINSNS)
 		return -E2BIG;
 
 	/* 'struct verifier_env' can be global, but since it's not small,
@@ -1905,7 +2043,7 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr)
 	if (!env)
 		return -ENOMEM;
 
-	env->prog = prog;
+	env->prog = *prog;
 
 	/* grab the mutex to protect few globals used by verifier */
 	mutex_lock(&bpf_verifier_lock);
@@ -1937,7 +2075,7 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr)
 	if (ret < 0)
 		goto skip_full_check;
 
-	env->explored_states = kcalloc(prog->len,
+	env->explored_states = kcalloc(env->prog->len,
 				       sizeof(struct verifier_state_list *),
 				       GFP_USER);
 	ret = -ENOMEM;
@@ -1954,6 +2092,10 @@ skip_full_check:
 	while (pop_stack(env, NULL) >= 0);
 	free_states(env);
 
+	if (ret == 0)
+		/* program is valid, convert *(u32*)(ctx + off) accesses */
+		ret = convert_ctx_accesses(env);
+
 	if (log_level && log_len >= log_size - 1) {
 		BUG_ON(log_len >= log_size);
 		/* verifier log exceeded user supplied buffer */
@@ -1969,18 +2111,18 @@ skip_full_check:
 
 	if (ret == 0 && env->used_map_cnt) {
 		/* if program passed verifier, update used_maps in bpf_prog_info */
-		prog->aux->used_maps = kmalloc_array(env->used_map_cnt,
-						     sizeof(env->used_maps[0]),
-						     GFP_KERNEL);
+		env->prog->aux->used_maps = kmalloc_array(env->used_map_cnt,
+							  sizeof(env->used_maps[0]),
+							  GFP_KERNEL);
 
-		if (!prog->aux->used_maps) {
+		if (!env->prog->aux->used_maps) {
 			ret = -ENOMEM;
 			goto free_log_buf;
 		}
 
-		memcpy(prog->aux->used_maps, env->used_maps,
-		       sizeof(env->used_maps[0]) * env->used_map_cnt);
-		prog->aux->used_map_cnt = env->used_map_cnt;
+		memcpy(env->prog->aux->used_maps, env->used_maps,
+		       sizeof(env->used_maps[0]) * env->used_map_cnt);
+		env->prog->aux->used_map_cnt = env->used_map_cnt;
 
 		/* program is valid. Convert pseudo bpf_ld_imm64 into generic
 		 * bpf_ld_imm64 instructions
@@ -1992,11 +2134,12 @@ free_log_buf:
 	if (log_level)
 		vfree(log_buf);
 free_env:
-	if (!prog->aux->used_maps)
+	if (!env->prog->aux->used_maps)
 		/* if we didn't copy map pointers into bpf_prog_info, release
 		 * them now. Otherwise free_bpf_prog_info() will release them.
 		 */
 		release_maps(env);
+	*prog = env->prog;
 	kfree(env);
 	mutex_unlock(&bpf_verifier_lock);
 	return ret;
diff --git a/kernel/capability.c b/kernel/capability.c
index 989f5bfc57dc..45432b54d5c6 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -35,6 +35,7 @@ static int __init file_caps_disable(char *str)
 }
 __setup("no_file_caps", file_caps_disable);
 
+#ifdef CONFIG_MULTIUSER
 /*
  * More recent versions of libcap are available from:
  *
@@ -386,6 +387,24 @@ bool ns_capable(struct user_namespace *ns, int cap)
 }
 EXPORT_SYMBOL(ns_capable);
 
+
+/**
+ * capable - Determine if the current task has a superior capability in effect
+ * @cap: The capability to be tested for
+ *
+ * Return true if the current task has the given superior capability currently
+ * available for use, false if not.
+ *
+ * This sets PF_SUPERPRIV on the task if the capability is available on the
+ * assumption that it's about to be used.
+ */
+bool capable(int cap)
+{
+	return ns_capable(&init_user_ns, cap);
+}
+EXPORT_SYMBOL(capable);
+#endif /* CONFIG_MULTIUSER */
+
 /**
  * file_ns_capable - Determine if the file's opener had a capability in effect
  * @file: The file we want to check
@@ -412,22 +431,6 @@ bool file_ns_capable(const struct file *file, struct user_namespace *ns,
 EXPORT_SYMBOL(file_ns_capable);
 
 /**
- * capable - Determine if the current task has a superior capability in effect
- * @cap: The capability to be tested for
- *
- * Return true if the current task has the given superior capability currently
- * available for use, false if not.
- *
- * This sets PF_SUPERPRIV on the task if the capability is available on the
- * assumption that it's about to be used.
- */
-bool capable(int cap)
-{
-	return ns_capable(&init_user_ns, cap);
-}
-EXPORT_SYMBOL(capable);
-
-/**
  * capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped
  * @inode: The inode in question
  * @cap: The capability in question
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 29a7b2cc593e..469dd547770c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3806,10 +3806,7 @@ static void *pidlist_allocate(int count)
 
 static void pidlist_free(void *p)
 {
-	if (is_vmalloc_addr(p))
-		vfree(p);
-	else
-		kfree(p);
+	kvfree(p);
 }
 
 /*
@@ -4199,7 +4196,9 @@ static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
 
 static int cgroup_pidlist_show(struct seq_file *s, void *v)
 {
-	return seq_printf(s, "%d\n", *(int *)v);
+	seq_printf(s, "%d\n", *(int *)v);
+
+	return 0;
 }
 
 static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
@@ -5040,6 +5039,9 @@ int __init cgroup_init(void)
 			WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes));
 			WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes));
 		}
+
+		if (ss->bind)
+			ss->bind(init_css_set.subsys[ssid]);
 	}
 
 	cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
@@ -5451,7 +5453,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
 struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
 {
 	WARN_ON_ONCE(!rcu_read_lock_held());
-	return idr_find(&ss->css_idr, id);
+	return id > 0 ? idr_find(&ss->css_idr, id) : NULL;
 }
 
 #ifdef CONFIG_CGROUP_DEBUG
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 937ecdfdf258..72d59a1a6eb6 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -39,15 +39,15 @@ void context_tracking_cpu_set(int cpu)
 }
 
 /**
- * context_tracking_user_enter - Inform the context tracking that the CPU is going to
- * enter userspace mode.
+ * context_tracking_enter - Inform the context tracking that the CPU is going
+ * enter user or guest space mode.
  *
  * This function must be called right before we switch from the kernel
- * to userspace, when it's guaranteed the remaining kernel instructions
- * to execute won't use any RCU read side critical section because this
- * function sets RCU in extended quiescent state.
+ * to user or guest space, when it's guaranteed the remaining kernel
+ * instructions to execute won't use any RCU read side critical section
+ * because this function sets RCU in extended quiescent state.
  */
-void context_tracking_user_enter(void)
+void context_tracking_enter(enum ctx_state state)
 {
 	unsigned long flags;
 
@@ -75,9 +75,8 @@ void context_tracking_user_enter(void)
 	WARN_ON_ONCE(!current->mm);
 
 	local_irq_save(flags);
-	if ( __this_cpu_read(context_tracking.state) != IN_USER) {
+	if ( __this_cpu_read(context_tracking.state) != state) {
 		if (__this_cpu_read(context_tracking.active)) {
-			trace_user_enter(0);
 			/*
 			 * At this stage, only low level arch entry code remains and
 			 * then we'll run in userspace. We can assume there won't be
@@ -85,7 +84,10 @@ void context_tracking_user_enter(void)
 			 * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency
 			 * on the tick.
 			 */
-			vtime_user_enter(current);
+			if (state == CONTEXT_USER) {
+				trace_user_enter(0);
+				vtime_user_enter(current);
+			}
 			rcu_user_enter();
 		}
 		/*
@@ -101,24 +103,32 @@ void context_tracking_user_enter(void)
 		 * OTOH we can spare the calls to vtime and RCU when context_tracking.active
 		 * is false because we know that CPU is not tickless.
 		 */
-		__this_cpu_write(context_tracking.state, IN_USER);
+		__this_cpu_write(context_tracking.state, state);
 	}
 	local_irq_restore(flags);
 }
+NOKPROBE_SYMBOL(context_tracking_enter);
+EXPORT_SYMBOL_GPL(context_tracking_enter);
+
+void context_tracking_user_enter(void)
+{
+	context_tracking_enter(CONTEXT_USER);
+}
 NOKPROBE_SYMBOL(context_tracking_user_enter);
 
 /**
- * context_tracking_user_exit - Inform the context tracking that the CPU is
- * exiting userspace mode and entering the kernel.
+ * context_tracking_exit - Inform the context tracking that the CPU is
+ * exiting user or guest mode and entering the kernel.
  *
- * This function must be called after we entered the kernel from userspace
- * before any use of RCU read side critical section. This potentially include
- * any high level kernel code like syscalls, exceptions, signal handling, etc...
+ * This function must be called after we entered the kernel from user or
+ * guest space before any use of RCU read side critical section. This
+ * potentially include any high level kernel code like syscalls, exceptions,
+ * signal handling, etc...
  *
 * This call supports re-entrancy. This way it can be called from any exception
 * handler without needing to know if we came from userspace or not.
 */
-void context_tracking_user_exit(void)
+void context_tracking_exit(enum ctx_state state)
 {
 	unsigned long flags;
 
@@ -129,20 +139,29 @@ void context_tracking_user_exit(void)
 		return;
 
 	local_irq_save(flags);
-	if (__this_cpu_read(context_tracking.state) == IN_USER) {
+	if (__this_cpu_read(context_tracking.state) == state) {
 		if (__this_cpu_read(context_tracking.active)) {
 			/*
 			 * We are going to run code that may use RCU. Inform
 			 * RCU core about that (ie: we may need the tick again).
 			 */
 			rcu_user_exit();
-			vtime_user_exit(current);
-			trace_user_exit(0);
+			if (state == CONTEXT_USER) {
+				vtime_user_exit(current);
+				trace_user_exit(0);
+			}
 		}
-		__this_cpu_write(context_tracking.state, IN_KERNEL);
+		__this_cpu_write(context_tracking.state, CONTEXT_KERNEL);
 	}
 	local_irq_restore(flags);
 }
+NOKPROBE_SYMBOL(context_tracking_exit);
+EXPORT_SYMBOL_GPL(context_tracking_exit);
+
+void context_tracking_user_exit(void)
+{
+	context_tracking_exit(CONTEXT_USER);
+}
 NOKPROBE_SYMBOL(context_tracking_user_exit);
 
 /**
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 1972b161c61e..94bbe4695232 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -20,6 +20,7 @@
 #include <linux/gfp.h>
 #include <linux/suspend.h>
 #include <linux/lockdep.h>
+#include <linux/tick.h>
 #include <trace/events/power.h>
 
 #include "smpboot.h"
@@ -338,6 +339,8 @@ static int __ref take_cpu_down(void *_param)
 		return err;
 
 	cpu_notify(CPU_DYING | param->mod, param->hcpu);
+	/* Give up timekeeping duties */
+	tick_handover_do_timer();
 	/* Park the stopper thread */
 	kthread_park(current);
 	return 0;
@@ -408,13 +411,17 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 	 *
 	 * Wait for the stop thread to go away.
 	 */
-	while (!idle_cpu(cpu))
+	while (!per_cpu(cpu_dead_idle, cpu))
 		cpu_relax();
+	smp_mb(); /* Read from cpu_dead_idle before __cpu_die(). */
+	per_cpu(cpu_dead_idle, cpu) = false;
 
+	hotplug_cpu__broadcast_tick_pull(cpu);
 	/* This actually kills the CPU. */
 	__cpu_die(cpu);
 
 	/* CPU is completely dead: tell everyone. Too late to complain. */
+	tick_cleanup_dead_cpu(cpu);
 	cpu_notify_nofail(CPU_DEAD | mod, hcpu);
 
 	check_for_tasks(cpu);
@@ -446,6 +453,37 @@ out:
 EXPORT_SYMBOL(cpu_down);
 #endif /*CONFIG_HOTPLUG_CPU*/
 
+/*
+ * Unpark per-CPU smpboot kthreads at CPU-online time.
+ */
+static int smpboot_thread_call(struct notifier_block *nfb,
+			       unsigned long action, void *hcpu)
+{
+	int cpu = (long)hcpu;
+
+	switch (action & ~CPU_TASKS_FROZEN) {
+
+	case CPU_ONLINE:
+		smpboot_unpark_threads(cpu);
+		break;
+
+	default:
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block smpboot_thread_notifier = {
+	.notifier_call = smpboot_thread_call,
+	.priority = CPU_PRI_SMPBOOT,
+};
+
+void __cpuinit smpboot_thread_init(void)
+{
+	register_cpu_notifier(&smpboot_thread_notifier);
+}
+
 /* Requires cpu_add_remove_lock to be held */
 static int _cpu_up(unsigned int cpu, int tasks_frozen)
 {
@@ -485,9 +523,6 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen)
 		goto out_notify;
 	BUG_ON(!cpu_online(cpu));
 
-	/* Wake the per cpu threads */
-	smpboot_unpark_threads(cpu);
-
 	/* Now call notifier in preparation. */
 	cpu_notify(CPU_ONLINE | mod, hcpu);
 
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index fc7f4748d34a..ee14e3a35a29 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -622,6 +622,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
 	int csn;		/* how many cpuset ptrs in csa so far */
 	int i, j, k;		/* indices for partition finding loops */
 	cpumask_var_t *doms;	/* resulting partition; i.e. sched domains */
+	cpumask_var_t non_isolated_cpus;  /* load balanced CPUs */
 	struct sched_domain_attr *dattr;  /* attributes for custom domains */
 	int ndoms = 0;		/* number of sched domains in result */
 	int nslot;		/* next empty doms[] struct cpumask slot */
@@ -631,6 +632,10 @@ static int generate_sched_domains(cpumask_var_t **domains,
 	dattr = NULL;
 	csa = NULL;
 
+	if (!alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL))
+		goto done;
+	cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
+
 	/* Special case for the 99% of systems with one, full, sched domain */
 	if (is_sched_load_balance(&top_cpuset)) {
 		ndoms = 1;
@@ -643,7 +648,8 @@ static int generate_sched_domains(cpumask_var_t **domains,
 			*dattr = SD_ATTR_INIT;
 			update_domain_attr_tree(dattr, &top_cpuset);
 		}
-		cpumask_copy(doms[0], top_cpuset.effective_cpus);
+		cpumask_and(doms[0], top_cpuset.effective_cpus,
+			    non_isolated_cpus);
 
 		goto done;
 	}
@@ -666,7 +672,8 @@ static int generate_sched_domains(cpumask_var_t **domains,
 		 * the corresponding sched domain.
 		 */
 		if (!cpumask_empty(cp->cpus_allowed) &&
-		    !is_sched_load_balance(cp))
+		    !(is_sched_load_balance(cp) &&
+		      cpumask_intersects(cp->cpus_allowed, non_isolated_cpus)))
 			continue;
 
 		if (is_sched_load_balance(cp))
@@ -748,6 +755,7 @@ restart:
 
 			if (apn == b->pn) {
 				cpumask_or(dp, dp, b->effective_cpus);
+				cpumask_and(dp, dp, non_isolated_cpus);
 				if (dattr)
 					update_domain_attr_tree(dattr + nslot, b);
 
@@ -760,6 +768,7 @@ restart:
 	BUG_ON(nslot != ndoms);
 
 done:
+	free_cpumask_var(non_isolated_cpus);
 	kfree(csa);
 
 	/*
@@ -2444,20 +2453,12 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
  * @node: is this an allowed node?
  * @gfp_mask: memory allocation flags
  *
- * If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is
- * set, yes, we can always allocate. If node is in our task's mems_allowed,
- * yes. If it's not a __GFP_HARDWALL request and this node is in the nearest
- * hardwalled cpuset ancestor to this task's cpuset, yes. If the task has been
- * OOM killed and has access to memory reserves as specified by the TIF_MEMDIE
- * flag, yes.
+ * If we're in interrupt, yes, we can always allocate. If @node is set in
+ * current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this
+ * node is set in the nearest hardwalled cpuset ancestor to current's cpuset,
+ * yes. If current has access to memory reserves due to TIF_MEMDIE, yes.
  * Otherwise, no.
  *
- * The __GFP_THISNODE placement logic is really handled elsewhere,
- * by forcibly using a zonelist starting at a specified node, and by
- * (in get_page_from_freelist()) refusing to consider the zones for
- * any node on the zonelist except the first. By the time any such
- * calls get to this routine, we should just shut up and say 'yes'.
- *
  * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
  * and do not allow allocations outside the current tasks cpuset
  * unless the task has been OOM killed as is marked TIF_MEMDIE.
@@ -2493,7 +2494,7 @@ int __cpuset_node_allowed(int node, gfp_t gfp_mask)
 	int allowed;		/* is allocation in zone z allowed? */
 	unsigned long flags;
 
-	if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
+	if (in_interrupt())
 		return 1;
 	if (node_isset(node, current->mems_allowed))
 		return 1;
diff --git a/kernel/cred.c b/kernel/cred.c
index e0573a43c7df..ec1c07667ec1 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -29,6 +29,9 @@
 
 static struct kmem_cache *cred_jar;
 
+/* init to 2 - one for init_task, one to ensure it is never freed */
+struct group_info init_groups = { .usage = ATOMIC_INIT(2) };
+
 /*
  * The initial credentials for the initial task
  */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 453ef61311d4..81aa3a4ece9f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -34,14 +34,16 @@
 #include <linux/syscalls.h>
 #include <linux/anon_inodes.h>
 #include <linux/kernel_stat.h>
+#include <linux/cgroup.h>
 #include <linux/perf_event.h>
 #include <linux/ftrace_event.h>
 #include <linux/hw_breakpoint.h>
 #include <linux/mm_types.h>
-#include <linux/cgroup.h>
 #include <linux/module.h>
 #include <linux/mman.h>
 #include <linux/compat.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
 
 #include "internal.h"
 
@@ -153,7 +155,7 @@ enum event_type_t {
  */
 struct static_key_deferred perf_sched_events __read_mostly;
 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
-static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events);
+static DEFINE_PER_CPU(int, perf_sched_cb_usages);
 
 static atomic_t nr_mmap_events __read_mostly;
 static atomic_t nr_comm_events __read_mostly;
@@ -327,6 +329,11 @@ static inline u64 perf_clock(void)
 	return local_clock();
 }
 
+static inline u64 perf_event_clock(struct perf_event *event)
+{
+	return event->clock();
+}
+
 static inline struct perf_cpu_context *
 __get_cpu_context(struct perf_event_context *ctx)
 {
@@ -351,32 +358,6 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
 
 #ifdef CONFIG_CGROUP_PERF
 
-/*
- * perf_cgroup_info keeps track of time_enabled for a cgroup.
- * This is a per-cpu dynamically allocated data structure.
- */
-struct perf_cgroup_info {
-	u64 time;
-	u64 timestamp;
-};
-
-struct perf_cgroup {
-	struct cgroup_subsys_state css;
-	struct perf_cgroup_info __percpu *info;
-};
-
-/*
- * Must ensure cgroup is pinned (css_get) before calling
- * this function. In other words, we cannot call this function
- * if there is no cgroup event for the current CPU context.
- */
-static inline struct perf_cgroup *
-perf_cgroup_from_task(struct task_struct *task)
-{
-	return container_of(task_css(task, perf_event_cgrp_id),
-			    struct perf_cgroup, css);
-}
-
 static inline bool
 perf_cgroup_match(struct perf_event *event)
 {
@@ -905,6 +886,15 @@ static void get_ctx(struct perf_event_context *ctx)
 	WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
 }
 
+static void free_ctx(struct rcu_head *head)
+{
+	struct perf_event_context *ctx;
+
+	ctx = container_of(head, struct perf_event_context, rcu_head);
+	kfree(ctx->task_ctx_data);
+	kfree(ctx);
+}
+
 static void put_ctx(struct perf_event_context *ctx)
 {
 	if (atomic_dec_and_test(&ctx->refcount)) {
@@ -912,7 +902,7 @@ static void put_ctx(struct perf_event_context *ctx)
 			put_ctx(ctx->parent_ctx);
 		if (ctx->task)
 			put_task_struct(ctx->task);
-		kfree_rcu(ctx, rcu_head);
+		call_rcu(&ctx->rcu_head, free_ctx);
 	}
 }
 
@@ -1239,9 +1229,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 	if (is_cgroup_event(event))
 		ctx->nr_cgroups++;
 
-	if (has_branch_stack(event))
-		ctx->nr_branch_stack++;
-
 	list_add_rcu(&event->event_entry, &ctx->event_list);
 	ctx->nr_events++;
 	if (event->attr.inherit_stat)
@@ -1408,9 +1395,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 			cpuctx->cgrp = NULL;
 	}
 
-	if (has_branch_stack(event))
-		ctx->nr_branch_stack--;
-
 	ctx->nr_events--;
 	if (event->attr.inherit_stat)
 		ctx->nr_stat--;
@@ -1847,6 +1831,7 @@ static void perf_set_shadow_time(struct perf_event *event,
 #define MAX_INTERRUPTS (~0ULL)
 
 static void perf_log_throttle(struct perf_event *event, int enable);
+static void perf_log_itrace_start(struct perf_event *event);
 
 static int
 event_sched_in(struct perf_event *event,
@@ -1881,6 +1866,12 @@ event_sched_in(struct perf_event *event,
 
 	perf_pmu_disable(event->pmu);
 
+	event->tstamp_running += tstamp - event->tstamp_stopped;
+
+	perf_set_shadow_time(event, ctx, tstamp);
+
+	perf_log_itrace_start(event);
+
 	if (event->pmu->add(event, PERF_EF_START)) {
 		event->state = PERF_EVENT_STATE_INACTIVE;
 		event->oncpu = -1;
@@ -1888,10 +1879,6 @@ event_sched_in(struct perf_event *event,
 		goto out;
 	}
 
-	event->tstamp_running += tstamp - event->tstamp_stopped;
-
-	perf_set_shadow_time(event, ctx, tstamp);
-
 	if (!is_software_event(event))
 		cpuctx->active_oncpu++;
 	if (!ctx->nr_active++)
@@ -2559,6 +2546,9 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
2559 next->perf_event_ctxp[ctxn] = ctx; 2546 next->perf_event_ctxp[ctxn] = ctx;
2560 ctx->task = next; 2547 ctx->task = next;
2561 next_ctx->task = task; 2548 next_ctx->task = task;
2549
2550 swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
2551
2562 do_switch = 0; 2552 do_switch = 0;
2563 2553
2564 perf_event_sync_stat(ctx, next_ctx); 2554 perf_event_sync_stat(ctx, next_ctx);
@@ -2577,6 +2567,56 @@ unlock:
2577 } 2567 }
2578} 2568}
2579 2569
2570void perf_sched_cb_dec(struct pmu *pmu)
2571{
2572 this_cpu_dec(perf_sched_cb_usages);
2573}
2574
2575void perf_sched_cb_inc(struct pmu *pmu)
2576{
2577 this_cpu_inc(perf_sched_cb_usages);
2578}
2579
2580/*
2581 * This function provides the context switch callback to the lower code
2582 * layer. It is invoked ONLY when the context switch callback is enabled.
2583 */
2584static void perf_pmu_sched_task(struct task_struct *prev,
2585 struct task_struct *next,
2586 bool sched_in)
2587{
2588 struct perf_cpu_context *cpuctx;
2589 struct pmu *pmu;
2590 unsigned long flags;
2591
2592 if (prev == next)
2593 return;
2594
2595 local_irq_save(flags);
2596
2597 rcu_read_lock();
2598
2599 list_for_each_entry_rcu(pmu, &pmus, entry) {
2600 if (pmu->sched_task) {
2601 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2602
2603 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2604
2605 perf_pmu_disable(pmu);
2606
2607 pmu->sched_task(cpuctx->task_ctx, sched_in);
2608
2609 perf_pmu_enable(pmu);
2610
2611 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2612 }
2613 }
2614
2615 rcu_read_unlock();
2616
2617 local_irq_restore(flags);
2618}
2619
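The perf_sched_cb_inc()/perf_sched_cb_dec() pair and perf_pmu_sched_task() replace the branch-stack-specific flush removed further down with a generic mechanism: a per-CPU usage count gates the callback, so the PMU list is only walked on context switch when some event on this CPU asked for it. A rough sketch of the driver side (only perf_sched_cb_inc()/perf_sched_cb_dec() and the pmu::sched_task signature are taken from this patch; the mypmu_* helpers and predicate are invented for illustration):

    /* sketch: how a PMU driver would opt in to the context-switch callback */
    static void mypmu_sched_task(struct perf_event_context *ctx, bool sched_in)
    {
        /* runs with the context locked and the PMU disabled */
        if (sched_in)
            mypmu_restore_task_state(ctx);      /* hypothetical */
        else
            mypmu_flush_task_state(ctx);        /* hypothetical */
    }

    static int mypmu_add(struct perf_event *event, int flags)
    {
        if (needs_sched_cb(event))              /* hypothetical predicate */
            perf_sched_cb_inc(event->ctx->pmu);
        return mypmu_hw_add(event, flags);      /* hypothetical */
    }

    static void mypmu_del(struct perf_event *event, int flags)
    {
        mypmu_hw_del(event, flags);             /* hypothetical */
        if (needs_sched_cb(event))
            perf_sched_cb_dec(event->ctx->pmu);
    }

    static struct pmu mypmu = {
        .add        = mypmu_add,
        .del        = mypmu_del,
        .sched_task = mypmu_sched_task,
    };

The intended first user in this series is the x86 LBR call-stack code, which saves and restores LBR state from its sched_task callback instead of the old flush_branch_stack hook.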
2580#define for_each_task_context_nr(ctxn) \ 2620#define for_each_task_context_nr(ctxn) \
2581 for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++) 2621 for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
2582 2622
@@ -2596,6 +2636,9 @@ void __perf_event_task_sched_out(struct task_struct *task,
2596{ 2636{
2597 int ctxn; 2637 int ctxn;
2598 2638
2639 if (__this_cpu_read(perf_sched_cb_usages))
2640 perf_pmu_sched_task(task, next, false);
2641
2599 for_each_task_context_nr(ctxn) 2642 for_each_task_context_nr(ctxn)
2600 perf_event_context_sched_out(task, ctxn, next); 2643 perf_event_context_sched_out(task, ctxn, next);
2601 2644
@@ -2755,64 +2798,6 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
2755} 2798}
2756 2799
2757/* 2800/*
2758 * When sampling the branch stack in system-wide, it may be necessary
2759 * to flush the stack on context switch. This happens when the branch
2760 * stack does not tag its entries with the pid of the current task.
2761 * Otherwise it becomes impossible to associate a branch entry with a
2762 * task. This ambiguity is more likely to appear when the branch stack
2763 * supports priv level filtering and the user sets it to monitor only
2764 * at the user level (which could be a useful measurement in system-wide
2765 * mode). In that case, the risk is high of having a branch stack with
2766 * branch from multiple tasks. Flushing may mean dropping the existing
2767 * entries or stashing them somewhere in the PMU specific code layer.
2768 *
2769 * This function provides the context switch callback to the lower code
2770 * layer. It is invoked ONLY when there is at least one system-wide context
2771 * with at least one active event using taken branch sampling.
2772 */
2773static void perf_branch_stack_sched_in(struct task_struct *prev,
2774 struct task_struct *task)
2775{
2776 struct perf_cpu_context *cpuctx;
2777 struct pmu *pmu;
2778 unsigned long flags;
2779
2780 /* no need to flush branch stack if not changing task */
2781 if (prev == task)
2782 return;
2783
2784 local_irq_save(flags);
2785
2786 rcu_read_lock();
2787
2788 list_for_each_entry_rcu(pmu, &pmus, entry) {
2789 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2790
2791 /*
2792 * check if the context has at least one
2793 * event using PERF_SAMPLE_BRANCH_STACK
2794 */
2795 if (cpuctx->ctx.nr_branch_stack > 0
2796 && pmu->flush_branch_stack) {
2797
2798 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2799
2800 perf_pmu_disable(pmu);
2801
2802 pmu->flush_branch_stack();
2803
2804 perf_pmu_enable(pmu);
2805
2806 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2807 }
2808 }
2809
2810 rcu_read_unlock();
2811
2812 local_irq_restore(flags);
2813}
2814
2815/*
2816 * Called from scheduler to add the events of the current task 2801 * Called from scheduler to add the events of the current task
2817 * with interrupts disabled. 2802 * with interrupts disabled.
2818 * 2803 *
@@ -2844,9 +2829,8 @@ void __perf_event_task_sched_in(struct task_struct *prev,
2844 if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) 2829 if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
2845 perf_cgroup_sched_in(prev, task); 2830 perf_cgroup_sched_in(prev, task);
2846 2831
2847 /* check for system-wide branch_stack events */ 2832 if (__this_cpu_read(perf_sched_cb_usages))
2848 if (atomic_read(this_cpu_ptr(&perf_branch_stack_events))) 2833 perf_pmu_sched_task(prev, task, true);
2849 perf_branch_stack_sched_in(prev, task);
2850} 2834}
2851 2835
2852static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) 2836static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
@@ -3220,7 +3204,10 @@ static void __perf_event_read(void *info)
3220 3204
3221static inline u64 perf_event_count(struct perf_event *event) 3205static inline u64 perf_event_count(struct perf_event *event)
3222{ 3206{
3223 return local64_read(&event->count) + atomic64_read(&event->child_count); 3207 if (event->pmu->count)
3208 return event->pmu->count(event);
3209
3210 return __perf_event_count(event);
3224} 3211}
3225 3212
3226static u64 perf_event_read(struct perf_event *event) 3213static u64 perf_event_read(struct perf_event *event)
@@ -3321,12 +3308,15 @@ errout:
3321 * Returns a matching context with refcount and pincount. 3308 * Returns a matching context with refcount and pincount.
3322 */ 3309 */
3323static struct perf_event_context * 3310static struct perf_event_context *
3324find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) 3311find_get_context(struct pmu *pmu, struct task_struct *task,
3312 struct perf_event *event)
3325{ 3313{
3326 struct perf_event_context *ctx, *clone_ctx = NULL; 3314 struct perf_event_context *ctx, *clone_ctx = NULL;
3327 struct perf_cpu_context *cpuctx; 3315 struct perf_cpu_context *cpuctx;
3316 void *task_ctx_data = NULL;
3328 unsigned long flags; 3317 unsigned long flags;
3329 int ctxn, err; 3318 int ctxn, err;
3319 int cpu = event->cpu;
3330 3320
3331 if (!task) { 3321 if (!task) {
3332 /* Must be root to operate on a CPU event: */ 3322 /* Must be root to operate on a CPU event: */
@@ -3354,11 +3344,24 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
3354 if (ctxn < 0) 3344 if (ctxn < 0)
3355 goto errout; 3345 goto errout;
3356 3346
3347 if (event->attach_state & PERF_ATTACH_TASK_DATA) {
3348 task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL);
3349 if (!task_ctx_data) {
3350 err = -ENOMEM;
3351 goto errout;
3352 }
3353 }
3354
3357retry: 3355retry:
3358 ctx = perf_lock_task_context(task, ctxn, &flags); 3356 ctx = perf_lock_task_context(task, ctxn, &flags);
3359 if (ctx) { 3357 if (ctx) {
3360 clone_ctx = unclone_ctx(ctx); 3358 clone_ctx = unclone_ctx(ctx);
3361 ++ctx->pin_count; 3359 ++ctx->pin_count;
3360
3361 if (task_ctx_data && !ctx->task_ctx_data) {
3362 ctx->task_ctx_data = task_ctx_data;
3363 task_ctx_data = NULL;
3364 }
3362 raw_spin_unlock_irqrestore(&ctx->lock, flags); 3365 raw_spin_unlock_irqrestore(&ctx->lock, flags);
3363 3366
3364 if (clone_ctx) 3367 if (clone_ctx)
@@ -3369,6 +3372,11 @@ retry:
3369 if (!ctx) 3372 if (!ctx)
3370 goto errout; 3373 goto errout;
3371 3374
3375 if (task_ctx_data) {
3376 ctx->task_ctx_data = task_ctx_data;
3377 task_ctx_data = NULL;
3378 }
3379
3372 err = 0; 3380 err = 0;
3373 mutex_lock(&task->perf_event_mutex); 3381 mutex_lock(&task->perf_event_mutex);
3374 /* 3382 /*
@@ -3395,13 +3403,16 @@ retry:
3395 } 3403 }
3396 } 3404 }
3397 3405
3406 kfree(task_ctx_data);
3398 return ctx; 3407 return ctx;
3399 3408
3400errout: 3409errout:
3410 kfree(task_ctx_data);
3401 return ERR_PTR(err); 3411 return ERR_PTR(err);
3402} 3412}
3403 3413
3404static void perf_event_free_filter(struct perf_event *event); 3414static void perf_event_free_filter(struct perf_event *event);
3415static void perf_event_free_bpf_prog(struct perf_event *event);
3405 3416
3406static void free_event_rcu(struct rcu_head *head) 3417static void free_event_rcu(struct rcu_head *head)
3407{ 3418{
@@ -3411,10 +3422,10 @@ static void free_event_rcu(struct rcu_head *head)
3411 if (event->ns) 3422 if (event->ns)
3412 put_pid_ns(event->ns); 3423 put_pid_ns(event->ns);
3413 perf_event_free_filter(event); 3424 perf_event_free_filter(event);
3425 perf_event_free_bpf_prog(event);
3414 kfree(event); 3426 kfree(event);
3415} 3427}
3416 3428
3417static void ring_buffer_put(struct ring_buffer *rb);
3418static void ring_buffer_attach(struct perf_event *event, 3429static void ring_buffer_attach(struct perf_event *event,
3419 struct ring_buffer *rb); 3430 struct ring_buffer *rb);
3420 3431
@@ -3423,10 +3434,6 @@ static void unaccount_event_cpu(struct perf_event *event, int cpu)
3423 if (event->parent) 3434 if (event->parent)
3424 return; 3435 return;
3425 3436
3426 if (has_branch_stack(event)) {
3427 if (!(event->attach_state & PERF_ATTACH_TASK))
3428 atomic_dec(&per_cpu(perf_branch_stack_events, cpu));
3429 }
3430 if (is_cgroup_event(event)) 3437 if (is_cgroup_event(event))
3431 atomic_dec(&per_cpu(perf_cgroup_events, cpu)); 3438 atomic_dec(&per_cpu(perf_cgroup_events, cpu));
3432} 3439}
@@ -3454,6 +3461,91 @@ static void unaccount_event(struct perf_event *event)
3454 unaccount_event_cpu(event, event->cpu); 3461 unaccount_event_cpu(event, event->cpu);
3455} 3462}
3456 3463
3464/*
3465 * The following implement mutual exclusion of events on "exclusive" pmus
3466 * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
3467 * at a time, so we disallow creating events that might conflict, namely:
3468 *
3469 * 1) cpu-wide events in the presence of per-task events,
3470 * 2) per-task events in the presence of cpu-wide events,
3471 * 3) two matching events on the same context.
3472 *
3473 * The former two cases are handled in the allocation path (perf_event_alloc(),
3474 * __free_event()), the latter -- before the first perf_install_in_context().
3475 */
3476static int exclusive_event_init(struct perf_event *event)
3477{
3478 struct pmu *pmu = event->pmu;
3479
3480 if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
3481 return 0;
3482
3483 /*
3484 * Prevent co-existence of per-task and cpu-wide events on the
3485 * same exclusive pmu.
3486 *
3487 * Negative pmu::exclusive_cnt means there are cpu-wide
3488 * events on this "exclusive" pmu, positive means there are
3489 * per-task events.
3490 *
3491 * Since this is called in perf_event_alloc() path, event::ctx
3492 * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK
3493 * to mean "per-task event", because unlike other attach states it
3494 * never gets cleared.
3495 */
3496 if (event->attach_state & PERF_ATTACH_TASK) {
3497 if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
3498 return -EBUSY;
3499 } else {
3500 if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
3501 return -EBUSY;
3502 }
3503
3504 return 0;
3505}
3506
3507static void exclusive_event_destroy(struct perf_event *event)
3508{
3509 struct pmu *pmu = event->pmu;
3510
3511 if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
3512 return;
3513
3514 /* see comment in exclusive_event_init() */
3515 if (event->attach_state & PERF_ATTACH_TASK)
3516 atomic_dec(&pmu->exclusive_cnt);
3517 else
3518 atomic_inc(&pmu->exclusive_cnt);
3519}
3520
3521static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
3522{
3523 if ((e1->pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) &&
3524 (e1->cpu == e2->cpu ||
3525 e1->cpu == -1 ||
3526 e2->cpu == -1))
3527 return true;
3528 return false;
3529}
3530
3531/* Called under the same ctx::mutex as perf_install_in_context() */
3532static bool exclusive_event_installable(struct perf_event *event,
3533 struct perf_event_context *ctx)
3534{
3535 struct perf_event *iter_event;
3536 struct pmu *pmu = event->pmu;
3537
3538 if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
3539 return true;
3540
3541 list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
3542 if (exclusive_event_match(iter_event, event))
3543 return false;
3544 }
3545
3546 return true;
3547}
3548
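The block above packs two reference counts into one signed counter: per-task events may only ever push pmu->exclusive_cnt positive and cpu-wide events may only push it negative, so the two kinds can never be accounted on one exclusive PMU at the same time. A small userspace illustration of the same pattern, with a CAS loop standing in for the kernel's atomic_inc_unless_negative()/atomic_dec_unless_positive() (the harness and names are mine, not from the patch):

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static atomic_int exclusive_cnt;    /* > 0: per-task users, < 0: cpu-wide users */

    static bool inc_unless_negative(atomic_int *v)
    {
        int old = atomic_load(v);
        do {
            if (old < 0)
                return false;
        } while (!atomic_compare_exchange_weak(v, &old, old + 1));
        return true;
    }

    static bool dec_unless_positive(atomic_int *v)
    {
        int old = atomic_load(v);
        do {
            if (old > 0)
                return false;
        } while (!atomic_compare_exchange_weak(v, &old, old - 1));
        return true;
    }

    int main(void)
    {
        printf("per-task #1: %d\n", inc_unless_negative(&exclusive_cnt));  /* 1: accepted */
        printf("per-task #2: %d\n", inc_unless_negative(&exclusive_cnt));  /* 1: accepted */
        printf("cpu-wide:    %d\n", dec_unless_positive(&exclusive_cnt));  /* 0: refused */
        return 0;
    }

The refused third case is exactly what exclusive_event_init() turns into -EBUSY when a cpu-wide event is opened while per-task events already hold the PMU.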
3457static void __free_event(struct perf_event *event) 3549static void __free_event(struct perf_event *event)
3458{ 3550{
3459 if (!event->parent) { 3551 if (!event->parent) {
@@ -3467,8 +3559,10 @@ static void __free_event(struct perf_event *event)
3467 if (event->ctx) 3559 if (event->ctx)
3468 put_ctx(event->ctx); 3560 put_ctx(event->ctx);
3469 3561
3470 if (event->pmu) 3562 if (event->pmu) {
3563 exclusive_event_destroy(event);
3471 module_put(event->pmu->module); 3564 module_put(event->pmu->module);
3565 }
3472 3566
3473 call_rcu(&event->rcu_head, free_event_rcu); 3567 call_rcu(&event->rcu_head, free_event_rcu);
3474} 3568}
@@ -3927,6 +4021,7 @@ static inline int perf_fget_light(int fd, struct fd *p)
3927static int perf_event_set_output(struct perf_event *event, 4021static int perf_event_set_output(struct perf_event *event,
3928 struct perf_event *output_event); 4022 struct perf_event *output_event);
3929static int perf_event_set_filter(struct perf_event *event, void __user *arg); 4023static int perf_event_set_filter(struct perf_event *event, void __user *arg);
4024static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
3930 4025
3931static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg) 4026static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
3932{ 4027{
@@ -3980,6 +4075,9 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
3980 case PERF_EVENT_IOC_SET_FILTER: 4075 case PERF_EVENT_IOC_SET_FILTER:
3981 return perf_event_set_filter(event, (void __user *)arg); 4076 return perf_event_set_filter(event, (void __user *)arg);
3982 4077
4078 case PERF_EVENT_IOC_SET_BPF:
4079 return perf_event_set_bpf_prog(event, arg);
4080
3983 default: 4081 default:
3984 return -ENOTTY; 4082 return -ENOTTY;
3985 } 4083 }
@@ -4096,6 +4194,8 @@ static void perf_event_init_userpage(struct perf_event *event)
4096 /* Allow new userspace to detect that bit 0 is deprecated */ 4194 /* Allow new userspace to detect that bit 0 is deprecated */
4097 userpg->cap_bit0_is_deprecated = 1; 4195 userpg->cap_bit0_is_deprecated = 1;
4098 userpg->size = offsetof(struct perf_event_mmap_page, __reserved); 4196 userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
4197 userpg->data_offset = PAGE_SIZE;
4198 userpg->data_size = perf_data_size(rb);
4099 4199
4100unlock: 4200unlock:
4101 rcu_read_unlock(); 4201 rcu_read_unlock();
@@ -4263,7 +4363,7 @@ static void rb_free_rcu(struct rcu_head *rcu_head)
4263 rb_free(rb); 4363 rb_free(rb);
4264} 4364}
4265 4365
4266static struct ring_buffer *ring_buffer_get(struct perf_event *event) 4366struct ring_buffer *ring_buffer_get(struct perf_event *event)
4267{ 4367{
4268 struct ring_buffer *rb; 4368 struct ring_buffer *rb;
4269 4369
@@ -4278,7 +4378,7 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event)
4278 return rb; 4378 return rb;
4279} 4379}
4280 4380
4281static void ring_buffer_put(struct ring_buffer *rb) 4381void ring_buffer_put(struct ring_buffer *rb)
4282{ 4382{
4283 if (!atomic_dec_and_test(&rb->refcount)) 4383 if (!atomic_dec_and_test(&rb->refcount))
4284 return; 4384 return;
@@ -4295,6 +4395,9 @@ static void perf_mmap_open(struct vm_area_struct *vma)
4295 atomic_inc(&event->mmap_count); 4395 atomic_inc(&event->mmap_count);
4296 atomic_inc(&event->rb->mmap_count); 4396 atomic_inc(&event->rb->mmap_count);
4297 4397
4398 if (vma->vm_pgoff)
4399 atomic_inc(&event->rb->aux_mmap_count);
4400
4298 if (event->pmu->event_mapped) 4401 if (event->pmu->event_mapped)
4299 event->pmu->event_mapped(event); 4402 event->pmu->event_mapped(event);
4300} 4403}
@@ -4319,6 +4422,20 @@ static void perf_mmap_close(struct vm_area_struct *vma)
4319 if (event->pmu->event_unmapped) 4422 if (event->pmu->event_unmapped)
4320 event->pmu->event_unmapped(event); 4423 event->pmu->event_unmapped(event);
4321 4424
4425 /*
4426 * rb->aux_mmap_count will always drop before rb->mmap_count and
4427 * event->mmap_count, so it is ok to use event->mmap_mutex to
4428 * serialize with perf_mmap here.
4429 */
4430 if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
4431 atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
4432 atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
4433 vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
4434
4435 rb_free_aux(rb);
4436 mutex_unlock(&event->mmap_mutex);
4437 }
4438
4322 atomic_dec(&rb->mmap_count); 4439 atomic_dec(&rb->mmap_count);
4323 4440
4324 if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) 4441 if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
@@ -4392,7 +4509,7 @@ out_put:
4392 4509
4393static const struct vm_operations_struct perf_mmap_vmops = { 4510static const struct vm_operations_struct perf_mmap_vmops = {
4394 .open = perf_mmap_open, 4511 .open = perf_mmap_open,
4395 .close = perf_mmap_close, 4512 .close = perf_mmap_close, /* non mergable */
4396 .fault = perf_mmap_fault, 4513 .fault = perf_mmap_fault,
4397 .page_mkwrite = perf_mmap_fault, 4514 .page_mkwrite = perf_mmap_fault,
4398}; 4515};
@@ -4403,10 +4520,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
4403 unsigned long user_locked, user_lock_limit; 4520 unsigned long user_locked, user_lock_limit;
4404 struct user_struct *user = current_user(); 4521 struct user_struct *user = current_user();
4405 unsigned long locked, lock_limit; 4522 unsigned long locked, lock_limit;
4406 struct ring_buffer *rb; 4523 struct ring_buffer *rb = NULL;
4407 unsigned long vma_size; 4524 unsigned long vma_size;
4408 unsigned long nr_pages; 4525 unsigned long nr_pages;
4409 long user_extra, extra; 4526 long user_extra = 0, extra = 0;
4410 int ret = 0, flags = 0; 4527 int ret = 0, flags = 0;
4411 4528
4412 /* 4529 /*
@@ -4421,7 +4538,66 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
4421 return -EINVAL; 4538 return -EINVAL;
4422 4539
4423 vma_size = vma->vm_end - vma->vm_start; 4540 vma_size = vma->vm_end - vma->vm_start;
4424 nr_pages = (vma_size / PAGE_SIZE) - 1; 4541
4542 if (vma->vm_pgoff == 0) {
4543 nr_pages = (vma_size / PAGE_SIZE) - 1;
4544 } else {
4545 /*
4546 * AUX area mapping: if rb->aux_nr_pages != 0, it's already
4547 * mapped, all subsequent mappings should have the same size
4548 * and offset. Must be above the normal perf buffer.
4549 */
4550 u64 aux_offset, aux_size;
4551
4552 if (!event->rb)
4553 return -EINVAL;
4554
4555 nr_pages = vma_size / PAGE_SIZE;
4556
4557 mutex_lock(&event->mmap_mutex);
4558 ret = -EINVAL;
4559
4560 rb = event->rb;
4561 if (!rb)
4562 goto aux_unlock;
4563
4564 aux_offset = ACCESS_ONCE(rb->user_page->aux_offset);
4565 aux_size = ACCESS_ONCE(rb->user_page->aux_size);
4566
4567 if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
4568 goto aux_unlock;
4569
4570 if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
4571 goto aux_unlock;
4572
4573 /* already mapped with a different offset */
4574 if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
4575 goto aux_unlock;
4576
4577 if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
4578 goto aux_unlock;
4579
4580 /* already mapped with a different size */
4581 if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
4582 goto aux_unlock;
4583
4584 if (!is_power_of_2(nr_pages))
4585 goto aux_unlock;
4586
4587 if (!atomic_inc_not_zero(&rb->mmap_count))
4588 goto aux_unlock;
4589
4590 if (rb_has_aux(rb)) {
4591 atomic_inc(&rb->aux_mmap_count);
4592 ret = 0;
4593 goto unlock;
4594 }
4595
4596 atomic_set(&rb->aux_mmap_count, 1);
4597 user_extra = nr_pages;
4598
4599 goto accounting;
4600 }
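The pgoff != 0 branch defines the user-visible AUX protocol: the ordinary buffer is mapped at offset 0 first, userspace then writes aux_offset/aux_size into the mmap'ed perf_event_mmap_page, and finally maps that range with a second mmap() whose offset and length must match what was advertised (and whose page count must be a power of two). A hedged userspace sketch, assuming an already-open fd for an AUX-capable event and arbitrary buffer sizes:

    #include <linux/perf_event.h>
    #include <sys/mman.h>
    #include <unistd.h>

    /* Map the regular buffer plus an AUX area; returns the base mapping. */
    static void *map_with_aux(int fd, size_t data_pages, size_t aux_pages, void **aux)
    {
        size_t page = sysconf(_SC_PAGESIZE);
        size_t base_len = (data_pages + 1) * page;   /* user page + 2^n data pages */
        struct perf_event_mmap_page *up;
        void *base;

        base = mmap(NULL, base_len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (base == MAP_FAILED)
            return NULL;

        up = base;
        up->aux_offset = base_len;           /* must lie above the normal buffer */
        up->aux_size   = aux_pages * page;   /* aux_pages must be a power of two */

        *aux = mmap(NULL, up->aux_size, PROT_READ | PROT_WRITE, MAP_SHARED,
                    fd, up->aux_offset);
        if (*aux == MAP_FAILED) {
            munmap(base, base_len);
            return NULL;
        }
        return base;
    }

Mapping the AUX area read-only instead is how a consumer asks for overwrite ("flight recorder") mode: rb_alloc_aux() below derives aux_overwrite from the absence of RING_BUFFER_WRITABLE.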
4425 4601
4426 /* 4602 /*
4427 * If we have rb pages ensure they're a power-of-two number, so we 4603 * If we have rb pages ensure they're a power-of-two number, so we
@@ -4433,9 +4609,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
4433 if (vma_size != PAGE_SIZE * (1 + nr_pages)) 4609 if (vma_size != PAGE_SIZE * (1 + nr_pages))
4434 return -EINVAL; 4610 return -EINVAL;
4435 4611
4436 if (vma->vm_pgoff != 0)
4437 return -EINVAL;
4438
4439 WARN_ON_ONCE(event->ctx->parent_ctx); 4612 WARN_ON_ONCE(event->ctx->parent_ctx);
4440again: 4613again:
4441 mutex_lock(&event->mmap_mutex); 4614 mutex_lock(&event->mmap_mutex);
@@ -4459,6 +4632,8 @@ again:
4459 } 4632 }
4460 4633
4461 user_extra = nr_pages + 1; 4634 user_extra = nr_pages + 1;
4635
4636accounting:
4462 user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); 4637 user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
4463 4638
4464 /* 4639 /*
@@ -4468,7 +4643,6 @@ again:
4468 4643
4469 user_locked = atomic_long_read(&user->locked_vm) + user_extra; 4644 user_locked = atomic_long_read(&user->locked_vm) + user_extra;
4470 4645
4471 extra = 0;
4472 if (user_locked > user_lock_limit) 4646 if (user_locked > user_lock_limit)
4473 extra = user_locked - user_lock_limit; 4647 extra = user_locked - user_lock_limit;
4474 4648
@@ -4482,35 +4656,46 @@ again:
4482 goto unlock; 4656 goto unlock;
4483 } 4657 }
4484 4658
4485 WARN_ON(event->rb); 4659 WARN_ON(!rb && event->rb);
4486 4660
4487 if (vma->vm_flags & VM_WRITE) 4661 if (vma->vm_flags & VM_WRITE)
4488 flags |= RING_BUFFER_WRITABLE; 4662 flags |= RING_BUFFER_WRITABLE;
4489 4663
4490 rb = rb_alloc(nr_pages,
4491 event->attr.watermark ? event->attr.wakeup_watermark : 0,
4492 event->cpu, flags);
4493
4494 if (!rb) { 4664 if (!rb) {
4495 ret = -ENOMEM; 4665 rb = rb_alloc(nr_pages,
4496 goto unlock; 4666 event->attr.watermark ? event->attr.wakeup_watermark : 0,
4497 } 4667 event->cpu, flags);
4498 4668
4499 atomic_set(&rb->mmap_count, 1); 4669 if (!rb) {
4500 rb->mmap_locked = extra; 4670 ret = -ENOMEM;
4501 rb->mmap_user = get_current_user(); 4671 goto unlock;
4672 }
4502 4673
4503 atomic_long_add(user_extra, &user->locked_vm); 4674 atomic_set(&rb->mmap_count, 1);
4504 vma->vm_mm->pinned_vm += extra; 4675 rb->mmap_user = get_current_user();
4676 rb->mmap_locked = extra;
4505 4677
4506 ring_buffer_attach(event, rb); 4678 ring_buffer_attach(event, rb);
4507 4679
4508 perf_event_init_userpage(event); 4680 perf_event_init_userpage(event);
4509 perf_event_update_userpage(event); 4681 perf_event_update_userpage(event);
4682 } else {
4683 ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
4684 event->attr.aux_watermark, flags);
4685 if (!ret)
4686 rb->aux_mmap_locked = extra;
4687 }
4510 4688
4511unlock: 4689unlock:
4512 if (!ret) 4690 if (!ret) {
4691 atomic_long_add(user_extra, &user->locked_vm);
4692 vma->vm_mm->pinned_vm += extra;
4693
4513 atomic_inc(&event->mmap_count); 4694 atomic_inc(&event->mmap_count);
4695 } else if (rb) {
4696 atomic_dec(&rb->mmap_count);
4697 }
4698aux_unlock:
4514 mutex_unlock(&event->mmap_mutex); 4699 mutex_unlock(&event->mmap_mutex);
4515 4700
4516 /* 4701 /*
@@ -4574,6 +4759,13 @@ static void perf_pending_event(struct irq_work *entry)
4574{ 4759{
4575 struct perf_event *event = container_of(entry, 4760 struct perf_event *event = container_of(entry,
4576 struct perf_event, pending); 4761 struct perf_event, pending);
4762 int rctx;
4763
4764 rctx = perf_swevent_get_recursion_context();
4765 /*
4766 * If we 'fail' here, that's OK, it means recursion is already disabled
4767 * and we won't recurse 'further'.
4768 */
4577 4769
4578 if (event->pending_disable) { 4770 if (event->pending_disable) {
4579 event->pending_disable = 0; 4771 event->pending_disable = 0;
@@ -4584,6 +4776,9 @@ static void perf_pending_event(struct irq_work *entry)
4584 event->pending_wakeup = 0; 4776 event->pending_wakeup = 0;
4585 perf_event_wakeup(event); 4777 perf_event_wakeup(event);
4586 } 4778 }
4779
4780 if (rctx >= 0)
4781 perf_swevent_put_recursion_context(rctx);
4587} 4782}
4588 4783
4589/* 4784/*
@@ -4756,7 +4951,7 @@ static void __perf_event_header__init_id(struct perf_event_header *header,
4756 } 4951 }
4757 4952
4758 if (sample_type & PERF_SAMPLE_TIME) 4953 if (sample_type & PERF_SAMPLE_TIME)
4759 data->time = perf_clock(); 4954 data->time = perf_event_clock(event);
4760 4955
4761 if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) 4956 if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
4762 data->id = primary_event_id(event); 4957 data->id = primary_event_id(event);
@@ -5334,6 +5529,8 @@ static void perf_event_task_output(struct perf_event *event,
5334 task_event->event_id.tid = perf_event_tid(event, task); 5529 task_event->event_id.tid = perf_event_tid(event, task);
5335 task_event->event_id.ptid = perf_event_tid(event, current); 5530 task_event->event_id.ptid = perf_event_tid(event, current);
5336 5531
5532 task_event->event_id.time = perf_event_clock(event);
5533
5337 perf_output_put(&handle, task_event->event_id); 5534 perf_output_put(&handle, task_event->event_id);
5338 5535
5339 perf_event__output_id_sample(event, &handle, &sample); 5536 perf_event__output_id_sample(event, &handle, &sample);
@@ -5367,7 +5564,7 @@ static void perf_event_task(struct task_struct *task,
5367 /* .ppid */ 5564 /* .ppid */
5368 /* .tid */ 5565 /* .tid */
5369 /* .ptid */ 5566 /* .ptid */
5370 .time = perf_clock(), 5567 /* .time */
5371 }, 5568 },
5372 }; 5569 };
5373 5570
@@ -5722,6 +5919,40 @@ void perf_event_mmap(struct vm_area_struct *vma)
5722 perf_event_mmap_event(&mmap_event); 5919 perf_event_mmap_event(&mmap_event);
5723} 5920}
5724 5921
5922void perf_event_aux_event(struct perf_event *event, unsigned long head,
5923 unsigned long size, u64 flags)
5924{
5925 struct perf_output_handle handle;
5926 struct perf_sample_data sample;
5927 struct perf_aux_event {
5928 struct perf_event_header header;
5929 u64 offset;
5930 u64 size;
5931 u64 flags;
5932 } rec = {
5933 .header = {
5934 .type = PERF_RECORD_AUX,
5935 .misc = 0,
5936 .size = sizeof(rec),
5937 },
5938 .offset = head,
5939 .size = size,
5940 .flags = flags,
5941 };
5942 int ret;
5943
5944 perf_event_header__init_id(&rec.header, &sample, event);
5945 ret = perf_output_begin(&handle, event, rec.header.size);
5946
5947 if (ret)
5948 return;
5949
5950 perf_output_put(&handle, rec);
5951 perf_event__output_id_sample(event, &handle, &sample);
5952
5953 perf_output_end(&handle);
5954}
5955
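perf_event_aux_event() is the notification half of the AUX protocol: a PERF_RECORD_AUX record in the regular data buffer tells the consumer which byte range of the AUX area was just filled and whether it was truncated or written in overwrite mode. A sketch of the matching consumer step (the record struct mirrors the layout above; the ring-buffer walk around it and wrap-around handling of the copy are elided):

    #include <linux/perf_event.h>
    #include <stdint.h>
    #include <stdio.h>

    struct aux_record {                  /* mirrors the on-the-wire layout above */
        struct perf_event_header header;
        uint64_t offset;
        uint64_t size;
        uint64_t flags;
    };

    static void handle_record(struct perf_event_header *hdr,
                              struct perf_event_mmap_page *up,
                              const char *aux_base, size_t aux_len)
    {
        if (hdr->type != PERF_RECORD_AUX)
            return;

        const struct aux_record *rec = (const void *)hdr;
        uint64_t off = rec->offset % aux_len;    /* offsets are free-running */

        /* consume the new chunk (a real consumer must handle it wrapping) */
        fwrite(aux_base + off, 1, rec->size, stdout);

        /* advancing aux_tail is what lets perf_aux_output_begin() reuse the space */
        __atomic_store_n(&up->aux_tail, rec->offset + rec->size, __ATOMIC_RELEASE);
    }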
5725/* 5956/*
5726 * IRQ throttle logging 5957 * IRQ throttle logging
5727 */ 5958 */
@@ -5743,7 +5974,7 @@ static void perf_log_throttle(struct perf_event *event, int enable)
5743 .misc = 0, 5974 .misc = 0,
5744 .size = sizeof(throttle_event), 5975 .size = sizeof(throttle_event),
5745 }, 5976 },
5746 .time = perf_clock(), 5977 .time = perf_event_clock(event),
5747 .id = primary_event_id(event), 5978 .id = primary_event_id(event),
5748 .stream_id = event->id, 5979 .stream_id = event->id,
5749 }; 5980 };
@@ -5763,6 +5994,44 @@ static void perf_log_throttle(struct perf_event *event, int enable)
5763 perf_output_end(&handle); 5994 perf_output_end(&handle);
5764} 5995}
5765 5996
5997static void perf_log_itrace_start(struct perf_event *event)
5998{
5999 struct perf_output_handle handle;
6000 struct perf_sample_data sample;
6001 struct perf_aux_event {
6002 struct perf_event_header header;
6003 u32 pid;
6004 u32 tid;
6005 } rec;
6006 int ret;
6007
6008 if (event->parent)
6009 event = event->parent;
6010
6011 if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) ||
6012 event->hw.itrace_started)
6013 return;
6014
6015 event->hw.itrace_started = 1;
6016
6017 rec.header.type = PERF_RECORD_ITRACE_START;
6018 rec.header.misc = 0;
6019 rec.header.size = sizeof(rec);
6020 rec.pid = perf_event_pid(event, current);
6021 rec.tid = perf_event_tid(event, current);
6022
6023 perf_event_header__init_id(&rec.header, &sample, event);
6024 ret = perf_output_begin(&handle, event, rec.header.size);
6025
6026 if (ret)
6027 return;
6028
6029 perf_output_put(&handle, rec);
6030 perf_event__output_id_sample(event, &handle, &sample);
6031
6032 perf_output_end(&handle);
6033}
6034
5766/* 6035/*
5767 * Generic event overflow handling, sampling. 6036 * Generic event overflow handling, sampling.
5768 */ 6037 */
@@ -6123,6 +6392,7 @@ static int perf_swevent_add(struct perf_event *event, int flags)
6123 } 6392 }
6124 6393
6125 hlist_add_head_rcu(&event->hlist_entry, head); 6394 hlist_add_head_rcu(&event->hlist_entry, head);
6395 perf_event_update_userpage(event);
6126 6396
6127 return 0; 6397 return 0;
6128} 6398}
@@ -6286,6 +6556,8 @@ static int perf_swevent_init(struct perf_event *event)
6286static struct pmu perf_swevent = { 6556static struct pmu perf_swevent = {
6287 .task_ctx_nr = perf_sw_context, 6557 .task_ctx_nr = perf_sw_context,
6288 6558
6559 .capabilities = PERF_PMU_CAP_NO_NMI,
6560
6289 .event_init = perf_swevent_init, 6561 .event_init = perf_swevent_init,
6290 .add = perf_swevent_add, 6562 .add = perf_swevent_add,
6291 .del = perf_swevent_del, 6563 .del = perf_swevent_del,
@@ -6439,6 +6711,49 @@ static void perf_event_free_filter(struct perf_event *event)
6439 ftrace_profile_free_filter(event); 6711 ftrace_profile_free_filter(event);
6440} 6712}
6441 6713
6714static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
6715{
6716 struct bpf_prog *prog;
6717
6718 if (event->attr.type != PERF_TYPE_TRACEPOINT)
6719 return -EINVAL;
6720
6721 if (event->tp_event->prog)
6722 return -EEXIST;
6723
6724 if (!(event->tp_event->flags & TRACE_EVENT_FL_KPROBE))
6725 /* bpf programs can only be attached to kprobes */
6726 return -EINVAL;
6727
6728 prog = bpf_prog_get(prog_fd);
6729 if (IS_ERR(prog))
6730 return PTR_ERR(prog);
6731
6732 if (prog->type != BPF_PROG_TYPE_KPROBE) {
6733 /* valid fd, but invalid bpf program type */
6734 bpf_prog_put(prog);
6735 return -EINVAL;
6736 }
6737
6738 event->tp_event->prog = prog;
6739
6740 return 0;
6741}
6742
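perf_event_set_bpf_prog() is reached through the new PERF_EVENT_IOC_SET_BPF ioctl and, by design, only accepts kprobe-backed tracepoint events carrying a BPF_PROG_TYPE_KPROBE program, one program per tracepoint. A condensed userspace sketch of the expected flow (the tracefs path and probe name are simplified, error handling is minimal, and prog_fd is assumed to come from a separate bpf(BPF_PROG_LOAD, ...) call):

    #include <linux/perf_event.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* Open a previously created kprobe (e.g. 'p:myprobe do_sys_open') and attach prog_fd. */
    static int attach_kprobe_prog(int prog_fd)
    {
        struct perf_event_attr attr;
        FILE *f;
        int id = -1, fd;

        f = fopen("/sys/kernel/debug/tracing/events/kprobes/myprobe/id", "r");
        if (!f)
            return -1;
        if (fscanf(f, "%d", &id) != 1)
            id = -1;
        fclose(f);
        if (id < 0)
            return -1;

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = PERF_TYPE_TRACEPOINT;   /* required by perf_event_set_bpf_prog() */
        attr.config = id;                   /* the kprobe's event id */
        attr.sample_period = 1;

        fd = syscall(__NR_perf_event_open, &attr, -1 /* pid */, 0 /* cpu */, -1, 0);
        if (fd < 0)
            return -1;

        /* a second SET_BPF on the same tracepoint would return -EEXIST */
        if (ioctl(fd, PERF_EVENT_IOC_SET_BPF, prog_fd)) {
            close(fd);
            return -1;
        }
        return fd;
    }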
6743static void perf_event_free_bpf_prog(struct perf_event *event)
6744{
6745 struct bpf_prog *prog;
6746
6747 if (!event->tp_event)
6748 return;
6749
6750 prog = event->tp_event->prog;
6751 if (prog) {
6752 event->tp_event->prog = NULL;
6753 bpf_prog_put(prog);
6754 }
6755}
6756
6442#else 6757#else
6443 6758
6444static inline void perf_tp_register(void) 6759static inline void perf_tp_register(void)
@@ -6454,6 +6769,14 @@ static void perf_event_free_filter(struct perf_event *event)
6454{ 6769{
6455} 6770}
6456 6771
6772static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
6773{
6774 return -ENOENT;
6775}
6776
6777static void perf_event_free_bpf_prog(struct perf_event *event)
6778{
6779}
6457#endif /* CONFIG_EVENT_TRACING */ 6780#endif /* CONFIG_EVENT_TRACING */
6458 6781
6459#ifdef CONFIG_HAVE_HW_BREAKPOINT 6782#ifdef CONFIG_HAVE_HW_BREAKPOINT
@@ -6592,6 +6915,7 @@ static int cpu_clock_event_add(struct perf_event *event, int flags)
6592{ 6915{
6593 if (flags & PERF_EF_START) 6916 if (flags & PERF_EF_START)
6594 cpu_clock_event_start(event, flags); 6917 cpu_clock_event_start(event, flags);
6918 perf_event_update_userpage(event);
6595 6919
6596 return 0; 6920 return 0;
6597} 6921}
@@ -6628,6 +6952,8 @@ static int cpu_clock_event_init(struct perf_event *event)
6628static struct pmu perf_cpu_clock = { 6952static struct pmu perf_cpu_clock = {
6629 .task_ctx_nr = perf_sw_context, 6953 .task_ctx_nr = perf_sw_context,
6630 6954
6955 .capabilities = PERF_PMU_CAP_NO_NMI,
6956
6631 .event_init = cpu_clock_event_init, 6957 .event_init = cpu_clock_event_init,
6632 .add = cpu_clock_event_add, 6958 .add = cpu_clock_event_add,
6633 .del = cpu_clock_event_del, 6959 .del = cpu_clock_event_del,
@@ -6666,6 +6992,7 @@ static int task_clock_event_add(struct perf_event *event, int flags)
6666{ 6992{
6667 if (flags & PERF_EF_START) 6993 if (flags & PERF_EF_START)
6668 task_clock_event_start(event, flags); 6994 task_clock_event_start(event, flags);
6995 perf_event_update_userpage(event);
6669 6996
6670 return 0; 6997 return 0;
6671} 6998}
@@ -6706,6 +7033,8 @@ static int task_clock_event_init(struct perf_event *event)
6706static struct pmu perf_task_clock = { 7033static struct pmu perf_task_clock = {
6707 .task_ctx_nr = perf_sw_context, 7034 .task_ctx_nr = perf_sw_context,
6708 7035
7036 .capabilities = PERF_PMU_CAP_NO_NMI,
7037
6709 .event_init = task_clock_event_init, 7038 .event_init = task_clock_event_init,
6710 .add = task_clock_event_add, 7039 .add = task_clock_event_add,
6711 .del = task_clock_event_del, 7040 .del = task_clock_event_del,
@@ -6983,6 +7312,7 @@ got_cpu_context:
6983 pmu->event_idx = perf_event_idx_default; 7312 pmu->event_idx = perf_event_idx_default;
6984 7313
6985 list_add_rcu(&pmu->entry, &pmus); 7314 list_add_rcu(&pmu->entry, &pmus);
7315 atomic_set(&pmu->exclusive_cnt, 0);
6986 ret = 0; 7316 ret = 0;
6987unlock: 7317unlock:
6988 mutex_unlock(&pmus_lock); 7318 mutex_unlock(&pmus_lock);
@@ -7027,12 +7357,23 @@ EXPORT_SYMBOL_GPL(perf_pmu_unregister);
7027 7357
7028static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) 7358static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
7029{ 7359{
7360 struct perf_event_context *ctx = NULL;
7030 int ret; 7361 int ret;
7031 7362
7032 if (!try_module_get(pmu->module)) 7363 if (!try_module_get(pmu->module))
7033 return -ENODEV; 7364 return -ENODEV;
7365
7366 if (event->group_leader != event) {
7367 ctx = perf_event_ctx_lock(event->group_leader);
7368 BUG_ON(!ctx);
7369 }
7370
7034 event->pmu = pmu; 7371 event->pmu = pmu;
7035 ret = pmu->event_init(event); 7372 ret = pmu->event_init(event);
7373
7374 if (ctx)
7375 perf_event_ctx_unlock(event->group_leader, ctx);
7376
7036 if (ret) 7377 if (ret)
7037 module_put(pmu->module); 7378 module_put(pmu->module);
7038 7379
@@ -7079,10 +7420,6 @@ static void account_event_cpu(struct perf_event *event, int cpu)
7079 if (event->parent) 7420 if (event->parent)
7080 return; 7421 return;
7081 7422
7082 if (has_branch_stack(event)) {
7083 if (!(event->attach_state & PERF_ATTACH_TASK))
7084 atomic_inc(&per_cpu(perf_branch_stack_events, cpu));
7085 }
7086 if (is_cgroup_event(event)) 7423 if (is_cgroup_event(event))
7087 atomic_inc(&per_cpu(perf_cgroup_events, cpu)); 7424 atomic_inc(&per_cpu(perf_cgroup_events, cpu));
7088} 7425}
@@ -7121,7 +7458,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
7121 struct perf_event *group_leader, 7458 struct perf_event *group_leader,
7122 struct perf_event *parent_event, 7459 struct perf_event *parent_event,
7123 perf_overflow_handler_t overflow_handler, 7460 perf_overflow_handler_t overflow_handler,
7124 void *context) 7461 void *context, int cgroup_fd)
7125{ 7462{
7126 struct pmu *pmu; 7463 struct pmu *pmu;
7127 struct perf_event *event; 7464 struct perf_event *event;
@@ -7176,18 +7513,18 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
7176 7513
7177 if (task) { 7514 if (task) {
7178 event->attach_state = PERF_ATTACH_TASK; 7515 event->attach_state = PERF_ATTACH_TASK;
7179
7180 if (attr->type == PERF_TYPE_TRACEPOINT)
7181 event->hw.tp_target = task;
7182#ifdef CONFIG_HAVE_HW_BREAKPOINT
7183 /* 7516 /*
7184 * hw_breakpoint is a bit difficult here.. 7517 * XXX pmu::event_init needs to know what task to account to
7518 * and we cannot use the ctx information because we need the
7519 * pmu before we get a ctx.
7185 */ 7520 */
7186 else if (attr->type == PERF_TYPE_BREAKPOINT) 7521 event->hw.target = task;
7187 event->hw.bp_target = task;
7188#endif
7189 } 7522 }
7190 7523
7524 event->clock = &local_clock;
7525 if (parent_event)
7526 event->clock = parent_event->clock;
7527
7191 if (!overflow_handler && parent_event) { 7528 if (!overflow_handler && parent_event) {
7192 overflow_handler = parent_event->overflow_handler; 7529 overflow_handler = parent_event->overflow_handler;
7193 context = parent_event->overflow_handler_context; 7530 context = parent_event->overflow_handler_context;
@@ -7214,6 +7551,15 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
7214 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) 7551 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
7215 goto err_ns; 7552 goto err_ns;
7216 7553
7554 if (!has_branch_stack(event))
7555 event->attr.branch_sample_type = 0;
7556
7557 if (cgroup_fd != -1) {
7558 err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
7559 if (err)
7560 goto err_ns;
7561 }
7562
7217 pmu = perf_init_event(event); 7563 pmu = perf_init_event(event);
7218 if (!pmu) 7564 if (!pmu)
7219 goto err_ns; 7565 goto err_ns;
@@ -7222,21 +7568,30 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
7222 goto err_ns; 7568 goto err_ns;
7223 } 7569 }
7224 7570
7571 err = exclusive_event_init(event);
7572 if (err)
7573 goto err_pmu;
7574
7225 if (!event->parent) { 7575 if (!event->parent) {
7226 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { 7576 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
7227 err = get_callchain_buffers(); 7577 err = get_callchain_buffers();
7228 if (err) 7578 if (err)
7229 goto err_pmu; 7579 goto err_per_task;
7230 } 7580 }
7231 } 7581 }
7232 7582
7233 return event; 7583 return event;
7234 7584
7585err_per_task:
7586 exclusive_event_destroy(event);
7587
7235err_pmu: 7588err_pmu:
7236 if (event->destroy) 7589 if (event->destroy)
7237 event->destroy(event); 7590 event->destroy(event);
7238 module_put(pmu->module); 7591 module_put(pmu->module);
7239err_ns: 7592err_ns:
7593 if (is_cgroup_event(event))
7594 perf_detach_cgroup(event);
7240 if (event->ns) 7595 if (event->ns)
7241 put_pid_ns(event->ns); 7596 put_pid_ns(event->ns);
7242 kfree(event); 7597 kfree(event);
@@ -7399,6 +7754,19 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
7399 if (output_event->cpu == -1 && output_event->ctx != event->ctx) 7754 if (output_event->cpu == -1 && output_event->ctx != event->ctx)
7400 goto out; 7755 goto out;
7401 7756
7757 /*
7758 * Mixing clocks in the same buffer is trouble you don't need.
7759 */
7760 if (output_event->clock != event->clock)
7761 goto out;
7762
7763 /*
7764 * If both events generate aux data, they must be on the same PMU
7765 */
7766 if (has_aux(event) && has_aux(output_event) &&
7767 event->pmu != output_event->pmu)
7768 goto out;
7769
7402set: 7770set:
7403 mutex_lock(&event->mmap_mutex); 7771 mutex_lock(&event->mmap_mutex);
7404 /* Can't redirect output if we've got an active mmap() */ 7772 /* Can't redirect output if we've got an active mmap() */
@@ -7431,6 +7799,43 @@ static void mutex_lock_double(struct mutex *a, struct mutex *b)
7431 mutex_lock_nested(b, SINGLE_DEPTH_NESTING); 7799 mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
7432} 7800}
7433 7801
7802static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
7803{
7804 bool nmi_safe = false;
7805
7806 switch (clk_id) {
7807 case CLOCK_MONOTONIC:
7808 event->clock = &ktime_get_mono_fast_ns;
7809 nmi_safe = true;
7810 break;
7811
7812 case CLOCK_MONOTONIC_RAW:
7813 event->clock = &ktime_get_raw_fast_ns;
7814 nmi_safe = true;
7815 break;
7816
7817 case CLOCK_REALTIME:
7818 event->clock = &ktime_get_real_ns;
7819 break;
7820
7821 case CLOCK_BOOTTIME:
7822 event->clock = &ktime_get_boot_ns;
7823 break;
7824
7825 case CLOCK_TAI:
7826 event->clock = &ktime_get_tai_ns;
7827 break;
7828
7829 default:
7830 return -EINVAL;
7831 }
7832
7833 if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI))
7834 return -EINVAL;
7835
7836 return 0;
7837}
7838
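perf_event_set_clock() is driven by the two new attr fields checked in sys_perf_event_open() below: use_clockid selects this path and clockid names the clock, with the non-NMI-safe choices (CLOCK_REALTIME, CLOCK_BOOTTIME, CLOCK_TAI) rejected unless the PMU advertises PERF_PMU_CAP_NO_NMI. A minimal userspace sketch, using a software event as a placeholder:

    #include <linux/perf_event.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <time.h>
    #include <unistd.h>

    static int open_monotonic_raw_counter(pid_t pid, int cpu)
    {
        struct perf_event_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = PERF_TYPE_SOFTWARE;          /* placeholder event type */
        attr.config = PERF_COUNT_SW_CPU_CLOCK;
        attr.sample_type = PERF_SAMPLE_TIME;

        attr.use_clockid = 1;
        attr.clockid = CLOCK_MONOTONIC_RAW;      /* NMI-safe, accepted for any PMU */

        /* CLOCK_REALTIME here would get -EINVAL from a PMU that can fire in NMI context */
        return syscall(__NR_perf_event_open, &attr, pid, cpu, -1, 0);
    }

Because the chosen clock also stamps every record through perf_event_clock(), events sharing a ring buffer or a group must agree on it; the checks added to perf_event_set_output() and to the group validation below enforce exactly that.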
7434/** 7839/**
7435 * sys_perf_event_open - open a performance event, associate it to a task/cpu 7840 * sys_perf_event_open - open a performance event, associate it to a task/cpu
7436 * 7841 *
@@ -7455,6 +7860,7 @@ SYSCALL_DEFINE5(perf_event_open,
7455 int move_group = 0; 7860 int move_group = 0;
7456 int err; 7861 int err;
7457 int f_flags = O_RDWR; 7862 int f_flags = O_RDWR;
7863 int cgroup_fd = -1;
7458 7864
7459 /* for future expandability... */ 7865 /* for future expandability... */
7460 if (flags & ~PERF_FLAG_ALL) 7866 if (flags & ~PERF_FLAG_ALL)
@@ -7520,21 +7926,16 @@ SYSCALL_DEFINE5(perf_event_open,
7520 7926
7521 get_online_cpus(); 7927 get_online_cpus();
7522 7928
7929 if (flags & PERF_FLAG_PID_CGROUP)
7930 cgroup_fd = pid;
7931
7523 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, 7932 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
7524 NULL, NULL); 7933 NULL, NULL, cgroup_fd);
7525 if (IS_ERR(event)) { 7934 if (IS_ERR(event)) {
7526 err = PTR_ERR(event); 7935 err = PTR_ERR(event);
7527 goto err_cpus; 7936 goto err_cpus;
7528 } 7937 }
7529 7938
7530 if (flags & PERF_FLAG_PID_CGROUP) {
7531 err = perf_cgroup_connect(pid, event, &attr, group_leader);
7532 if (err) {
7533 __free_event(event);
7534 goto err_cpus;
7535 }
7536 }
7537
7538 if (is_sampling_event(event)) { 7939 if (is_sampling_event(event)) {
7539 if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) { 7940 if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
7540 err = -ENOTSUPP; 7941 err = -ENOTSUPP;
@@ -7550,6 +7951,12 @@ SYSCALL_DEFINE5(perf_event_open,
7550 */ 7951 */
7551 pmu = event->pmu; 7952 pmu = event->pmu;
7552 7953
7954 if (attr.use_clockid) {
7955 err = perf_event_set_clock(event, attr.clockid);
7956 if (err)
7957 goto err_alloc;
7958 }
7959
7553 if (group_leader && 7960 if (group_leader &&
7554 (is_software_event(event) != is_software_event(group_leader))) { 7961 (is_software_event(event) != is_software_event(group_leader))) {
7555 if (is_software_event(event)) { 7962 if (is_software_event(event)) {
@@ -7576,12 +7983,17 @@ SYSCALL_DEFINE5(perf_event_open,
7576 /* 7983 /*
7577 * Get the target context (task or percpu): 7984 * Get the target context (task or percpu):
7578 */ 7985 */
7579 ctx = find_get_context(pmu, task, event->cpu); 7986 ctx = find_get_context(pmu, task, event);
7580 if (IS_ERR(ctx)) { 7987 if (IS_ERR(ctx)) {
7581 err = PTR_ERR(ctx); 7988 err = PTR_ERR(ctx);
7582 goto err_alloc; 7989 goto err_alloc;
7583 } 7990 }
7584 7991
7992 if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) {
7993 err = -EBUSY;
7994 goto err_context;
7995 }
7996
7585 if (task) { 7997 if (task) {
7586 put_task_struct(task); 7998 put_task_struct(task);
7587 task = NULL; 7999 task = NULL;
@@ -7599,6 +8011,11 @@ SYSCALL_DEFINE5(perf_event_open,
7599 */ 8011 */
7600 if (group_leader->group_leader != group_leader) 8012 if (group_leader->group_leader != group_leader)
7601 goto err_context; 8013 goto err_context;
8014
8015 /* All events in a group should have the same clock */
8016 if (group_leader->clock != event->clock)
8017 goto err_context;
8018
7602 /* 8019 /*
7603 * Do not allow to attach to a group in a different 8020 * Do not allow to attach to a group in a different
7604 * task or CPU context: 8021 * task or CPU context:
@@ -7699,6 +8116,13 @@ SYSCALL_DEFINE5(perf_event_open,
7699 get_ctx(ctx); 8116 get_ctx(ctx);
7700 } 8117 }
7701 8118
8119 if (!exclusive_event_installable(event, ctx)) {
8120 err = -EBUSY;
8121 mutex_unlock(&ctx->mutex);
8122 fput(event_file);
8123 goto err_context;
8124 }
8125
7702 perf_install_in_context(ctx, event, event->cpu); 8126 perf_install_in_context(ctx, event, event->cpu);
7703 perf_unpin_context(ctx); 8127 perf_unpin_context(ctx);
7704 8128
@@ -7771,7 +8195,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
7771 */ 8195 */
7772 8196
7773 event = perf_event_alloc(attr, cpu, task, NULL, NULL, 8197 event = perf_event_alloc(attr, cpu, task, NULL, NULL,
7774 overflow_handler, context); 8198 overflow_handler, context, -1);
7775 if (IS_ERR(event)) { 8199 if (IS_ERR(event)) {
7776 err = PTR_ERR(event); 8200 err = PTR_ERR(event);
7777 goto err; 8201 goto err;
@@ -7782,7 +8206,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
7782 8206
7783 account_event(event); 8207 account_event(event);
7784 8208
7785 ctx = find_get_context(event->pmu, task, cpu); 8209 ctx = find_get_context(event->pmu, task, event);
7786 if (IS_ERR(ctx)) { 8210 if (IS_ERR(ctx)) {
7787 err = PTR_ERR(ctx); 8211 err = PTR_ERR(ctx);
7788 goto err_free; 8212 goto err_free;
@@ -7790,6 +8214,14 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
7790 8214
7791 WARN_ON_ONCE(ctx->parent_ctx); 8215 WARN_ON_ONCE(ctx->parent_ctx);
7792 mutex_lock(&ctx->mutex); 8216 mutex_lock(&ctx->mutex);
8217 if (!exclusive_event_installable(event, ctx)) {
8218 mutex_unlock(&ctx->mutex);
8219 perf_unpin_context(ctx);
8220 put_ctx(ctx);
8221 err = -EBUSY;
8222 goto err_free;
8223 }
8224
7793 perf_install_in_context(ctx, event, cpu); 8225 perf_install_in_context(ctx, event, cpu);
7794 perf_unpin_context(ctx); 8226 perf_unpin_context(ctx);
7795 mutex_unlock(&ctx->mutex); 8227 mutex_unlock(&ctx->mutex);
@@ -8132,7 +8564,7 @@ inherit_event(struct perf_event *parent_event,
8132 parent_event->cpu, 8564 parent_event->cpu,
8133 child, 8565 child,
8134 group_leader, parent_event, 8566 group_leader, parent_event,
8135 NULL, NULL); 8567 NULL, NULL, -1);
8136 if (IS_ERR(child_event)) 8568 if (IS_ERR(child_event))
8137 return child_event; 8569 return child_event;
8138 8570
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 9803a6600d49..92ce5f4ccc26 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -116,12 +116,12 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
116 */ 116 */
117static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type) 117static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type)
118{ 118{
119 struct task_struct *tsk = bp->hw.bp_target; 119 struct task_struct *tsk = bp->hw.target;
120 struct perf_event *iter; 120 struct perf_event *iter;
121 int count = 0; 121 int count = 0;
122 122
123 list_for_each_entry(iter, &bp_task_head, hw.bp_list) { 123 list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
124 if (iter->hw.bp_target == tsk && 124 if (iter->hw.target == tsk &&
125 find_slot_idx(iter) == type && 125 find_slot_idx(iter) == type &&
126 (iter->cpu < 0 || cpu == iter->cpu)) 126 (iter->cpu < 0 || cpu == iter->cpu))
127 count += hw_breakpoint_weight(iter); 127 count += hw_breakpoint_weight(iter);
@@ -153,7 +153,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
153 int nr; 153 int nr;
154 154
155 nr = info->cpu_pinned; 155 nr = info->cpu_pinned;
156 if (!bp->hw.bp_target) 156 if (!bp->hw.target)
157 nr += max_task_bp_pinned(cpu, type); 157 nr += max_task_bp_pinned(cpu, type);
158 else 158 else
159 nr += task_bp_pinned(cpu, bp, type); 159 nr += task_bp_pinned(cpu, bp, type);
@@ -210,7 +210,7 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
210 weight = -weight; 210 weight = -weight;
211 211
212 /* Pinned counter cpu profiling */ 212 /* Pinned counter cpu profiling */
213 if (!bp->hw.bp_target) { 213 if (!bp->hw.target) {
214 get_bp_info(bp->cpu, type)->cpu_pinned += weight; 214 get_bp_info(bp->cpu, type)->cpu_pinned += weight;
215 return; 215 return;
216 } 216 }
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 569b218782ad..9f6ce9ba4a04 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -27,6 +27,7 @@ struct ring_buffer {
27 local_t lost; /* nr records lost */ 27 local_t lost; /* nr records lost */
28 28
29 long watermark; /* wakeup watermark */ 29 long watermark; /* wakeup watermark */
30 long aux_watermark;
30 /* poll crap */ 31 /* poll crap */
31 spinlock_t event_lock; 32 spinlock_t event_lock;
32 struct list_head event_list; 33 struct list_head event_list;
@@ -35,6 +36,20 @@ struct ring_buffer {
35 unsigned long mmap_locked; 36 unsigned long mmap_locked;
36 struct user_struct *mmap_user; 37 struct user_struct *mmap_user;
37 38
39 /* AUX area */
40 local_t aux_head;
41 local_t aux_nest;
42 local_t aux_wakeup;
43 unsigned long aux_pgoff;
44 int aux_nr_pages;
45 int aux_overwrite;
46 atomic_t aux_mmap_count;
47 unsigned long aux_mmap_locked;
48 void (*free_aux)(void *);
49 atomic_t aux_refcount;
50 void **aux_pages;
51 void *aux_priv;
52
38 struct perf_event_mmap_page *user_page; 53 struct perf_event_mmap_page *user_page;
39 void *data_pages[0]; 54 void *data_pages[0];
40}; 55};
@@ -43,6 +58,19 @@ extern void rb_free(struct ring_buffer *rb);
43extern struct ring_buffer * 58extern struct ring_buffer *
44rb_alloc(int nr_pages, long watermark, int cpu, int flags); 59rb_alloc(int nr_pages, long watermark, int cpu, int flags);
45extern void perf_event_wakeup(struct perf_event *event); 60extern void perf_event_wakeup(struct perf_event *event);
61extern int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
62 pgoff_t pgoff, int nr_pages, long watermark, int flags);
63extern void rb_free_aux(struct ring_buffer *rb);
64extern struct ring_buffer *ring_buffer_get(struct perf_event *event);
65extern void ring_buffer_put(struct ring_buffer *rb);
66
67static inline bool rb_has_aux(struct ring_buffer *rb)
68{
69 return !!rb->aux_nr_pages;
70}
71
72void perf_event_aux_event(struct perf_event *event, unsigned long head,
73 unsigned long size, u64 flags);
46 74
47extern void 75extern void
48perf_event_header__init_id(struct perf_event_header *header, 76perf_event_header__init_id(struct perf_event_header *header,
@@ -81,6 +109,11 @@ static inline unsigned long perf_data_size(struct ring_buffer *rb)
81 return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); 109 return rb->nr_pages << (PAGE_SHIFT + page_order(rb));
82} 110}
83 111
112static inline unsigned long perf_aux_size(struct ring_buffer *rb)
113{
114 return rb->aux_nr_pages << PAGE_SHIFT;
115}
116
84#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ 117#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \
85static inline unsigned long \ 118static inline unsigned long \
86func_name(struct perf_output_handle *handle, \ 119func_name(struct perf_output_handle *handle, \
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index eadb95ce7aac..232f00f273cb 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -243,14 +243,317 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
243 spin_lock_init(&rb->event_lock); 243 spin_lock_init(&rb->event_lock);
244} 244}
245 245
246/*
247 * This is called before hardware starts writing to the AUX area to
248 * obtain an output handle and make sure there's room in the buffer.
249 * When the capture completes, call perf_aux_output_end() to commit
250 * the recorded data to the buffer.
251 *
252 * The ordering is similar to that of perf_output_{begin,end}, with
253 * the exception of (B), which should be taken care of by the pmu
254 * driver, since ordering rules will differ depending on hardware.
255 */
256void *perf_aux_output_begin(struct perf_output_handle *handle,
257 struct perf_event *event)
258{
259 struct perf_event *output_event = event;
260 unsigned long aux_head, aux_tail;
261 struct ring_buffer *rb;
262
263 if (output_event->parent)
264 output_event = output_event->parent;
265
266 /*
267 * Since this will typically be open across pmu::add/pmu::del, we
268 * grab ring_buffer's refcount instead of holding rcu read lock
269 * to make sure it doesn't disappear under us.
270 */
271 rb = ring_buffer_get(output_event);
272 if (!rb)
273 return NULL;
274
275 if (!rb_has_aux(rb) || !atomic_inc_not_zero(&rb->aux_refcount))
276 goto err;
277
278 /*
279 * Nesting is not supported for AUX area, make sure nested
280 * writers are caught early
281 */
282 if (WARN_ON_ONCE(local_xchg(&rb->aux_nest, 1)))
283 goto err_put;
284
285 aux_head = local_read(&rb->aux_head);
286
287 handle->rb = rb;
288 handle->event = event;
289 handle->head = aux_head;
290 handle->size = 0;
291
292 /*
293 * In overwrite mode, AUX data stores do not depend on aux_tail,
294 * therefore (A) control dependency barrier does not exist. The
295 * (B) <-> (C) ordering is still observed by the pmu driver.
296 */
297 if (!rb->aux_overwrite) {
298 aux_tail = ACCESS_ONCE(rb->user_page->aux_tail);
299 handle->wakeup = local_read(&rb->aux_wakeup) + rb->aux_watermark;
300 if (aux_head - aux_tail < perf_aux_size(rb))
301 handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb));
302
303 /*
304 * handle->size computation depends on aux_tail load; this forms a
305 * control dependency barrier separating aux_tail load from aux data
306 * store that will be enabled on successful return
307 */
308 if (!handle->size) { /* A, matches D */
309 event->pending_disable = 1;
310 perf_output_wakeup(handle);
311 local_set(&rb->aux_nest, 0);
312 goto err_put;
313 }
314 }
315
316 return handle->rb->aux_priv;
317
318err_put:
319 rb_free_aux(rb);
320
321err:
322 ring_buffer_put(rb);
323 handle->event = NULL;
324
325 return NULL;
326}
327
328/*
329 * Commit the data written by hardware into the ring buffer by adjusting
330 * aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the
331 * pmu driver's responsibility to observe ordering rules of the hardware,
332 * so that all the data is externally visible before this is called.
333 */
334void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size,
335 bool truncated)
336{
337 struct ring_buffer *rb = handle->rb;
338 unsigned long aux_head;
339 u64 flags = 0;
340
341 if (truncated)
342 flags |= PERF_AUX_FLAG_TRUNCATED;
343
344 /* in overwrite mode, driver provides aux_head via handle */
345 if (rb->aux_overwrite) {
346 flags |= PERF_AUX_FLAG_OVERWRITE;
347
348 aux_head = handle->head;
349 local_set(&rb->aux_head, aux_head);
350 } else {
351 aux_head = local_read(&rb->aux_head);
352 local_add(size, &rb->aux_head);
353 }
354
355 if (size || flags) {
356 /*
357 * Only send RECORD_AUX if we have something useful to communicate
358 */
359
360 perf_event_aux_event(handle->event, aux_head, size, flags);
361 }
362
363 aux_head = rb->user_page->aux_head = local_read(&rb->aux_head);
364
365 if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) {
366 perf_output_wakeup(handle);
367 local_add(rb->aux_watermark, &rb->aux_wakeup);
368 }
369 handle->event = NULL;
370
371 local_set(&rb->aux_nest, 0);
372 rb_free_aux(rb);
373 ring_buffer_put(rb);
374}
375
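Taken together, perf_aux_output_begin() and perf_aux_output_end() give PMU drivers the same bracketed discipline as perf_output_begin()/perf_output_end(), except that the bytes in between are written by hardware rather than by the CPU. A hedged sketch of one capture cycle from a driver's start/stop path (the mypmu_* names and hardware calls are invented; the perf_aux_* calls and their arguments are from this patch):

    /* sketch: one AUX capture cycle in a PMU driver */
    static void mypmu_start(struct perf_event *event, int flags)
    {
        struct mypmu_cpu *mc = this_cpu_ptr(&mypmu_cpu);      /* hypothetical per-CPU state */
        void *base;

        base = perf_aux_output_begin(&mc->handle, event);
        if (!base)
            return;   /* no AUX buffer or no space left; the event was disabled for us */

        /* point the hardware at [base, base + mc->handle.size) and let it run */
        mypmu_hw_arm(base, mc->handle.size);                  /* hypothetical */
    }

    static void mypmu_stop(struct perf_event *event, int flags)
    {
        struct mypmu_cpu *mc = this_cpu_ptr(&mypmu_cpu);
        unsigned long written = mypmu_hw_disarm();            /* hypothetical */

        /* commit: advances aux_head, emits PERF_RECORD_AUX, wakes the consumer */
        perf_aux_output_end(&mc->handle, written, mypmu_hw_lost_data());   /* truncated? */
    }

The ordering obligation called out in the comment on perf_aux_output_begin() sits between these two calls: the driver must ensure all hardware stores to the area are visible before perf_aux_output_end() publishes the new aux_head.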
376/*
377 * Skip over a given number of bytes in the AUX buffer, due to, for example,
378 * hardware's alignment constraints.
379 */
380int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size)
381{
382 struct ring_buffer *rb = handle->rb;
383 unsigned long aux_head;
384
385 if (size > handle->size)
386 return -ENOSPC;
387
388 local_add(size, &rb->aux_head);
389
390 aux_head = rb->user_page->aux_head = local_read(&rb->aux_head);
391 if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) {
392 perf_output_wakeup(handle);
393 local_add(rb->aux_watermark, &rb->aux_wakeup);
394 handle->wakeup = local_read(&rb->aux_wakeup) +
395 rb->aux_watermark;
396 }
397
398 handle->head = aux_head;
399 handle->size -= size;
400
401 return 0;
402}
403
404void *perf_get_aux(struct perf_output_handle *handle)
405{
406 /* this is only valid between perf_aux_output_begin and *_end */
407 if (!handle->event)
408 return NULL;
409
410 return handle->rb->aux_priv;
411}
412
413#define PERF_AUX_GFP (GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY)
414
415static struct page *rb_alloc_aux_page(int node, int order)
416{
417 struct page *page;
418
419 if (order > MAX_ORDER)
420 order = MAX_ORDER;
421
422 do {
423 page = alloc_pages_node(node, PERF_AUX_GFP, order);
424 } while (!page && order--);
425
426 if (page && order) {
427 /*
428 * Communicate the allocation size to the driver
429 */
430 split_page(page, order);
431 SetPagePrivate(page);
432 set_page_private(page, order);
433 }
434
435 return page;
436}
437
438static void rb_free_aux_page(struct ring_buffer *rb, int idx)
439{
440 struct page *page = virt_to_page(rb->aux_pages[idx]);
441
442 ClearPagePrivate(page);
443 page->mapping = NULL;
444 __free_page(page);
445}
446
447int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
448 pgoff_t pgoff, int nr_pages, long watermark, int flags)
449{
450 bool overwrite = !(flags & RING_BUFFER_WRITABLE);
451 int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu);
452 int ret = -ENOMEM, max_order = 0;
453
454 if (!has_aux(event))
455 return -ENOTSUPP;
456
457 if (event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) {
458 /*
459 * We need to start with the max_order that fits in nr_pages,
460 * not the other way around, hence ilog2() and not get_order.
461 */
462 max_order = ilog2(nr_pages);
463
464 /*
465 * PMU requests more than one contiguous chunk of memory
466 * for SW double buffering
467 */
468 if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_SW_DOUBLEBUF) &&
469 !overwrite) {
470 if (!max_order)
471 return -EINVAL;
472
473 max_order--;
474 }
475 }
476
477 rb->aux_pages = kzalloc_node(nr_pages * sizeof(void *), GFP_KERNEL, node);
478 if (!rb->aux_pages)
479 return -ENOMEM;
480
481 rb->free_aux = event->pmu->free_aux;
482 for (rb->aux_nr_pages = 0; rb->aux_nr_pages < nr_pages;) {
483 struct page *page;
484 int last, order;
485
486 order = min(max_order, ilog2(nr_pages - rb->aux_nr_pages));
487 page = rb_alloc_aux_page(node, order);
488 if (!page)
489 goto out;
490
491 for (last = rb->aux_nr_pages + (1 << page_private(page));
492 last > rb->aux_nr_pages; rb->aux_nr_pages++)
493 rb->aux_pages[rb->aux_nr_pages] = page_address(page++);
494 }
495
496 rb->aux_priv = event->pmu->setup_aux(event->cpu, rb->aux_pages, nr_pages,
497 overwrite);
498 if (!rb->aux_priv)
499 goto out;
500
501 ret = 0;
502
503 /*
504 * aux_pages (and pmu driver's private data, aux_priv) will be
505 * referenced in both producer's and consumer's contexts, thus
506 * we keep a refcount here to make sure either of the two can
507 * reference them safely.
508 */
509 atomic_set(&rb->aux_refcount, 1);
510
511 rb->aux_overwrite = overwrite;
512 rb->aux_watermark = watermark;
513
514 if (!rb->aux_watermark && !rb->aux_overwrite)
515 rb->aux_watermark = nr_pages << (PAGE_SHIFT - 1);
516
517out:
518 if (!ret)
519 rb->aux_pgoff = pgoff;
520 else
521 rb_free_aux(rb);
522
523 return ret;
524}
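Two details above are easier to see with concrete numbers; the values below are assumptions for illustration, not from the patch. ilog2() rounds down to an order that still fits in nr_pages, whereas get_order() would round up past it, and the default watermark works out to half of the AUX buffer.

/* Assume nr_pages = 48 and PAGE_SIZE = 4096 (PAGE_SHIFT = 12). */
int max_order = ilog2(48);		/* 5: chunks of at most 32 pages.    */
					/* get_order(48 * 4096) would be 6,  */
					/* i.e. 64 pages, more than we have. */
long aux_watermark = 48 << (12 - 1);	/* 98304 bytes: half of the 192 KiB  */
					/* (48 * 4 KiB) AUX buffer.          */
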
525
526static void __rb_free_aux(struct ring_buffer *rb)
527{
528 int pg;
529
530 if (rb->aux_priv) {
531 rb->free_aux(rb->aux_priv);
532 rb->free_aux = NULL;
533 rb->aux_priv = NULL;
534 }
535
536 for (pg = 0; pg < rb->aux_nr_pages; pg++)
537 rb_free_aux_page(rb, pg);
538
539 kfree(rb->aux_pages);
540 rb->aux_nr_pages = 0;
541}
542
543void rb_free_aux(struct ring_buffer *rb)
544{
545 if (atomic_dec_and_test(&rb->aux_refcount))
546 __rb_free_aux(rb);
547}
548
246#ifndef CONFIG_PERF_USE_VMALLOC 549#ifndef CONFIG_PERF_USE_VMALLOC
247 550
248/* 551/*
249 * Back perf_mmap() with regular GFP_KERNEL-0 pages. 552 * Back perf_mmap() with regular GFP_KERNEL-0 pages.
250 */ 553 */
251 554
252struct page * 555static struct page *
253perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) 556__perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
254{ 557{
255 if (pgoff > rb->nr_pages) 558 if (pgoff > rb->nr_pages)
256 return NULL; 559 return NULL;
@@ -340,8 +643,8 @@ static int data_page_nr(struct ring_buffer *rb)
340 return rb->nr_pages << page_order(rb); 643 return rb->nr_pages << page_order(rb);
341} 644}
342 645
343struct page * 646static struct page *
344perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) 647__perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
345{ 648{
346 /* The '>' counts in the user page. */ 649 /* The '>' counts in the user page. */
347 if (pgoff > data_page_nr(rb)) 650 if (pgoff > data_page_nr(rb))
@@ -416,3 +719,19 @@ fail:
416} 719}
417 720
418#endif 721#endif
722
723struct page *
724perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
725{
726 if (rb->aux_nr_pages) {
727 /* above AUX space */
728 if (pgoff > rb->aux_pgoff + rb->aux_nr_pages)
729 return NULL;
730
731 /* AUX space */
732 if (pgoff >= rb->aux_pgoff)
733 return virt_to_page(rb->aux_pages[pgoff - rb->aux_pgoff]);
734 }
735
736 return __perf_mmap_to_page(rb, pgoff);
737}
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index 83d4382f5699..6873bb3e6b7e 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -20,145 +20,10 @@
20#include <linux/types.h> 20#include <linux/types.h>
21#include <linux/fs_struct.h> 21#include <linux/fs_struct.h>
22 22
23
24static void default_handler(int, struct pt_regs *);
25
26static struct exec_domain *exec_domains = &default_exec_domain;
27static DEFINE_RWLOCK(exec_domains_lock);
28
29
30static unsigned long ident_map[32] = {
31 0, 1, 2, 3, 4, 5, 6, 7,
32 8, 9, 10, 11, 12, 13, 14, 15,
33 16, 17, 18, 19, 20, 21, 22, 23,
34 24, 25, 26, 27, 28, 29, 30, 31
35};
36
37struct exec_domain default_exec_domain = {
38 .name = "Linux", /* name */
39 .handler = default_handler, /* lcall7 causes a seg fault. */
40 .pers_low = 0, /* PER_LINUX personality. */
41 .pers_high = 0, /* PER_LINUX personality. */
42 .signal_map = ident_map, /* Identity map signals. */
43 .signal_invmap = ident_map, /* - both ways. */
44};
45
46
47static void
48default_handler(int segment, struct pt_regs *regp)
49{
50 set_personality(0);
51
52 if (current_thread_info()->exec_domain->handler != default_handler)
53 current_thread_info()->exec_domain->handler(segment, regp);
54 else
55 send_sig(SIGSEGV, current, 1);
56}
57
58static struct exec_domain *
59lookup_exec_domain(unsigned int personality)
60{
61 unsigned int pers = personality(personality);
62 struct exec_domain *ep;
63
64 read_lock(&exec_domains_lock);
65 for (ep = exec_domains; ep; ep = ep->next) {
66 if (pers >= ep->pers_low && pers <= ep->pers_high)
67 if (try_module_get(ep->module))
68 goto out;
69 }
70
71#ifdef CONFIG_MODULES
72 read_unlock(&exec_domains_lock);
73 request_module("personality-%d", pers);
74 read_lock(&exec_domains_lock);
75
76 for (ep = exec_domains; ep; ep = ep->next) {
77 if (pers >= ep->pers_low && pers <= ep->pers_high)
78 if (try_module_get(ep->module))
79 goto out;
80 }
81#endif
82
83 ep = &default_exec_domain;
84out:
85 read_unlock(&exec_domains_lock);
86 return ep;
87}
88
89int
90register_exec_domain(struct exec_domain *ep)
91{
92 struct exec_domain *tmp;
93 int err = -EBUSY;
94
95 if (ep == NULL)
96 return -EINVAL;
97
98 if (ep->next != NULL)
99 return -EBUSY;
100
101 write_lock(&exec_domains_lock);
102 for (tmp = exec_domains; tmp; tmp = tmp->next) {
103 if (tmp == ep)
104 goto out;
105 }
106
107 ep->next = exec_domains;
108 exec_domains = ep;
109 err = 0;
110
111out:
112 write_unlock(&exec_domains_lock);
113 return err;
114}
115EXPORT_SYMBOL(register_exec_domain);
116
117int
118unregister_exec_domain(struct exec_domain *ep)
119{
120 struct exec_domain **epp;
121
122 epp = &exec_domains;
123 write_lock(&exec_domains_lock);
124 for (epp = &exec_domains; *epp; epp = &(*epp)->next) {
125 if (ep == *epp)
126 goto unregister;
127 }
128 write_unlock(&exec_domains_lock);
129 return -EINVAL;
130
131unregister:
132 *epp = ep->next;
133 ep->next = NULL;
134 write_unlock(&exec_domains_lock);
135 return 0;
136}
137EXPORT_SYMBOL(unregister_exec_domain);
138
139int __set_personality(unsigned int personality)
140{
141 struct exec_domain *oep = current_thread_info()->exec_domain;
142
143 current_thread_info()->exec_domain = lookup_exec_domain(personality);
144 current->personality = personality;
145 module_put(oep->module);
146
147 return 0;
148}
149EXPORT_SYMBOL(__set_personality);
150
151#ifdef CONFIG_PROC_FS 23#ifdef CONFIG_PROC_FS
152static int execdomains_proc_show(struct seq_file *m, void *v) 24static int execdomains_proc_show(struct seq_file *m, void *v)
153{ 25{
154 struct exec_domain *ep; 26 seq_puts(m, "0-0\tLinux \t[kernel]\n");
155
156 read_lock(&exec_domains_lock);
157 for (ep = exec_domains; ep; ep = ep->next)
158 seq_printf(m, "%d-%d\t%-16s\t[%s]\n",
159 ep->pers_low, ep->pers_high, ep->name,
160 module_name(ep->module));
161 read_unlock(&exec_domains_lock);
162 return 0; 27 return 0;
163} 28}
164 29
diff --git a/kernel/exit.c b/kernel/exit.c
index feff10bbb307..22fcc05dec40 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -756,8 +756,6 @@ void do_exit(long code)
756 756
757 cgroup_exit(tsk); 757 cgroup_exit(tsk);
758 758
759 module_put(task_thread_info(tsk)->exec_domain->module);
760
761 /* 759 /*
762 * FIXME: do that only when needed, using sched_exit tracepoint 760 * FIXME: do that only when needed, using sched_exit tracepoint
763 */ 761 */
diff --git a/kernel/fork.c b/kernel/fork.c
index cf65139615a0..03c1eaaa6ef5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -74,6 +74,7 @@
74#include <linux/uprobes.h> 74#include <linux/uprobes.h>
75#include <linux/aio.h> 75#include <linux/aio.h>
76#include <linux/compiler.h> 76#include <linux/compiler.h>
77#include <linux/sysctl.h>
77 78
78#include <asm/pgtable.h> 79#include <asm/pgtable.h>
79#include <asm/pgalloc.h> 80#include <asm/pgalloc.h>
@@ -88,6 +89,16 @@
88#include <trace/events/task.h> 89#include <trace/events/task.h>
89 90
90/* 91/*
92 * Minimum number of threads to boot the kernel
93 */
94#define MIN_THREADS 20
95
96/*
97 * Maximum number of threads
98 */
99#define MAX_THREADS FUTEX_TID_MASK
100
101/*
91 * Protected counters by write_lock_irq(&tasklist_lock) 102 * Protected counters by write_lock_irq(&tasklist_lock)
92 */ 103 */
93unsigned long total_forks; /* Handle normal Linux uptimes. */ 104unsigned long total_forks; /* Handle normal Linux uptimes. */
@@ -253,7 +264,30 @@ EXPORT_SYMBOL_GPL(__put_task_struct);
253 264
254void __init __weak arch_task_cache_init(void) { } 265void __init __weak arch_task_cache_init(void) { }
255 266
256void __init fork_init(unsigned long mempages) 267/*
268 * set_max_threads - derive max_threads from the amount of available memory
269 */
270static void set_max_threads(unsigned int max_threads_suggested)
271{
272 u64 threads;
273
274 /*
275 * The number of threads shall be limited such that the thread
276 * structures may only consume a small part of the available memory.
277 */
278 if (fls64(totalram_pages) + fls64(PAGE_SIZE) > 64)
279 threads = MAX_THREADS;
280 else
281 threads = div64_u64((u64) totalram_pages * (u64) PAGE_SIZE,
282 (u64) THREAD_SIZE * 8UL);
283
284 if (threads > max_threads_suggested)
285 threads = max_threads_suggested;
286
287 max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS);
288}
289
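As a worked example of the new sizing rule (all numbers assumed: 8 GiB of RAM, 4 KiB pages, 16 KiB THREAD_SIZE), the limit comes out so that thread structures can take at most one eighth of memory:

u64 threads = div64_u64((u64)8 << 30,		/* totalram_pages * PAGE_SIZE */
			(u64)16384 * 8);	/* THREAD_SIZE * 8            */
/* threads = 65536; their 16 KiB stacks can consume at most 1 GiB,   */
/* one eighth of the assumed RAM, and the value is then clamped to   */
/* [MIN_THREADS, MAX_THREADS] = [20, FUTEX_TID_MASK].                */
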
290void __init fork_init(void)
257{ 291{
258#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR 292#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
259#ifndef ARCH_MIN_TASKALIGN 293#ifndef ARCH_MIN_TASKALIGN
@@ -268,18 +302,7 @@ void __init fork_init(unsigned long mempages)
268 /* do the arch specific task caches init */ 302 /* do the arch specific task caches init */
269 arch_task_cache_init(); 303 arch_task_cache_init();
270 304
271 /* 305 set_max_threads(MAX_THREADS);
272 * The default maximum number of threads is set to a safe
273 * value: the thread structures can take up at most half
274 * of memory.
275 */
276 max_threads = mempages / (8 * THREAD_SIZE / PAGE_SIZE);
277
278 /*
279 * we need to allow at least 20 threads to boot a system
280 */
281 if (max_threads < 20)
282 max_threads = 20;
283 306
284 init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2; 307 init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
285 init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2; 308 init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
@@ -380,6 +403,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
380 */ 403 */
381 down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); 404 down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
382 405
406 /* No ordering required: file already has been exposed. */
407 RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
408
383 mm->total_vm = oldmm->total_vm; 409 mm->total_vm = oldmm->total_vm;
384 mm->shared_vm = oldmm->shared_vm; 410 mm->shared_vm = oldmm->shared_vm;
385 mm->exec_vm = oldmm->exec_vm; 411 mm->exec_vm = oldmm->exec_vm;
@@ -505,7 +531,13 @@ static inline void mm_free_pgd(struct mm_struct *mm)
505 pgd_free(mm, mm->pgd); 531 pgd_free(mm, mm->pgd);
506} 532}
507#else 533#else
508#define dup_mmap(mm, oldmm) (0) 534static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
535{
536 down_write(&oldmm->mmap_sem);
537 RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
538 up_write(&oldmm->mmap_sem);
539 return 0;
540}
509#define mm_alloc_pgd(mm) (0) 541#define mm_alloc_pgd(mm) (0)
510#define mm_free_pgd(mm) 542#define mm_free_pgd(mm)
511#endif /* CONFIG_MMU */ 543#endif /* CONFIG_MMU */
@@ -674,34 +706,53 @@ void mmput(struct mm_struct *mm)
674} 706}
675EXPORT_SYMBOL_GPL(mmput); 707EXPORT_SYMBOL_GPL(mmput);
676 708
709/**
710 * set_mm_exe_file - change a reference to the mm's executable file
711 *
712 * This changes mm's executable file (shown as symlink /proc/[pid]/exe).
713 *
714 * Main users are mmput() and sys_execve(). Callers prevent concurrent
715 * invocations: in mmput() nobody alive left, in execve task is single
716 * threaded. sys_prctl(PR_SET_MM_MAP/EXE_FILE) also needs to set the
717 * mm->exe_file, but does so without using set_mm_exe_file() in order
718 * to do avoid the need for any locks.
719 */
677void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) 720void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
678{ 721{
722 struct file *old_exe_file;
723
724 /*
725 * It is safe to dereference the exe_file without RCU as
726 * this function is only called if nobody else can access
727 * this mm -- see comment above for justification.
728 */
729 old_exe_file = rcu_dereference_raw(mm->exe_file);
730
679 if (new_exe_file) 731 if (new_exe_file)
680 get_file(new_exe_file); 732 get_file(new_exe_file);
681 if (mm->exe_file) 733 rcu_assign_pointer(mm->exe_file, new_exe_file);
682 fput(mm->exe_file); 734 if (old_exe_file)
683 mm->exe_file = new_exe_file; 735 fput(old_exe_file);
684} 736}
685 737
738/**
739 * get_mm_exe_file - acquire a reference to the mm's executable file
740 *
741 * Returns %NULL if mm has no associated executable file.
742 * User must release file via fput().
743 */
686struct file *get_mm_exe_file(struct mm_struct *mm) 744struct file *get_mm_exe_file(struct mm_struct *mm)
687{ 745{
688 struct file *exe_file; 746 struct file *exe_file;
689 747
690 /* We need mmap_sem to protect against races with removal of exe_file */ 748 rcu_read_lock();
691 down_read(&mm->mmap_sem); 749 exe_file = rcu_dereference(mm->exe_file);
692 exe_file = mm->exe_file; 750 if (exe_file && !get_file_rcu(exe_file))
693 if (exe_file) 751 exe_file = NULL;
694 get_file(exe_file); 752 rcu_read_unlock();
695 up_read(&mm->mmap_sem);
696 return exe_file; 753 return exe_file;
697} 754}
698 755EXPORT_SYMBOL(get_mm_exe_file);
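With the mmap_sem round trip gone, a caller of the exported helper only has to balance the reference with fput(). A hypothetical caller, purely for illustration:

/* Sketch: report the executable backing an mm, if there is one. */
static void my_report_exe(struct mm_struct *mm)
{
	struct file *exe_file = get_mm_exe_file(mm);

	if (!exe_file)
		return;

	pr_info("exe file: %pD\n", exe_file);	/* %pD prints the file name */
	fput(exe_file);				/* drop the reference we got */
}
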
699static void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm)
700{
701 /* It's safe to write the exe_file pointer without exe_file_lock because
702 * this is called during fork when the task is not yet in /proc */
703 newmm->exe_file = get_mm_exe_file(oldmm);
704}
705 756
706/** 757/**
707 * get_task_mm - acquire a reference to the task's mm 758 * get_task_mm - acquire a reference to the task's mm
@@ -864,8 +915,6 @@ static struct mm_struct *dup_mm(struct task_struct *tsk)
864 if (!mm_init(mm, tsk)) 915 if (!mm_init(mm, tsk))
865 goto fail_nomem; 916 goto fail_nomem;
866 917
867 dup_mm_exe_file(oldmm, mm);
868
869 err = dup_mmap(mm, oldmm); 918 err = dup_mmap(mm, oldmm);
870 if (err) 919 if (err)
871 goto free_pt; 920 goto free_pt;
@@ -1279,9 +1328,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1279 if (nr_threads >= max_threads) 1328 if (nr_threads >= max_threads)
1280 goto bad_fork_cleanup_count; 1329 goto bad_fork_cleanup_count;
1281 1330
1282 if (!try_module_get(task_thread_info(p)->exec_domain->module))
1283 goto bad_fork_cleanup_count;
1284
1285 delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ 1331 delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
1286 p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER); 1332 p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
1287 p->flags |= PF_FORKNOEXEC; 1333 p->flags |= PF_FORKNOEXEC;
@@ -1406,10 +1452,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1406 goto bad_fork_cleanup_io; 1452 goto bad_fork_cleanup_io;
1407 1453
1408 if (pid != &init_struct_pid) { 1454 if (pid != &init_struct_pid) {
1409 retval = -ENOMEM;
1410 pid = alloc_pid(p->nsproxy->pid_ns_for_children); 1455 pid = alloc_pid(p->nsproxy->pid_ns_for_children);
1411 if (!pid) 1456 if (IS_ERR(pid)) {
1457 retval = PTR_ERR(pid);
1412 goto bad_fork_cleanup_io; 1458 goto bad_fork_cleanup_io;
1459 }
1413 } 1460 }
1414 1461
1415 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; 1462 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
@@ -1590,7 +1637,6 @@ bad_fork_cleanup_threadgroup_lock:
1590 if (clone_flags & CLONE_THREAD) 1637 if (clone_flags & CLONE_THREAD)
1591 threadgroup_change_end(current); 1638 threadgroup_change_end(current);
1592 delayacct_tsk_free(p); 1639 delayacct_tsk_free(p);
1593 module_put(task_thread_info(p)->exec_domain->module);
1594bad_fork_cleanup_count: 1640bad_fork_cleanup_count:
1595 atomic_dec(&p->cred->user->processes); 1641 atomic_dec(&p->cred->user->processes);
1596 exit_creds(p); 1642 exit_creds(p);
@@ -2004,3 +2050,26 @@ int unshare_files(struct files_struct **displaced)
2004 task_unlock(task); 2050 task_unlock(task);
2005 return 0; 2051 return 0;
2006} 2052}
2053
2054int sysctl_max_threads(struct ctl_table *table, int write,
2055 void __user *buffer, size_t *lenp, loff_t *ppos)
2056{
2057 struct ctl_table t;
2058 int ret;
2059 int threads = max_threads;
2060 int min = MIN_THREADS;
2061 int max = MAX_THREADS;
2062
2063 t = *table;
2064 t.data = &threads;
2065 t.extra1 = &min;
2066 t.extra2 = &max;
2067
2068 ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
2069 if (ret || !write)
2070 return ret;
2071
2072 set_max_threads(threads);
2073
2074 return 0;
2075}
diff --git a/kernel/futex.c b/kernel/futex.c
index 2a5e3830e953..2579e407ff67 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -900,7 +900,7 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
900 if (!p) 900 if (!p)
901 return -ESRCH; 901 return -ESRCH;
902 902
903 if (!p->mm) { 903 if (unlikely(p->flags & PF_KTHREAD)) {
904 put_task_struct(p); 904 put_task_struct(p);
905 return -EPERM; 905 return -EPERM;
906 } 906 }
diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c
index b358a802fd18..a744098e4eb7 100644
--- a/kernel/gcov/base.c
+++ b/kernel/gcov/base.c
@@ -18,6 +18,7 @@
18#include <linux/init.h> 18#include <linux/init.h>
19#include <linux/module.h> 19#include <linux/module.h>
20#include <linux/mutex.h> 20#include <linux/mutex.h>
21#include <linux/sched.h>
21#include "gcov.h" 22#include "gcov.h"
22 23
23static int gcov_events_enabled; 24static int gcov_events_enabled;
@@ -107,8 +108,10 @@ void gcov_enable_events(void)
107 gcov_events_enabled = 1; 108 gcov_events_enabled = 1;
108 109
109 /* Perform event callback for previously registered entries. */ 110 /* Perform event callback for previously registered entries. */
110 while ((info = gcov_info_next(info))) 111 while ((info = gcov_info_next(info))) {
111 gcov_event(GCOV_ADD, info); 112 gcov_event(GCOV_ADD, info);
113 cond_resched();
114 }
112 115
113 mutex_unlock(&gcov_lock); 116 mutex_unlock(&gcov_lock);
114} 117}
diff --git a/kernel/groups.c b/kernel/groups.c
index 664411f171b5..74d431d25251 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -9,9 +9,6 @@
9#include <linux/user_namespace.h> 9#include <linux/user_namespace.h>
10#include <asm/uaccess.h> 10#include <asm/uaccess.h>
11 11
12/* init to 2 - one for init_task, one to ensure it is never freed */
13struct group_info init_groups = { .usage = ATOMIC_INIT(2) };
14
15struct group_info *groups_alloc(int gidsetsize) 12struct group_info *groups_alloc(int gidsetsize)
16{ 13{
17 struct group_info *group_info; 14 struct group_info *group_info;
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 06db12434d72..e0f90c2b57aa 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -169,7 +169,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
169 return; 169 return;
170 170
171 rcu_read_lock(); 171 rcu_read_lock();
172 do_each_thread(g, t) { 172 for_each_process_thread(g, t) {
173 if (!max_count--) 173 if (!max_count--)
174 goto unlock; 174 goto unlock;
175 if (!--batch_count) { 175 if (!--batch_count) {
@@ -180,7 +180,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
180 /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ 180 /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
181 if (t->state == TASK_UNINTERRUPTIBLE) 181 if (t->state == TASK_UNINTERRUPTIBLE)
182 check_hung_task(t, timeout); 182 check_hung_task(t, timeout);
183 } while_each_thread(g, t); 183 }
184 unlock: 184 unlock:
185 rcu_read_unlock(); 185 rcu_read_unlock();
186} 186}
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 6f1c7a566b95..eb9a4ea394ab 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -948,6 +948,22 @@ int irq_chip_retrigger_hierarchy(struct irq_data *data)
948 948
949 return -ENOSYS; 949 return -ENOSYS;
950} 950}
951
952/**
953 * irq_chip_set_wake_parent - Set/reset wake-up on the parent interrupt
954 * @data: Pointer to interrupt specific data
955 * @on: Whether to set or reset the wake-up capability of this irq
956 *
957 * Conditional, as the underlying parent chip might not implement it.
958 */
959int irq_chip_set_wake_parent(struct irq_data *data, unsigned int on)
960{
961 data = data->parent_data;
962 if (data->chip->irq_set_wake)
963 return data->chip->irq_set_wake(data, on);
964
965 return -ENOSYS;
966}
951#endif 967#endif
952 968
953/** 969/**
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 886d09e691d5..e68932bb308e 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -68,14 +68,20 @@ static void __synchronize_hardirq(struct irq_desc *desc)
68 * Do not use this for shutdown scenarios where you must be sure 68 * Do not use this for shutdown scenarios where you must be sure
69 * that all parts (hardirq and threaded handler) have completed. 69 * that all parts (hardirq and threaded handler) have completed.
70 * 70 *
71 * Returns: false if a threaded handler is active.
72 *
71 * This function may be called - with care - from IRQ context. 73 * This function may be called - with care - from IRQ context.
72 */ 74 */
73void synchronize_hardirq(unsigned int irq) 75bool synchronize_hardirq(unsigned int irq)
74{ 76{
75 struct irq_desc *desc = irq_to_desc(irq); 77 struct irq_desc *desc = irq_to_desc(irq);
76 78
77 if (desc) 79 if (desc) {
78 __synchronize_hardirq(desc); 80 __synchronize_hardirq(desc);
81 return !atomic_read(&desc->threads_active);
82 }
83
84 return true;
79} 85}
80EXPORT_SYMBOL(synchronize_hardirq); 86EXPORT_SYMBOL(synchronize_hardirq);
81 87
@@ -440,6 +446,32 @@ void disable_irq(unsigned int irq)
440} 446}
441EXPORT_SYMBOL(disable_irq); 447EXPORT_SYMBOL(disable_irq);
442 448
449/**
450 * disable_hardirq - disables an irq and waits for hardirq completion
451 * @irq: Interrupt to disable
452 *
453 * Disable the selected interrupt line. Enables and Disables are
454 * nested.
455 * This function waits for any pending hard IRQ handlers for this
456 * interrupt to complete before returning. If you use this function while
457 * holding a resource the hard IRQ handler may need, you will deadlock.
458 *
459 * When used to optimistically disable an interrupt from atomic context,
460 * the return value must be checked.
461 *
462 * Returns: false if a threaded handler is active.
463 *
464 * This function may be called - with care - from IRQ context.
465 */
466bool disable_hardirq(unsigned int irq)
467{
468 if (!__disable_irq_nosync(irq))
469 return synchronize_hardirq(irq);
470
471 return false;
472}
473EXPORT_SYMBOL_GPL(disable_hardirq);
474
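The boolean return is what makes optimistic use from atomic context workable: if a threaded handler is still running, the caller has to back off instead of spinning or sleeping. A hedged sketch of that pattern; the wrapper name is invented, netpoll-style users are the intended audience.

/* Try to quiesce an IRQ line without sleeping (illustration only). */
static bool my_try_quiesce_irq(unsigned int irq)
{
	if (disable_hardirq(irq))
		return true;	/* hard IRQ disabled and drained */

	/*
	 * A threaded handler is still active: undo and let the caller
	 * retry later from a context that may sleep.
	 */
	enable_irq(irq);
	return false;
}
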
443void __enable_irq(struct irq_desc *desc, unsigned int irq) 475void __enable_irq(struct irq_desc *desc, unsigned int irq)
444{ 476{
445 switch (desc->depth) { 477 switch (desc->depth) {
@@ -1766,3 +1798,94 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler,
1766 1798
1767 return retval; 1799 return retval;
1768} 1800}
1801
1802/**
1803 * irq_get_irqchip_state - returns the irqchip state of an interrupt.
1804 * @irq: Interrupt line that is forwarded to a VM
1805 * @which: One of IRQCHIP_STATE_* the caller wants to know about
1806 * @state: a pointer to a boolean where the state is to be stored
1807 *
1808 * This call snapshots the internal irqchip state of an
1809 * interrupt, returning into @state the bit corresponding to
1810 * state @which.
1811 *
1812 * This function should be called with preemption disabled if the
1813 * interrupt controller has per-cpu registers.
1814 */
1815int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
1816 bool *state)
1817{
1818 struct irq_desc *desc;
1819 struct irq_data *data;
1820 struct irq_chip *chip;
1821 unsigned long flags;
1822 int err = -EINVAL;
1823
1824 desc = irq_get_desc_buslock(irq, &flags, 0);
1825 if (!desc)
1826 return err;
1827
1828 data = irq_desc_get_irq_data(desc);
1829
1830 do {
1831 chip = irq_data_get_irq_chip(data);
1832 if (chip->irq_get_irqchip_state)
1833 break;
1834#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
1835 data = data->parent_data;
1836#else
1837 data = NULL;
1838#endif
1839 } while (data);
1840
1841 if (data)
1842 err = chip->irq_get_irqchip_state(data, which, state);
1843
1844 irq_put_desc_busunlock(desc, flags);
1845 return err;
1846}
1847
1848/**
1849 * irq_set_irqchip_state - set the state of a forwarded interrupt.
1850 * @irq: Interrupt line that is forwarded to a VM
1851 * @which: State to be restored (one of IRQCHIP_STATE_*)
1852 * @val: Value corresponding to @which
1853 *
1854 * This call sets the internal irqchip state of an interrupt,
1855 * depending on the value of @which.
1856 *
1857 * This function should be called with preemption disabled if the
1858 * interrupt controller has per-cpu registers.
1859 */
1860int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
1861 bool val)
1862{
1863 struct irq_desc *desc;
1864 struct irq_data *data;
1865 struct irq_chip *chip;
1866 unsigned long flags;
1867 int err = -EINVAL;
1868
1869 desc = irq_get_desc_buslock(irq, &flags, 0);
1870 if (!desc)
1871 return err;
1872
1873 data = irq_desc_get_irq_data(desc);
1874
1875 do {
1876 chip = irq_data_get_irq_chip(data);
1877 if (chip->irq_set_irqchip_state)
1878 break;
1879#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
1880 data = data->parent_data;
1881#else
1882 data = NULL;
1883#endif
1884 } while (data);
1885
1886 if (data)
1887 err = chip->irq_set_irqchip_state(data, which, val);
1888
1889 irq_put_desc_busunlock(desc, flags);
1890 return err;
1891}
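A short, hypothetical example of the intended use for forwarded interrupts: snapshot the pending bit before handing the line to a guest, and write it back afterwards.

/* Sketch only; error handling kept minimal. */
static int my_save_restore_pending(unsigned int irq)
{
	bool pending;
	int err;

	err = irq_get_irqchip_state(irq, IRQCHIP_STATE_PENDING, &pending);
	if (err)
		return err;

	/* ... forward the interrupt to the VM, later reclaim it ... */

	return irq_set_irqchip_state(irq, IRQCHIP_STATE_PENDING, pending);
}
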
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 3e18163f336f..474de5cb394d 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -310,8 +310,15 @@ void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
310 struct msi_desc *desc; 310 struct msi_desc *desc;
311 311
312 for_each_msi_entry(desc, dev) { 312 for_each_msi_entry(desc, dev) {
313 irq_domain_free_irqs(desc->irq, desc->nvec_used); 313 /*
314 desc->irq = 0; 314 * We might have failed to allocate an MSI early
315 * enough that there is no IRQ associated to this
316 * entry. If that's the case, don't do anything.
317 */
318 if (desc->irq) {
319 irq_domain_free_irqs(desc->irq, desc->nvec_used);
320 desc->irq = 0;
321 }
315 } 322 }
316} 323}
317 324
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index 3f9f1d6b4c2e..284e2691e380 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -335,32 +335,20 @@ unlock:
335 rcu_read_unlock(); 335 rcu_read_unlock();
336} 336}
337 337
338static int klp_disable_func(struct klp_func *func) 338static void klp_disable_func(struct klp_func *func)
339{ 339{
340 struct klp_ops *ops; 340 struct klp_ops *ops;
341 int ret;
342
343 if (WARN_ON(func->state != KLP_ENABLED))
344 return -EINVAL;
345 341
346 if (WARN_ON(!func->old_addr)) 342 WARN_ON(func->state != KLP_ENABLED);
347 return -EINVAL; 343 WARN_ON(!func->old_addr);
348 344
349 ops = klp_find_ops(func->old_addr); 345 ops = klp_find_ops(func->old_addr);
350 if (WARN_ON(!ops)) 346 if (WARN_ON(!ops))
351 return -EINVAL; 347 return;
352 348
353 if (list_is_singular(&ops->func_stack)) { 349 if (list_is_singular(&ops->func_stack)) {
354 ret = unregister_ftrace_function(&ops->fops); 350 WARN_ON(unregister_ftrace_function(&ops->fops));
355 if (ret) { 351 WARN_ON(ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0));
356 pr_err("failed to unregister ftrace handler for function '%s' (%d)\n",
357 func->old_name, ret);
358 return ret;
359 }
360
361 ret = ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0);
362 if (ret)
363 pr_warn("function unregister succeeded but failed to clear the filter\n");
364 352
365 list_del_rcu(&func->stack_node); 353 list_del_rcu(&func->stack_node);
366 list_del(&ops->node); 354 list_del(&ops->node);
@@ -370,8 +358,6 @@ static int klp_disable_func(struct klp_func *func)
370 } 358 }
371 359
372 func->state = KLP_DISABLED; 360 func->state = KLP_DISABLED;
373
374 return 0;
375} 361}
376 362
377static int klp_enable_func(struct klp_func *func) 363static int klp_enable_func(struct klp_func *func)
@@ -432,23 +418,15 @@ err:
432 return ret; 418 return ret;
433} 419}
434 420
435static int klp_disable_object(struct klp_object *obj) 421static void klp_disable_object(struct klp_object *obj)
436{ 422{
437 struct klp_func *func; 423 struct klp_func *func;
438 int ret;
439 424
440 for (func = obj->funcs; func->old_name; func++) { 425 for (func = obj->funcs; func->old_name; func++)
441 if (func->state != KLP_ENABLED) 426 if (func->state == KLP_ENABLED)
442 continue; 427 klp_disable_func(func);
443
444 ret = klp_disable_func(func);
445 if (ret)
446 return ret;
447 }
448 428
449 obj->state = KLP_DISABLED; 429 obj->state = KLP_DISABLED;
450
451 return 0;
452} 430}
453 431
454static int klp_enable_object(struct klp_object *obj) 432static int klp_enable_object(struct klp_object *obj)
@@ -464,22 +442,19 @@ static int klp_enable_object(struct klp_object *obj)
464 442
465 for (func = obj->funcs; func->old_name; func++) { 443 for (func = obj->funcs; func->old_name; func++) {
466 ret = klp_enable_func(func); 444 ret = klp_enable_func(func);
467 if (ret) 445 if (ret) {
468 goto unregister; 446 klp_disable_object(obj);
447 return ret;
448 }
469 } 449 }
470 obj->state = KLP_ENABLED; 450 obj->state = KLP_ENABLED;
471 451
472 return 0; 452 return 0;
473
474unregister:
475 WARN_ON(klp_disable_object(obj));
476 return ret;
477} 453}
478 454
479static int __klp_disable_patch(struct klp_patch *patch) 455static int __klp_disable_patch(struct klp_patch *patch)
480{ 456{
481 struct klp_object *obj; 457 struct klp_object *obj;
482 int ret;
483 458
484 /* enforce stacking: only the last enabled patch can be disabled */ 459 /* enforce stacking: only the last enabled patch can be disabled */
485 if (!list_is_last(&patch->list, &klp_patches) && 460 if (!list_is_last(&patch->list, &klp_patches) &&
@@ -489,12 +464,8 @@ static int __klp_disable_patch(struct klp_patch *patch)
489 pr_notice("disabling patch '%s'\n", patch->mod->name); 464 pr_notice("disabling patch '%s'\n", patch->mod->name);
490 465
491 for (obj = patch->objs; obj->funcs; obj++) { 466 for (obj = patch->objs; obj->funcs; obj++) {
492 if (obj->state != KLP_ENABLED) 467 if (obj->state == KLP_ENABLED)
493 continue; 468 klp_disable_object(obj);
494
495 ret = klp_disable_object(obj);
496 if (ret)
497 return ret;
498 } 469 }
499 470
500 patch->state = KLP_DISABLED; 471 patch->state = KLP_DISABLED;
@@ -553,8 +524,6 @@ static int __klp_enable_patch(struct klp_patch *patch)
553 pr_notice("enabling patch '%s'\n", patch->mod->name); 524 pr_notice("enabling patch '%s'\n", patch->mod->name);
554 525
555 for (obj = patch->objs; obj->funcs; obj++) { 526 for (obj = patch->objs; obj->funcs; obj++) {
556 klp_find_object_module(obj);
557
558 if (!klp_is_object_loaded(obj)) 527 if (!klp_is_object_loaded(obj))
559 continue; 528 continue;
560 529
@@ -945,7 +914,6 @@ static void klp_module_notify_going(struct klp_patch *patch,
945{ 914{
946 struct module *pmod = patch->mod; 915 struct module *pmod = patch->mod;
947 struct module *mod = obj->mod; 916 struct module *mod = obj->mod;
948 int ret;
949 917
950 if (patch->state == KLP_DISABLED) 918 if (patch->state == KLP_DISABLED)
951 goto disabled; 919 goto disabled;
@@ -953,10 +921,7 @@ static void klp_module_notify_going(struct klp_patch *patch,
953 pr_notice("reverting patch '%s' on unloading module '%s'\n", 921 pr_notice("reverting patch '%s' on unloading module '%s'\n",
954 pmod->name, mod->name); 922 pmod->name, mod->name);
955 923
956 ret = klp_disable_object(obj); 924 klp_disable_object(obj);
957 if (ret)
958 pr_warn("failed to revert patch '%s' on module '%s' (%d)\n",
959 pmod->name, mod->name, ret);
960 925
961disabled: 926disabled:
962 klp_free_object_loaded(obj); 927 klp_free_object_loaded(obj);
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 88d0d4420ad2..a0831e1b99f4 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -551,7 +551,21 @@ static void print_lockdep_cache(struct lockdep_map *lock)
551 551
552static void print_lock(struct held_lock *hlock) 552static void print_lock(struct held_lock *hlock)
553{ 553{
554 print_lock_name(hlock_class(hlock)); 554 /*
555 * We can be called locklessly through debug_show_all_locks() so be
556 * extra careful, the hlock might have been released and cleared.
557 */
558 unsigned int class_idx = hlock->class_idx;
559
560 /* Don't re-read hlock->class_idx, can't use READ_ONCE() on bitfields: */
561 barrier();
562
563 if (!class_idx || (class_idx - 1) >= MAX_LOCKDEP_KEYS) {
564 printk("<RELEASED>\n");
565 return;
566 }
567
568 print_lock_name(lock_classes + class_idx - 1);
555 printk(", at: "); 569 printk(", at: ");
556 print_ip_sym(hlock->acquire_ip); 570 print_ip_sym(hlock->acquire_ip);
557} 571}
@@ -633,7 +647,7 @@ static int count_matching_names(struct lock_class *new_class)
633 if (!new_class->name) 647 if (!new_class->name)
634 return 0; 648 return 0;
635 649
636 list_for_each_entry(class, &all_lock_classes, lock_entry) { 650 list_for_each_entry_rcu(class, &all_lock_classes, lock_entry) {
637 if (new_class->key - new_class->subclass == class->key) 651 if (new_class->key - new_class->subclass == class->key)
638 return class->name_version; 652 return class->name_version;
639 if (class->name && !strcmp(class->name, new_class->name)) 653 if (class->name && !strcmp(class->name, new_class->name))
@@ -700,10 +714,12 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
700 hash_head = classhashentry(key); 714 hash_head = classhashentry(key);
701 715
702 /* 716 /*
703 * We can walk the hash lockfree, because the hash only 717 * We do an RCU walk of the hash, see lockdep_free_key_range().
704 * grows, and we are careful when adding entries to the end:
705 */ 718 */
706 list_for_each_entry(class, hash_head, hash_entry) { 719 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
720 return NULL;
721
722 list_for_each_entry_rcu(class, hash_head, hash_entry) {
707 if (class->key == key) { 723 if (class->key == key) {
708 /* 724 /*
709 * Huh! same key, different name? Did someone trample 725 * Huh! same key, different name? Did someone trample
@@ -728,7 +744,8 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
728 struct lockdep_subclass_key *key; 744 struct lockdep_subclass_key *key;
729 struct list_head *hash_head; 745 struct list_head *hash_head;
730 struct lock_class *class; 746 struct lock_class *class;
731 unsigned long flags; 747
748 DEBUG_LOCKS_WARN_ON(!irqs_disabled());
732 749
733 class = look_up_lock_class(lock, subclass); 750 class = look_up_lock_class(lock, subclass);
734 if (likely(class)) 751 if (likely(class))
@@ -750,28 +767,26 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
750 key = lock->key->subkeys + subclass; 767 key = lock->key->subkeys + subclass;
751 hash_head = classhashentry(key); 768 hash_head = classhashentry(key);
752 769
753 raw_local_irq_save(flags);
754 if (!graph_lock()) { 770 if (!graph_lock()) {
755 raw_local_irq_restore(flags);
756 return NULL; 771 return NULL;
757 } 772 }
758 /* 773 /*
759 * We have to do the hash-walk again, to avoid races 774 * We have to do the hash-walk again, to avoid races
760 * with another CPU: 775 * with another CPU:
761 */ 776 */
762 list_for_each_entry(class, hash_head, hash_entry) 777 list_for_each_entry_rcu(class, hash_head, hash_entry) {
763 if (class->key == key) 778 if (class->key == key)
764 goto out_unlock_set; 779 goto out_unlock_set;
780 }
781
765 /* 782 /*
766 * Allocate a new key from the static array, and add it to 783 * Allocate a new key from the static array, and add it to
767 * the hash: 784 * the hash:
768 */ 785 */
769 if (nr_lock_classes >= MAX_LOCKDEP_KEYS) { 786 if (nr_lock_classes >= MAX_LOCKDEP_KEYS) {
770 if (!debug_locks_off_graph_unlock()) { 787 if (!debug_locks_off_graph_unlock()) {
771 raw_local_irq_restore(flags);
772 return NULL; 788 return NULL;
773 } 789 }
774 raw_local_irq_restore(flags);
775 790
776 print_lockdep_off("BUG: MAX_LOCKDEP_KEYS too low!"); 791 print_lockdep_off("BUG: MAX_LOCKDEP_KEYS too low!");
777 dump_stack(); 792 dump_stack();
@@ -798,7 +813,6 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
798 813
799 if (verbose(class)) { 814 if (verbose(class)) {
800 graph_unlock(); 815 graph_unlock();
801 raw_local_irq_restore(flags);
802 816
803 printk("\nnew class %p: %s", class->key, class->name); 817 printk("\nnew class %p: %s", class->key, class->name);
804 if (class->name_version > 1) 818 if (class->name_version > 1)
@@ -806,15 +820,12 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
806 printk("\n"); 820 printk("\n");
807 dump_stack(); 821 dump_stack();
808 822
809 raw_local_irq_save(flags);
810 if (!graph_lock()) { 823 if (!graph_lock()) {
811 raw_local_irq_restore(flags);
812 return NULL; 824 return NULL;
813 } 825 }
814 } 826 }
815out_unlock_set: 827out_unlock_set:
816 graph_unlock(); 828 graph_unlock();
817 raw_local_irq_restore(flags);
818 829
819out_set_class_cache: 830out_set_class_cache:
820 if (!subclass || force) 831 if (!subclass || force)
@@ -870,11 +881,9 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
870 entry->distance = distance; 881 entry->distance = distance;
871 entry->trace = *trace; 882 entry->trace = *trace;
872 /* 883 /*
873 * Since we never remove from the dependency list, the list can 884 * Both allocation and removal are done under the graph lock; but
874 * be walked lockless by other CPUs, it's only allocation 885 * iteration is under RCU-sched; see look_up_lock_class() and
875 * that must be protected by the spinlock. But this also means 886 * lockdep_free_key_range().
876 * we must make new entries visible only once writes to the
877 * entry become visible - hence the RCU op:
878 */ 887 */
879 list_add_tail_rcu(&entry->entry, head); 888 list_add_tail_rcu(&entry->entry, head);
880 889
@@ -1025,7 +1034,9 @@ static int __bfs(struct lock_list *source_entry,
1025 else 1034 else
1026 head = &lock->class->locks_before; 1035 head = &lock->class->locks_before;
1027 1036
1028 list_for_each_entry(entry, head, entry) { 1037 DEBUG_LOCKS_WARN_ON(!irqs_disabled());
1038
1039 list_for_each_entry_rcu(entry, head, entry) {
1029 if (!lock_accessed(entry)) { 1040 if (!lock_accessed(entry)) {
1030 unsigned int cq_depth; 1041 unsigned int cq_depth;
1031 mark_lock_accessed(entry, lock); 1042 mark_lock_accessed(entry, lock);
@@ -2022,7 +2033,7 @@ static inline int lookup_chain_cache(struct task_struct *curr,
2022 * We can walk it lock-free, because entries only get added 2033 * We can walk it lock-free, because entries only get added
2023 * to the hash: 2034 * to the hash:
2024 */ 2035 */
2025 list_for_each_entry(chain, hash_head, entry) { 2036 list_for_each_entry_rcu(chain, hash_head, entry) {
2026 if (chain->chain_key == chain_key) { 2037 if (chain->chain_key == chain_key) {
2027cache_hit: 2038cache_hit:
2028 debug_atomic_inc(chain_lookup_hits); 2039 debug_atomic_inc(chain_lookup_hits);
@@ -2996,8 +3007,18 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
2996 if (unlikely(!debug_locks)) 3007 if (unlikely(!debug_locks))
2997 return; 3008 return;
2998 3009
2999 if (subclass) 3010 if (subclass) {
3011 unsigned long flags;
3012
3013 if (DEBUG_LOCKS_WARN_ON(current->lockdep_recursion))
3014 return;
3015
3016 raw_local_irq_save(flags);
3017 current->lockdep_recursion = 1;
3000 register_lock_class(lock, subclass, 1); 3018 register_lock_class(lock, subclass, 1);
3019 current->lockdep_recursion = 0;
3020 raw_local_irq_restore(flags);
3021 }
3001} 3022}
3002EXPORT_SYMBOL_GPL(lockdep_init_map); 3023EXPORT_SYMBOL_GPL(lockdep_init_map);
3003 3024
@@ -3887,9 +3908,17 @@ static inline int within(const void *addr, void *start, unsigned long size)
3887 return addr >= start && addr < start + size; 3908 return addr >= start && addr < start + size;
3888} 3909}
3889 3910
3911/*
3912 * Used in module.c to remove lock classes from memory that is going to be
3913 * freed; and possibly re-used by other modules.
3914 *
3915 * We will have had one sync_sched() before getting here, so we're guaranteed
3916 * nobody will look up these exact classes -- they're properly dead but still
3917 * allocated.
3918 */
3890void lockdep_free_key_range(void *start, unsigned long size) 3919void lockdep_free_key_range(void *start, unsigned long size)
3891{ 3920{
3892 struct lock_class *class, *next; 3921 struct lock_class *class;
3893 struct list_head *head; 3922 struct list_head *head;
3894 unsigned long flags; 3923 unsigned long flags;
3895 int i; 3924 int i;
@@ -3905,7 +3934,7 @@ void lockdep_free_key_range(void *start, unsigned long size)
3905 head = classhash_table + i; 3934 head = classhash_table + i;
3906 if (list_empty(head)) 3935 if (list_empty(head))
3907 continue; 3936 continue;
3908 list_for_each_entry_safe(class, next, head, hash_entry) { 3937 list_for_each_entry_rcu(class, head, hash_entry) {
3909 if (within(class->key, start, size)) 3938 if (within(class->key, start, size))
3910 zap_class(class); 3939 zap_class(class);
3911 else if (within(class->name, start, size)) 3940 else if (within(class->name, start, size))
@@ -3916,11 +3945,25 @@ void lockdep_free_key_range(void *start, unsigned long size)
3916 if (locked) 3945 if (locked)
3917 graph_unlock(); 3946 graph_unlock();
3918 raw_local_irq_restore(flags); 3947 raw_local_irq_restore(flags);
3948
3949 /*
3950 * Wait for any possible iterators from look_up_lock_class() to pass
3951 * before continuing to free the memory they refer to.
3952 *
3953 * sync_sched() is sufficient because the read side runs with IRQs disabled.
3954 */
3955 synchronize_sched();
3956
3957 /*
3958 * XXX at this point we could return the resources to the pool;
3959 * instead we leak them. We would need to change to bitmap allocators
3960 * instead of the linear allocators we have now.
3961 */
3919} 3962}
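The lifetime rule the comments describe is the usual RCU-sched pattern: readers iterate the hash with IRQs disabled (an implicit RCU-sched read side), writers unlink under the graph lock and then wait out the readers before the memory may be reused. In miniature, and with names that are not lockdep's own:

/* Generic sketch of the publish/retire protocol relied on above. */
struct my_entry { struct list_head node; };

static void my_retire(struct my_entry *e, spinlock_t *update_lock)
{
	spin_lock(update_lock);
	list_del_rcu(&e->node);		/* readers may still walk to it ...   */
	spin_unlock(update_lock);

	synchronize_sched();		/* ... so wait for every IRQ-disabled */
					/* (RCU-sched) reader to drain first  */
	/* only now may e's memory be freed or reused */
}

/* Reader side: runs with IRQs off, which is an RCU-sched read section. */
static bool my_find(struct list_head *head, struct my_entry *needle)
{
	struct my_entry *e;

	list_for_each_entry_rcu(e, head, node)
		if (e == needle)
			return true;

	return false;
}
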
3920 3963
3921void lockdep_reset_lock(struct lockdep_map *lock) 3964void lockdep_reset_lock(struct lockdep_map *lock)
3922{ 3965{
3923 struct lock_class *class, *next; 3966 struct lock_class *class;
3924 struct list_head *head; 3967 struct list_head *head;
3925 unsigned long flags; 3968 unsigned long flags;
3926 int i, j; 3969 int i, j;
@@ -3948,7 +3991,7 @@ void lockdep_reset_lock(struct lockdep_map *lock)
3948 head = classhash_table + i; 3991 head = classhash_table + i;
3949 if (list_empty(head)) 3992 if (list_empty(head))
3950 continue; 3993 continue;
3951 list_for_each_entry_safe(class, next, head, hash_entry) { 3994 list_for_each_entry_rcu(class, head, hash_entry) {
3952 int match = 0; 3995 int match = 0;
3953 3996
3954 for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++) 3997 for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++)
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index d1fe2ba5bac9..75e114bdf3f2 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -78,7 +78,7 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
78 */ 78 */
79 return; 79 return;
80 } 80 }
81 ACCESS_ONCE(prev->next) = node; 81 WRITE_ONCE(prev->next, node);
82 82
83 /* Wait until the lock holder passes the lock down. */ 83 /* Wait until the lock holder passes the lock down. */
84 arch_mcs_spin_lock_contended(&node->locked); 84 arch_mcs_spin_lock_contended(&node->locked);
@@ -91,7 +91,7 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
91static inline 91static inline
92void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) 92void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
93{ 93{
94 struct mcs_spinlock *next = ACCESS_ONCE(node->next); 94 struct mcs_spinlock *next = READ_ONCE(node->next);
95 95
96 if (likely(!next)) { 96 if (likely(!next)) {
97 /* 97 /*
@@ -100,7 +100,7 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
100 if (likely(cmpxchg(lock, node, NULL) == node)) 100 if (likely(cmpxchg(lock, node, NULL) == node))
101 return; 101 return;
102 /* Wait until the next pointer is set */ 102 /* Wait until the next pointer is set */
103 while (!(next = ACCESS_ONCE(node->next))) 103 while (!(next = READ_ONCE(node->next)))
104 cpu_relax_lowlatency(); 104 cpu_relax_lowlatency();
105 } 105 }
106 106
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 94674e5919cb..4cccea6b8934 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -25,7 +25,7 @@
25#include <linux/spinlock.h> 25#include <linux/spinlock.h>
26#include <linux/interrupt.h> 26#include <linux/interrupt.h>
27#include <linux/debug_locks.h> 27#include <linux/debug_locks.h>
28#include "mcs_spinlock.h" 28#include <linux/osq_lock.h>
29 29
30/* 30/*
31 * In the DEBUG case we are using the "NULL fastpath" for mutexes, 31 * In the DEBUG case we are using the "NULL fastpath" for mutexes,
@@ -217,44 +217,35 @@ ww_mutex_set_context_slowpath(struct ww_mutex *lock,
217} 217}
218 218
219#ifdef CONFIG_MUTEX_SPIN_ON_OWNER 219#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
220static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
221{
222 if (lock->owner != owner)
223 return false;
224
225 /*
226 * Ensure we emit the owner->on_cpu, dereference _after_ checking
227 * lock->owner still matches owner, if that fails, owner might
228 * point to free()d memory, if it still matches, the rcu_read_lock()
229 * ensures the memory stays valid.
230 */
231 barrier();
232
233 return owner->on_cpu;
234}
235
236/* 220/*
237 * Look out! "owner" is an entirely speculative pointer 221 * Look out! "owner" is an entirely speculative pointer
238 * access and not reliable. 222 * access and not reliable.
239 */ 223 */
240static noinline 224static noinline
241int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) 225bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
242{ 226{
227 bool ret = true;
228
243 rcu_read_lock(); 229 rcu_read_lock();
244 while (owner_running(lock, owner)) { 230 while (lock->owner == owner) {
245 if (need_resched()) 231 /*
232 * Ensure we emit the owner->on_cpu dereference _after_
233 * checking lock->owner still matches owner. If that fails,
234 * owner might point to freed memory. If it still matches,
235 * the rcu_read_lock() ensures the memory stays valid.
236 */
237 barrier();
238
239 if (!owner->on_cpu || need_resched()) {
240 ret = false;
246 break; 241 break;
242 }
247 243
248 cpu_relax_lowlatency(); 244 cpu_relax_lowlatency();
249 } 245 }
250 rcu_read_unlock(); 246 rcu_read_unlock();
251 247
252 /* 248 return ret;
253 * We break out the loop above on need_resched() and when the
254 * owner changed, which is a sign for heavy contention. Return
255 * success only when lock->owner is NULL.
256 */
257 return lock->owner == NULL;
258} 249}
259 250
260/* 251/*
@@ -269,7 +260,7 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock)
269 return 0; 260 return 0;
270 261
271 rcu_read_lock(); 262 rcu_read_lock();
272 owner = ACCESS_ONCE(lock->owner); 263 owner = READ_ONCE(lock->owner);
273 if (owner) 264 if (owner)
274 retval = owner->on_cpu; 265 retval = owner->on_cpu;
275 rcu_read_unlock(); 266 rcu_read_unlock();
@@ -343,7 +334,7 @@ static bool mutex_optimistic_spin(struct mutex *lock,
343 * As such, when deadlock detection needs to be 334 * As such, when deadlock detection needs to be
344 * performed the optimistic spinning cannot be done. 335 * performed the optimistic spinning cannot be done.
345 */ 336 */
346 if (ACCESS_ONCE(ww->ctx)) 337 if (READ_ONCE(ww->ctx))
347 break; 338 break;
348 } 339 }
349 340
@@ -351,7 +342,7 @@ static bool mutex_optimistic_spin(struct mutex *lock,
351 * If there's an owner, wait for it to either 342 * If there's an owner, wait for it to either
352 * release the lock or go to sleep. 343 * release the lock or go to sleep.
353 */ 344 */
354 owner = ACCESS_ONCE(lock->owner); 345 owner = READ_ONCE(lock->owner);
355 if (owner && !mutex_spin_on_owner(lock, owner)) 346 if (owner && !mutex_spin_on_owner(lock, owner))
356 break; 347 break;
357 348
@@ -490,7 +481,7 @@ static inline int __sched
490__ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx) 481__ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx)
491{ 482{
492 struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); 483 struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
493 struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx); 484 struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx);
494 485
495 if (!hold_ctx) 486 if (!hold_ctx)
496 return 0; 487 return 0;
diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
index c112d00341b0..dc85ee23a26f 100644
--- a/kernel/locking/osq_lock.c
+++ b/kernel/locking/osq_lock.c
@@ -98,7 +98,7 @@ bool osq_lock(struct optimistic_spin_queue *lock)
98 98
99 prev = decode_cpu(old); 99 prev = decode_cpu(old);
100 node->prev = prev; 100 node->prev = prev;
101 ACCESS_ONCE(prev->next) = node; 101 WRITE_ONCE(prev->next, node);
102 102
103 /* 103 /*
104 * Normally @prev is untouchable after the above store; because at that 104 * Normally @prev is untouchable after the above store; because at that
@@ -109,7 +109,7 @@ bool osq_lock(struct optimistic_spin_queue *lock)
109 * cmpxchg in an attempt to undo our queueing. 109 * cmpxchg in an attempt to undo our queueing.
110 */ 110 */
111 111
112 while (!ACCESS_ONCE(node->locked)) { 112 while (!READ_ONCE(node->locked)) {
113 /* 113 /*
114 * If we need to reschedule bail... so we can block. 114 * If we need to reschedule bail... so we can block.
115 */ 115 */
@@ -148,7 +148,7 @@ unqueue:
148 * Or we race against a concurrent unqueue()'s step-B, in which 148 * Or we race against a concurrent unqueue()'s step-B, in which
149 * case its step-C will write us a new @node->prev pointer. 149 * case its step-C will write us a new @node->prev pointer.
150 */ 150 */
151 prev = ACCESS_ONCE(node->prev); 151 prev = READ_ONCE(node->prev);
152 } 152 }
153 153
154 /* 154 /*
@@ -170,8 +170,8 @@ unqueue:
170 * it will wait in Step-A. 170 * it will wait in Step-A.
171 */ 171 */
172 172
173 ACCESS_ONCE(next->prev) = prev; 173 WRITE_ONCE(next->prev, prev);
174 ACCESS_ONCE(prev->next) = next; 174 WRITE_ONCE(prev->next, next);
175 175
176 return false; 176 return false;
177} 177}
@@ -193,11 +193,11 @@ void osq_unlock(struct optimistic_spin_queue *lock)
193 node = this_cpu_ptr(&osq_node); 193 node = this_cpu_ptr(&osq_node);
194 next = xchg(&node->next, NULL); 194 next = xchg(&node->next, NULL);
195 if (next) { 195 if (next) {
196 ACCESS_ONCE(next->locked) = 1; 196 WRITE_ONCE(next->locked, 1);
197 return; 197 return;
198 } 198 }
199 199
200 next = osq_wait_next(lock, node, NULL); 200 next = osq_wait_next(lock, node, NULL);
201 if (next) 201 if (next)
202 ACCESS_ONCE(next->locked) = 1; 202 WRITE_ONCE(next->locked, 1);
203} 203}
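The ACCESS_ONCE() to READ_ONCE()/WRITE_ONCE() conversions in this file, and in the mutex and rwsem changes nearby, are mechanical but worth spelling out: the newer helpers make the direction of the access explicit and, unlike ACCESS_ONCE(), also cope with non-scalar types. In isolation the pattern is:

/* Before: one macro for loads and stores, scalar types only. */
ACCESS_ONCE(node->next) = next;
next = ACCESS_ONCE(node->next);

/* After: the direction is explicit and aggregates are handled too. */
WRITE_ONCE(node->next, next);
next = READ_ONCE(node->next);
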
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 6357265a31ad..b73279367087 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -349,7 +349,7 @@ static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
349 * 349 *
350 * @task: the task owning the mutex (owner) for which a chain walk is 350 * @task: the task owning the mutex (owner) for which a chain walk is
351 * probably needed 351 * probably needed
352 * @deadlock_detect: do we have to carry out deadlock detection? 352 * @chwalk: do we have to carry out deadlock detection?
353 * @orig_lock: the mutex (can be NULL if we are walking the chain to recheck 353 * @orig_lock: the mutex (can be NULL if we are walking the chain to recheck
354 * things for a task that has just got its priority adjusted, and 354 * things for a task that has just got its priority adjusted, and
355 * is waiting on a mutex) 355 * is waiting on a mutex)
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
index 2555ae15ec14..3a5048572065 100644
--- a/kernel/locking/rwsem-spinlock.c
+++ b/kernel/locking/rwsem-spinlock.c
@@ -85,6 +85,13 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite)
85 85
86 list_del(&waiter->list); 86 list_del(&waiter->list);
87 tsk = waiter->task; 87 tsk = waiter->task;
88 /*
89 * Make sure we do not wakeup the next reader before
90 * setting the nil condition to grant the next reader;
91 * otherwise we could miss the wakeup on the other
92 * side and end up sleeping again. See the pairing
93 * in rwsem_down_read_failed().
94 */
88 smp_mb(); 95 smp_mb();
89 waiter->task = NULL; 96 waiter->task = NULL;
90 wake_up_process(tsk); 97 wake_up_process(tsk);
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 2f7cc4076f50..3417d0172a5d 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -14,8 +14,9 @@
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/export.h> 15#include <linux/export.h>
16#include <linux/sched/rt.h> 16#include <linux/sched/rt.h>
17#include <linux/osq_lock.h>
17 18
18#include "mcs_spinlock.h" 19#include "rwsem.h"
19 20
20/* 21/*
21 * Guide to the rw_semaphore's count field for common values. 22 * Guide to the rw_semaphore's count field for common values.
@@ -186,6 +187,13 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
186 waiter = list_entry(next, struct rwsem_waiter, list); 187 waiter = list_entry(next, struct rwsem_waiter, list);
187 next = waiter->list.next; 188 next = waiter->list.next;
188 tsk = waiter->task; 189 tsk = waiter->task;
190 /*
191 * Make sure we do not wakeup the next reader before
192 * setting the nil condition to grant the next reader;
193 * otherwise we could miss the wakeup on the other
194 * side and end up sleeping again. See the pairing
195 * in rwsem_down_read_failed().
196 */
189 smp_mb(); 197 smp_mb();
190 waiter->task = NULL; 198 waiter->task = NULL;
191 wake_up_process(tsk); 199 wake_up_process(tsk);
@@ -258,6 +266,7 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
258 RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) { 266 RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) {
259 if (!list_is_singular(&sem->wait_list)) 267 if (!list_is_singular(&sem->wait_list))
260 rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); 268 rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
269 rwsem_set_owner(sem);
261 return true; 270 return true;
262 } 271 }
263 272
@@ -270,15 +279,17 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
270 */ 279 */
271static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) 280static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
272{ 281{
273 long old, count = ACCESS_ONCE(sem->count); 282 long old, count = READ_ONCE(sem->count);
274 283
275 while (true) { 284 while (true) {
276 if (!(count == 0 || count == RWSEM_WAITING_BIAS)) 285 if (!(count == 0 || count == RWSEM_WAITING_BIAS))
277 return false; 286 return false;
278 287
279 old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS); 288 old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS);
280 if (old == count) 289 if (old == count) {
290 rwsem_set_owner(sem);
281 return true; 291 return true;
292 }
282 293
283 count = old; 294 count = old;
284 } 295 }
@@ -287,60 +298,67 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
287static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) 298static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
288{ 299{
289 struct task_struct *owner; 300 struct task_struct *owner;
290 bool on_cpu = false; 301 bool ret = true;
291 302
292 if (need_resched()) 303 if (need_resched())
293 return false; 304 return false;
294 305
295 rcu_read_lock(); 306 rcu_read_lock();
296 owner = ACCESS_ONCE(sem->owner); 307 owner = READ_ONCE(sem->owner);
297 if (owner) 308 if (!owner) {
298 on_cpu = owner->on_cpu; 309 long count = READ_ONCE(sem->count);
299 rcu_read_unlock(); 310 /*
300 311 * If sem->owner is not set, yet we have just recently entered the
301 /* 312 * slowpath with the lock being active, then there is a possibility
302 * If sem->owner is not set, yet we have just recently entered the 313 * reader(s) may have the lock. To be safe, bail spinning in these
303 * slowpath, then there is a possibility reader(s) may have the lock. 314 * situations.
304 * To be safe, avoid spinning in these situations. 315 */
305 */ 316 if (count & RWSEM_ACTIVE_MASK)
306 return on_cpu; 317 ret = false;
307} 318 goto done;
308 319 }
309static inline bool owner_running(struct rw_semaphore *sem,
310 struct task_struct *owner)
311{
312 if (sem->owner != owner)
313 return false;
314
315 /*
316 * Ensure we emit the owner->on_cpu, dereference _after_ checking
317 * sem->owner still matches owner, if that fails, owner might
318 * point to free()d memory, if it still matches, the rcu_read_lock()
319 * ensures the memory stays valid.
320 */
321 barrier();
322 320
323 return owner->on_cpu; 321 ret = owner->on_cpu;
322done:
323 rcu_read_unlock();
324 return ret;
324} 325}
325 326
326static noinline 327static noinline
327bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner) 328bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner)
328{ 329{
330 long count;
331
329 rcu_read_lock(); 332 rcu_read_lock();
330 while (owner_running(sem, owner)) { 333 while (sem->owner == owner) {
331 if (need_resched()) 334 /*
332 break; 335 * Ensure we emit the owner->on_cpu, dereference _after_
336 * checking sem->owner still matches owner, if that fails,
337 * owner might point to free()d memory, if it still matches,
338 * the rcu_read_lock() ensures the memory stays valid.
339 */
340 barrier();
341
342 /* abort spinning when need_resched or owner is not running */
343 if (!owner->on_cpu || need_resched()) {
344 rcu_read_unlock();
345 return false;
346 }
333 347
334 cpu_relax_lowlatency(); 348 cpu_relax_lowlatency();
335 } 349 }
336 rcu_read_unlock(); 350 rcu_read_unlock();
337 351
352 if (READ_ONCE(sem->owner))
353 return true; /* new owner, continue spinning */
354
338 /* 355 /*
339 * We break out the loop above on need_resched() or when the 356 * When the owner is not set, the lock could be free or
340 * owner changed, which is a sign for heavy contention. Return 357 * held by readers. Check the counter to verify the
341 * success only when sem->owner is NULL. 358 * state.
342 */ 359 */
343 return sem->owner == NULL; 360 count = READ_ONCE(sem->count);
361 return (count == 0 || count == RWSEM_WAITING_BIAS);
344} 362}
345 363
346static bool rwsem_optimistic_spin(struct rw_semaphore *sem) 364static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
@@ -358,7 +376,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
358 goto done; 376 goto done;
359 377
360 while (true) { 378 while (true) {
361 owner = ACCESS_ONCE(sem->owner); 379 owner = READ_ONCE(sem->owner);
362 if (owner && !rwsem_spin_on_owner(sem, owner)) 380 if (owner && !rwsem_spin_on_owner(sem, owner))
363 break; 381 break;
364 382
@@ -432,7 +450,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
432 450
433 /* we're now waiting on the lock, but no longer actively locking */ 451 /* we're now waiting on the lock, but no longer actively locking */
434 if (waiting) { 452 if (waiting) {
435 count = ACCESS_ONCE(sem->count); 453 count = READ_ONCE(sem->count);
436 454
437 /* 455 /*
438 * If there were already threads queued before us and there are 456 * If there were already threads queued before us and there are
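
rwsem_try_write_lock_unqueued() above is a compare-and-swap retry loop: the lock
is stolen only while the count says it is free (0) or held by nobody but sleeping
waiters (RWSEM_WAITING_BIAS), and ownership is recorded on success so that later
spinners have something to poll. A compilable userspace sketch of the same
pattern; the bias constants and the fake_rwsem type are made up for illustration:

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

#define WAITING_BIAS            (-0x10000L)     /* stand-ins for RWSEM_* biases */
#define ACTIVE_WRITE_BIAS       (0x10001L)

struct fake_rwsem {
        _Atomic long count;
        _Atomic(void *) owner;
};

static bool try_write_lock_unqueued(struct fake_rwsem *sem, void *me)
{
        long count = atomic_load_explicit(&sem->count, memory_order_relaxed);

        while (true) {
                /* Only the free or waiters-but-inactive states can be stolen. */
                if (!(count == 0 || count == WAITING_BIAS))
                        return false;

                /* On failure the CAS reloads 'count' with the current value. */
                if (atomic_compare_exchange_weak(&sem->count, &count,
                                                 count + ACTIVE_WRITE_BIAS)) {
                        atomic_store(&sem->owner, me);  /* rwsem_set_owner() analogue */
                        return true;
                }
        }
}

int main(void)
{
        struct fake_rwsem sem = { .count = 0, .owner = NULL };
        int me;

        return try_write_lock_unqueued(&sem, &me) ? 0 : 1;
}
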
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index e2d3bc7f03b4..205be0ce34de 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -9,29 +9,9 @@
9#include <linux/sched.h> 9#include <linux/sched.h>
10#include <linux/export.h> 10#include <linux/export.h>
11#include <linux/rwsem.h> 11#include <linux/rwsem.h>
12
13#include <linux/atomic.h> 12#include <linux/atomic.h>
14 13
15#ifdef CONFIG_RWSEM_SPIN_ON_OWNER 14#include "rwsem.h"
16static inline void rwsem_set_owner(struct rw_semaphore *sem)
17{
18 sem->owner = current;
19}
20
21static inline void rwsem_clear_owner(struct rw_semaphore *sem)
22{
23 sem->owner = NULL;
24}
25
26#else
27static inline void rwsem_set_owner(struct rw_semaphore *sem)
28{
29}
30
31static inline void rwsem_clear_owner(struct rw_semaphore *sem)
32{
33}
34#endif
35 15
36/* 16/*
37 * lock for reading 17 * lock for reading
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
new file mode 100644
index 000000000000..870ed9a5b426
--- /dev/null
+++ b/kernel/locking/rwsem.h
@@ -0,0 +1,20 @@
1#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
2static inline void rwsem_set_owner(struct rw_semaphore *sem)
3{
4 sem->owner = current;
5}
6
7static inline void rwsem_clear_owner(struct rw_semaphore *sem)
8{
9 sem->owner = NULL;
10}
11
12#else
13static inline void rwsem_set_owner(struct rw_semaphore *sem)
14{
15}
16
17static inline void rwsem_clear_owner(struct rw_semaphore *sem)
18{
19}
20#endif
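
With the helpers shared through rwsem.h, a writer records itself in sem->owner
right after taking the lock and clears it just before releasing, which is what
gives rwsem_spin_on_owner() a stable target to watch. A schematic of that
ordering with placeholder acquire/release helpers rather than the kernel's fast
paths:

#include <stddef.h>

struct sketch_rwsem {
        long count;
        void *owner;
};

static void sketch_acquire_count(struct sketch_rwsem *s) { s->count += 1; }
static void sketch_release_count(struct sketch_rwsem *s) { s->count -= 1; }

static void sketch_down_write(struct sketch_rwsem *s, void *me)
{
        sketch_acquire_count(s);        /* take the lock first...         */
        s->owner = me;                  /* ...then advertise ownership    */
}

static void sketch_up_write(struct sketch_rwsem *s)
{
        s->owner = NULL;                /* stop spinners before release   */
        sketch_release_count(s);        /* now another writer may acquire */
}

int main(void)
{
        struct sketch_rwsem s = { 0, NULL };
        int me;

        sketch_down_write(&s, &me);
        sketch_up_write(&s);
        return (int)s.count;            /* back to 0 once released */
}
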
diff --git a/kernel/module.c b/kernel/module.c
index b3d634ed06c9..650b038ae520 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1865,7 +1865,7 @@ static void free_module(struct module *mod)
1865 kfree(mod->args); 1865 kfree(mod->args);
1866 percpu_modfree(mod); 1866 percpu_modfree(mod);
1867 1867
1868 /* Free lock-classes: */ 1868 /* Free lock-classes; relies on the preceding sync_rcu(). */
1869 lockdep_free_key_range(mod->module_core, mod->core_size); 1869 lockdep_free_key_range(mod->module_core, mod->core_size);
1870 1870
1871 /* Finally, free the core (containing the module structure) */ 1871 /* Finally, free the core (containing the module structure) */
@@ -2479,6 +2479,23 @@ static int elf_header_check(struct load_info *info)
2479 return 0; 2479 return 0;
2480} 2480}
2481 2481
2482#define COPY_CHUNK_SIZE (16*PAGE_SIZE)
2483
2484static int copy_chunked_from_user(void *dst, const void __user *usrc, unsigned long len)
2485{
2486 do {
2487 unsigned long n = min(len, COPY_CHUNK_SIZE);
2488
2489 if (copy_from_user(dst, usrc, n) != 0)
2490 return -EFAULT;
2491 cond_resched();
2492 dst += n;
2493 usrc += n;
2494 len -= n;
2495 } while (len);
2496 return 0;
2497}
2498
2482/* Sets info->hdr and info->len. */ 2499/* Sets info->hdr and info->len. */
2483static int copy_module_from_user(const void __user *umod, unsigned long len, 2500static int copy_module_from_user(const void __user *umod, unsigned long len,
2484 struct load_info *info) 2501 struct load_info *info)
@@ -2498,7 +2515,7 @@ static int copy_module_from_user(const void __user *umod, unsigned long len,
2498 if (!info->hdr) 2515 if (!info->hdr)
2499 return -ENOMEM; 2516 return -ENOMEM;
2500 2517
2501 if (copy_from_user(info->hdr, umod, info->len) != 0) { 2518 if (copy_chunked_from_user(info->hdr, umod, info->len) != 0) {
2502 vfree(info->hdr); 2519 vfree(info->hdr);
2503 return -EFAULT; 2520 return -EFAULT;
2504 } 2521 }
@@ -2753,6 +2770,9 @@ static int find_module_sections(struct module *mod, struct load_info *info)
2753 mod->trace_events = section_objs(info, "_ftrace_events", 2770 mod->trace_events = section_objs(info, "_ftrace_events",
2754 sizeof(*mod->trace_events), 2771 sizeof(*mod->trace_events),
2755 &mod->num_trace_events); 2772 &mod->num_trace_events);
2773 mod->trace_enums = section_objs(info, "_ftrace_enum_map",
2774 sizeof(*mod->trace_enums),
2775 &mod->num_trace_enums);
2756#endif 2776#endif
2757#ifdef CONFIG_TRACING 2777#ifdef CONFIG_TRACING
2758 mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt", 2778 mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt",
@@ -3349,9 +3369,6 @@ static int load_module(struct load_info *info, const char __user *uargs,
3349 module_bug_cleanup(mod); 3369 module_bug_cleanup(mod);
3350 mutex_unlock(&module_mutex); 3370 mutex_unlock(&module_mutex);
3351 3371
3352 /* Free lock-classes: */
3353 lockdep_free_key_range(mod->module_core, mod->core_size);
3354
3355 /* we can't deallocate the module until we clear memory protection */ 3372 /* we can't deallocate the module until we clear memory protection */
3356 unset_module_init_ro_nx(mod); 3373 unset_module_init_ro_nx(mod);
3357 unset_module_core_ro_nx(mod); 3374 unset_module_core_ro_nx(mod);
@@ -3375,6 +3392,9 @@ static int load_module(struct load_info *info, const char __user *uargs,
3375 synchronize_rcu(); 3392 synchronize_rcu();
3376 mutex_unlock(&module_mutex); 3393 mutex_unlock(&module_mutex);
3377 free_module: 3394 free_module:
3395 /* Free lock-classes; relies on the preceding sync_rcu() */
3396 lockdep_free_key_range(mod->module_core, mod->core_size);
3397
3378 module_deallocate(mod, info); 3398 module_deallocate(mod, info);
3379 free_copy: 3399 free_copy:
3380 free_copy(info); 3400 free_copy(info);
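
copy_chunked_from_user() bounds the time spent in any single uninterrupted copy:
at most COPY_CHUNK_SIZE bytes are moved before cond_resched() offers the CPU back
to the scheduler, which matters when userspace passes in a very large module
image. A userspace analogue of the same chunk-and-yield pattern, with memcpy()
and sched_yield() standing in for copy_from_user() and cond_resched():

#include <sched.h>
#include <stddef.h>
#include <string.h>

#define CHUNK_SIZE (16 * 4096UL)

static int copy_chunked(void *dst, const void *src, size_t len)
{
        while (len) {
                size_t n = len < CHUNK_SIZE ? len : CHUNK_SIZE;

                memcpy(dst, src, n);    /* copy_from_user() in the kernel */
                sched_yield();          /* give the scheduler a chance    */
                dst = (char *)dst + n;
                src = (const char *)src + n;
                len -= n;
        }
        return 0;
}

int main(void)
{
        static char src[100000], dst[100000];

        return copy_chunked(dst, src, sizeof(src));
}
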
diff --git a/kernel/pid.c b/kernel/pid.c
index cd36a5e0d173..4fd07d5b7baf 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -182,7 +182,7 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
182 spin_unlock_irq(&pidmap_lock); 182 spin_unlock_irq(&pidmap_lock);
183 kfree(page); 183 kfree(page);
184 if (unlikely(!map->page)) 184 if (unlikely(!map->page))
185 break; 185 return -ENOMEM;
186 } 186 }
187 if (likely(atomic_read(&map->nr_free))) { 187 if (likely(atomic_read(&map->nr_free))) {
188 for ( ; ; ) { 188 for ( ; ; ) {
@@ -210,7 +210,7 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
210 } 210 }
211 pid = mk_pid(pid_ns, map, offset); 211 pid = mk_pid(pid_ns, map, offset);
212 } 212 }
213 return -1; 213 return -EAGAIN;
214} 214}
215 215
216int next_pidmap(struct pid_namespace *pid_ns, unsigned int last) 216int next_pidmap(struct pid_namespace *pid_ns, unsigned int last)
@@ -301,17 +301,20 @@ struct pid *alloc_pid(struct pid_namespace *ns)
301 int i, nr; 301 int i, nr;
302 struct pid_namespace *tmp; 302 struct pid_namespace *tmp;
303 struct upid *upid; 303 struct upid *upid;
304 int retval = -ENOMEM;
304 305
305 pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL); 306 pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
306 if (!pid) 307 if (!pid)
307 goto out; 308 return ERR_PTR(retval);
308 309
309 tmp = ns; 310 tmp = ns;
310 pid->level = ns->level; 311 pid->level = ns->level;
311 for (i = ns->level; i >= 0; i--) { 312 for (i = ns->level; i >= 0; i--) {
312 nr = alloc_pidmap(tmp); 313 nr = alloc_pidmap(tmp);
313 if (nr < 0) 314 if (IS_ERR_VALUE(nr)) {
315 retval = nr;
314 goto out_free; 316 goto out_free;
317 }
315 318
316 pid->numbers[i].nr = nr; 319 pid->numbers[i].nr = nr;
317 pid->numbers[i].ns = tmp; 320 pid->numbers[i].ns = tmp;
@@ -339,7 +342,6 @@ struct pid *alloc_pid(struct pid_namespace *ns)
339 } 342 }
340 spin_unlock_irq(&pidmap_lock); 343 spin_unlock_irq(&pidmap_lock);
341 344
342out:
343 return pid; 345 return pid;
344 346
345out_unlock: 347out_unlock:
@@ -351,8 +353,7 @@ out_free:
351 free_pidmap(pid->numbers + i); 353 free_pidmap(pid->numbers + i);
352 354
353 kmem_cache_free(ns->pid_cachep, pid); 355 kmem_cache_free(ns->pid_cachep, pid);
354 pid = NULL; 356 return ERR_PTR(retval);
355 goto out;
356} 357}
357 358
358void disable_pid_allocation(struct pid_namespace *ns) 359void disable_pid_allocation(struct pid_namespace *ns)
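
alloc_pid() now reports failures through the ERR_PTR() convention instead of
returning NULL, so callers can tell -ENOMEM apart from -EAGAIN (no pid could be
allocated). A small userspace sketch of that convention, with err_ptr()/is_err()
as hand-rolled stand-ins for the kernel's ERR_PTR()/IS_ERR():

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_ERRNO 4095

static inline void *err_ptr(long err)      { return (void *)err; }
static inline long ptr_err(const void *p)  { return (long)p; }
static inline int is_err(const void *p)
{
        return (uintptr_t)p >= (uintptr_t)-MAX_ERRNO;
}

static void *alloc_thing(int fail)
{
        if (fail)
                return err_ptr(-EAGAIN);   /* like alloc_pid() when pids run out */
        return malloc(16);
}

int main(void)
{
        void *p = alloc_thing(1);

        if (is_err(p))
                printf("alloc failed: %ld\n", ptr_err(p));
        else
                free(p);
        return 0;
}
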
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 9a59d042ea84..86e8157a450f 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -11,7 +11,7 @@
11#include <linux/export.h> 11#include <linux/export.h>
12#include <linux/kobject.h> 12#include <linux/kobject.h>
13#include <linux/string.h> 13#include <linux/string.h>
14#include <linux/resume-trace.h> 14#include <linux/pm-trace.h>
15#include <linux/workqueue.h> 15#include <linux/workqueue.h>
16#include <linux/debugfs.h> 16#include <linux/debugfs.h>
17#include <linux/seq_file.h> 17#include <linux/seq_file.h>
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index c24d5a23bf93..5235dd4e1e2f 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -955,25 +955,6 @@ static void mark_nosave_pages(struct memory_bitmap *bm)
955 } 955 }
956} 956}
957 957
958static bool is_nosave_page(unsigned long pfn)
959{
960 struct nosave_region *region;
961
962 list_for_each_entry(region, &nosave_regions, list) {
963 if (pfn >= region->start_pfn && pfn < region->end_pfn) {
964 pr_err("PM: %#010llx in e820 nosave region: "
965 "[mem %#010llx-%#010llx]\n",
966 (unsigned long long) pfn << PAGE_SHIFT,
967 (unsigned long long) region->start_pfn << PAGE_SHIFT,
968 ((unsigned long long) region->end_pfn << PAGE_SHIFT)
969 - 1);
970 return true;
971 }
972 }
973
974 return false;
975}
976
977/** 958/**
978 * create_basic_memory_bitmaps - create bitmaps needed for marking page 959 * create_basic_memory_bitmaps - create bitmaps needed for marking page
979 * frames that should not be saved and free page frames. The pointers 960 * frames that should not be saved and free page frames. The pointers
@@ -2042,7 +2023,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm)
2042 do { 2023 do {
2043 pfn = memory_bm_next_pfn(bm); 2024 pfn = memory_bm_next_pfn(bm);
2044 if (likely(pfn != BM_END_OF_MAP)) { 2025 if (likely(pfn != BM_END_OF_MAP)) {
2045 if (likely(pfn_valid(pfn)) && !is_nosave_page(pfn)) 2026 if (likely(pfn_valid(pfn)))
2046 swsusp_set_page_free(pfn_to_page(pfn)); 2027 swsusp_set_page_free(pfn_to_page(pfn));
2047 else 2028 else
2048 return -EFAULT; 2029 return -EFAULT;
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index b7d6b3a721b1..8d7a1ef72758 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -28,6 +28,7 @@
28#include <linux/ftrace.h> 28#include <linux/ftrace.h>
29#include <trace/events/power.h> 29#include <trace/events/power.h>
30#include <linux/compiler.h> 30#include <linux/compiler.h>
31#include <linux/moduleparam.h>
31 32
32#include "power.h" 33#include "power.h"
33 34
@@ -233,12 +234,20 @@ static bool platform_suspend_again(suspend_state_t state)
233 suspend_ops->suspend_again() : false; 234 suspend_ops->suspend_again() : false;
234} 235}
235 236
237#ifdef CONFIG_PM_DEBUG
238static unsigned int pm_test_delay = 5;
239module_param(pm_test_delay, uint, 0644);
240MODULE_PARM_DESC(pm_test_delay,
241 "Number of seconds to wait before resuming from suspend test");
242#endif
243
236static int suspend_test(int level) 244static int suspend_test(int level)
237{ 245{
238#ifdef CONFIG_PM_DEBUG 246#ifdef CONFIG_PM_DEBUG
239 if (pm_test_level == level) { 247 if (pm_test_level == level) {
240 printk(KERN_INFO "suspend debug: Waiting for 5 seconds.\n"); 248 printk(KERN_INFO "suspend debug: Waiting for %d second(s).\n",
241 mdelay(5000); 249 pm_test_delay);
250 mdelay(pm_test_delay * 1000);
242 return 1; 251 return 1;
243 } 252 }
244#endif /* !CONFIG_PM_DEBUG */ 253#endif /* !CONFIG_PM_DEBUG */
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index bb0635bd74f2..c099b082cd02 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -32,7 +32,6 @@
32#include <linux/security.h> 32#include <linux/security.h>
33#include <linux/bootmem.h> 33#include <linux/bootmem.h>
34#include <linux/memblock.h> 34#include <linux/memblock.h>
35#include <linux/aio.h>
36#include <linux/syscalls.h> 35#include <linux/syscalls.h>
37#include <linux/kexec.h> 36#include <linux/kexec.h>
38#include <linux/kdb.h> 37#include <linux/kdb.h>
@@ -46,6 +45,7 @@
46#include <linux/irq_work.h> 45#include <linux/irq_work.h>
47#include <linux/utsname.h> 46#include <linux/utsname.h>
48#include <linux/ctype.h> 47#include <linux/ctype.h>
48#include <linux/uio.h>
49 49
50#include <asm/uaccess.h> 50#include <asm/uaccess.h>
51 51
@@ -521,7 +521,7 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from)
521 int i; 521 int i;
522 int level = default_message_loglevel; 522 int level = default_message_loglevel;
523 int facility = 1; /* LOG_USER */ 523 int facility = 1; /* LOG_USER */
524 size_t len = iocb->ki_nbytes; 524 size_t len = iov_iter_count(from);
525 ssize_t ret = len; 525 ssize_t ret = len;
526 526
527 if (len > LOG_LINE_MAX) 527 if (len > LOG_LINE_MAX)
@@ -2017,24 +2017,6 @@ int add_preferred_console(char *name, int idx, char *options)
2017 return __add_preferred_console(name, idx, options, NULL); 2017 return __add_preferred_console(name, idx, options, NULL);
2018} 2018}
2019 2019
2020int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, char *options)
2021{
2022 struct console_cmdline *c;
2023 int i;
2024
2025 for (i = 0, c = console_cmdline;
2026 i < MAX_CMDLINECONSOLES && c->name[0];
2027 i++, c++)
2028 if (strcmp(c->name, name) == 0 && c->index == idx) {
2029 strlcpy(c->name, name_new, sizeof(c->name));
2030 c->options = options;
2031 c->index = idx_new;
2032 return i;
2033 }
2034 /* not found */
2035 return -1;
2036}
2037
2038bool console_suspend_enabled = true; 2020bool console_suspend_enabled = true;
2039EXPORT_SYMBOL(console_suspend_enabled); 2021EXPORT_SYMBOL(console_suspend_enabled);
2040 2022
@@ -2436,9 +2418,6 @@ void register_console(struct console *newcon)
2436 if (preferred_console < 0 || bcon || !console_drivers) 2418 if (preferred_console < 0 || bcon || !console_drivers)
2437 preferred_console = selected_console; 2419 preferred_console = selected_console;
2438 2420
2439 if (newcon->early_setup)
2440 newcon->early_setup();
2441
2442 /* 2421 /*
2443 * See if we want to use this console driver. If we 2422 * See if we want to use this console driver. If we
2444 * didn't select a console we take the first one 2423 * didn't select a console we take the first one
@@ -2464,23 +2443,27 @@ void register_console(struct console *newcon)
2464 for (i = 0, c = console_cmdline; 2443 for (i = 0, c = console_cmdline;
2465 i < MAX_CMDLINECONSOLES && c->name[0]; 2444 i < MAX_CMDLINECONSOLES && c->name[0];
2466 i++, c++) { 2445 i++, c++) {
2467 BUILD_BUG_ON(sizeof(c->name) != sizeof(newcon->name)); 2446 if (!newcon->match ||
2468 if (strcmp(c->name, newcon->name) != 0) 2447 newcon->match(newcon, c->name, c->index, c->options) != 0) {
2469 continue; 2448 /* default matching */
2470 if (newcon->index >= 0 && 2449 BUILD_BUG_ON(sizeof(c->name) != sizeof(newcon->name));
2471 newcon->index != c->index) 2450 if (strcmp(c->name, newcon->name) != 0)
2472 continue; 2451 continue;
2473 if (newcon->index < 0) 2452 if (newcon->index >= 0 &&
2474 newcon->index = c->index; 2453 newcon->index != c->index)
2454 continue;
2455 if (newcon->index < 0)
2456 newcon->index = c->index;
2475 2457
2476 if (_braille_register_console(newcon, c)) 2458 if (_braille_register_console(newcon, c))
2477 return; 2459 return;
2460
2461 if (newcon->setup &&
2462 newcon->setup(newcon, c->options) != 0)
2463 break;
2464 }
2478 2465
2479 if (newcon->setup &&
2480 newcon->setup(newcon, console_cmdline[i].options) != 0)
2481 break;
2482 newcon->flags |= CON_ENABLED; 2466 newcon->flags |= CON_ENABLED;
2483 newcon->index = c->index;
2484 if (i == selected_console) { 2467 if (i == selected_console) {
2485 newcon->flags |= CON_CONSDEV; 2468 newcon->flags |= CON_CONSDEV;
2486 preferred_console = selected_console; 2469 preferred_console = selected_console;
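
The new ->match() hook lets a console driver claim a console= command-line entry
itself instead of relying on the default name/index comparison; returning 0 tells
register_console() that the driver has accepted (and configured) the entry, so the
default matching and ->setup() call are skipped. A kernel-context schematic of a
driver supplying the hook; the driver name "mycon" and the accepted alias are
purely illustrative:

#include <linux/console.h>
#include <linux/errno.h>
#include <linux/string.h>

static void my_console_write(struct console *co, const char *s, unsigned int n)
{
        /* push 'n' bytes of 's' to the hardware; omitted in this sketch */
}

static int my_console_match(struct console *co, char *name, int idx,
                            char *options)
{
        /*
         * Accept either the native name or a legacy alias. A real driver
         * would also parse 'options' (baud rate etc.) here, because
         * register_console() does not call ->setup() when we return 0.
         */
        if (strcmp(name, "mycon") != 0 && strcmp(name, "oldcon") != 0)
                return -ENODEV;
        co->index = idx;
        return 0;       /* matched: skip the default name/index comparison */
}

static struct console my_console = {
        .name   = "mycon",
        .write  = my_console_write,
        .match  = my_console_match,
        .flags  = CON_PRINTBUFFER,
        .index  = -1,
};

Such a driver would then hand &my_console to register_console() from its init
path, as usual.
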
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 227fec36b12a..c8e0e050a36a 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -456,8 +456,6 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
456 456
457static int ptrace_detach(struct task_struct *child, unsigned int data) 457static int ptrace_detach(struct task_struct *child, unsigned int data)
458{ 458{
459 bool dead = false;
460
461 if (!valid_signal(data)) 459 if (!valid_signal(data))
462 return -EIO; 460 return -EIO;
463 461
@@ -467,18 +465,19 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
467 465
468 write_lock_irq(&tasklist_lock); 466 write_lock_irq(&tasklist_lock);
469 /* 467 /*
470 * This child can be already killed. Make sure de_thread() or 468 * We rely on ptrace_freeze_traced(). It can't be killed and
471 * our sub-thread doing do_wait() didn't do release_task() yet. 469 * untraced by another thread, it can't be a zombie.
472 */ 470 */
473 if (child->ptrace) { 471 WARN_ON(!child->ptrace || child->exit_state);
474 child->exit_code = data; 472 /*
475 dead = __ptrace_detach(current, child); 473 * tasklist_lock avoids the race with wait_task_stopped(), see
476 } 474 * the comment in ptrace_resume().
475 */
476 child->exit_code = data;
477 __ptrace_detach(current, child);
477 write_unlock_irq(&tasklist_lock); 478 write_unlock_irq(&tasklist_lock);
478 479
479 proc_ptrace_connector(child, PTRACE_DETACH); 480 proc_ptrace_connector(child, PTRACE_DETACH);
480 if (unlikely(dead))
481 release_task(child);
482 481
483 return 0; 482 return 0;
484} 483}
@@ -697,6 +696,8 @@ static int ptrace_peek_siginfo(struct task_struct *child,
697static int ptrace_resume(struct task_struct *child, long request, 696static int ptrace_resume(struct task_struct *child, long request,
698 unsigned long data) 697 unsigned long data)
699{ 698{
699 bool need_siglock;
700
700 if (!valid_signal(data)) 701 if (!valid_signal(data))
701 return -EIO; 702 return -EIO;
702 703
@@ -724,8 +725,26 @@ static int ptrace_resume(struct task_struct *child, long request,
724 user_disable_single_step(child); 725 user_disable_single_step(child);
725 } 726 }
726 727
728 /*
729 * Change ->exit_code and ->state under siglock to avoid the race
730 * with wait_task_stopped() in between; a non-zero ->exit_code will
731 * wrongly look like another report from tracee.
732 *
733 * Note that we need siglock even if ->exit_code == data and/or this
734 * status was not reported yet, the new status must not be cleared by
735 * wait_task_stopped() after resume.
736 *
737 * If data == 0 we do not care if wait_task_stopped() reports the old
738 * status and clears the code too; this can't race with the tracee, it
739 * takes siglock after resume.
740 */
741 need_siglock = data && !thread_group_empty(current);
742 if (need_siglock)
743 spin_lock_irq(&child->sighand->siglock);
727 child->exit_code = data; 744 child->exit_code = data;
728 wake_up_state(child, __TASK_TRACED); 745 wake_up_state(child, __TASK_TRACED);
746 if (need_siglock)
747 spin_unlock_irq(&child->sighand->siglock);
729 748
730 return 0; 749 return 0;
731} 750}
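
The siglock dance in ptrace_resume() is deliberately conditional: the lock is only
needed when a non-zero exit code could race with another thread of the tracer
running wait_task_stopped(), so the single-threaded and data == 0 cases skip it.
The shape of that conditional-locking pattern in a userspace sketch (the lock and
field names are stand-ins, not the kernel's):

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t siglock = PTHREAD_MUTEX_INITIALIZER;
static int exit_code;

static void resume_sketch(int data, bool have_other_threads)
{
        /* Lock only if a concurrent waiter could misread a non-zero code. */
        bool need_lock = data && have_other_threads;

        if (need_lock)
                pthread_mutex_lock(&siglock);
        exit_code = data;               /* the update the waiter must not miss */
        if (need_lock)
                pthread_mutex_unlock(&siglock);
}

int main(void)
{
        resume_sketch(9, true);         /* e.g. resuming with a signal queued */
        return exit_code == 9 ? 0 : 1;
}
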
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 30d42aa55d83..8dbe27611ec3 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -853,6 +853,8 @@ rcu_torture_fqs(void *arg)
853static int 853static int
854rcu_torture_writer(void *arg) 854rcu_torture_writer(void *arg)
855{ 855{
856 bool can_expedite = !rcu_gp_is_expedited();
857 int expediting = 0;
856 unsigned long gp_snap; 858 unsigned long gp_snap;
857 bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal; 859 bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal;
858 bool gp_sync1 = gp_sync; 860 bool gp_sync1 = gp_sync;
@@ -865,9 +867,15 @@ rcu_torture_writer(void *arg)
865 int nsynctypes = 0; 867 int nsynctypes = 0;
866 868
867 VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); 869 VERBOSE_TOROUT_STRING("rcu_torture_writer task started");
870 pr_alert("%s" TORTURE_FLAG
871 " Grace periods expedited from boot/sysfs for %s,\n",
872 torture_type, cur_ops->name);
873 pr_alert("%s" TORTURE_FLAG
 874			 " Testing of dynamic grace-period expediting disabled.\n",
875 torture_type);
868 876
869 /* Initialize synctype[] array. If none set, take default. */ 877 /* Initialize synctype[] array. If none set, take default. */
870 if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync) 878 if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync1)
871 gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true; 879 gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true;
872 if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync) 880 if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync)
873 synctype[nsynctypes++] = RTWS_COND_GET; 881 synctype[nsynctypes++] = RTWS_COND_GET;
@@ -949,9 +957,26 @@ rcu_torture_writer(void *arg)
949 } 957 }
950 } 958 }
951 rcutorture_record_progress(++rcu_torture_current_version); 959 rcutorture_record_progress(++rcu_torture_current_version);
960 /* Cycle through nesting levels of rcu_expedite_gp() calls. */
961 if (can_expedite &&
962 !(torture_random(&rand) & 0xff & (!!expediting - 1))) {
963 WARN_ON_ONCE(expediting == 0 && rcu_gp_is_expedited());
964 if (expediting >= 0)
965 rcu_expedite_gp();
966 else
967 rcu_unexpedite_gp();
968 if (++expediting > 3)
969 expediting = -expediting;
970 }
952 rcu_torture_writer_state = RTWS_STUTTER; 971 rcu_torture_writer_state = RTWS_STUTTER;
953 stutter_wait("rcu_torture_writer"); 972 stutter_wait("rcu_torture_writer");
954 } while (!torture_must_stop()); 973 } while (!torture_must_stop());
974 /* Reset expediting back to unexpedited. */
975 if (expediting > 0)
976 expediting = -expediting;
977 while (can_expedite && expediting++ < 0)
978 rcu_unexpedite_gp();
979 WARN_ON_ONCE(can_expedite && rcu_gp_is_expedited());
955 rcu_torture_writer_state = RTWS_STOPPING; 980 rcu_torture_writer_state = RTWS_STOPPING;
956 torture_kthread_stopping("rcu_torture_writer"); 981 torture_kthread_stopping("rcu_torture_writer");
957 return 0; 982 return 0;
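
The writer kthread now drives rcu_expedite_gp()/rcu_unexpedite_gp() as a nesting
counter: it winds the nesting up a few levels and then unwinds the same number of
calls, so the system is left unexpedited when the test stops. A userspace sketch
of that wind-up/unwind cycle, with a plain atomic counter standing in for the
counters kept elsewhere in the RCU code:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static _Atomic int expedited_nesting;

static void expedite_gp(void)   { atomic_fetch_add(&expedited_nesting, 1); }
static void unexpedite_gp(void) { atomic_fetch_sub(&expedited_nesting, 1); }
static bool gp_is_expedited(void)
{
        return atomic_load(&expedited_nesting) != 0;
}

int main(void)
{
        int expediting = 0;

        /* Deterministic version of the writer's randomized cycle. */
        for (int step = 0; step < 8; step++) {
                if (expediting >= 0)
                        expedite_gp();
                else
                        unexpedite_gp();
                if (++expediting > 3)
                        expediting = -expediting;
                printf("step %d: expedited=%d\n", step, gp_is_expedited());
        }

        /* Shutdown path: unwind whatever is still outstanding. */
        if (expediting > 0)
                expediting = -expediting;
        while (expediting++ < 0)
                unexpedite_gp();
        printf("final: expedited=%d\n", gp_is_expedited());
        return 0;
}
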
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index 445bf8ffe3fb..cad76e76b4e7 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -402,23 +402,6 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
402} 402}
403EXPORT_SYMBOL_GPL(call_srcu); 403EXPORT_SYMBOL_GPL(call_srcu);
404 404
405struct rcu_synchronize {
406 struct rcu_head head;
407 struct completion completion;
408};
409
410/*
411 * Awaken the corresponding synchronize_srcu() instance now that a
412 * grace period has elapsed.
413 */
414static void wakeme_after_rcu(struct rcu_head *head)
415{
416 struct rcu_synchronize *rcu;
417
418 rcu = container_of(head, struct rcu_synchronize, head);
419 complete(&rcu->completion);
420}
421
422static void srcu_advance_batches(struct srcu_struct *sp, int trycount); 405static void srcu_advance_batches(struct srcu_struct *sp, int trycount);
423static void srcu_reschedule(struct srcu_struct *sp); 406static void srcu_reschedule(struct srcu_struct *sp);
424 407
@@ -507,7 +490,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
507 */ 490 */
508void synchronize_srcu(struct srcu_struct *sp) 491void synchronize_srcu(struct srcu_struct *sp)
509{ 492{
510 __synchronize_srcu(sp, rcu_expedited 493 __synchronize_srcu(sp, rcu_gp_is_expedited()
511 ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT 494 ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT
512 : SYNCHRONIZE_SRCU_TRYCOUNT); 495 : SYNCHRONIZE_SRCU_TRYCOUNT);
513} 496}
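
The rcu_synchronize/wakeme_after_rcu pair removed here is the generic "post a
callback that completes a completion, then block until it runs" pattern, dropped
from srcu.c in favour of a shared definition elsewhere in the RCU code. A
userspace sketch of the pattern, with pthreads providing the completion and a
helper thread standing in for the grace-period machinery:

#include <pthread.h>
#include <stdio.h>

struct completion {
        pthread_mutex_t lock;
        pthread_cond_t cond;
        int done;
};

struct rcu_synchronize_sketch {
        struct completion completion;
};

static void complete(struct completion *c)
{
        pthread_mutex_lock(&c->lock);
        c->done = 1;
        pthread_cond_signal(&c->cond);
        pthread_mutex_unlock(&c->lock);
}

static void wait_for_completion(struct completion *c)
{
        pthread_mutex_lock(&c->lock);
        while (!c->done)
                pthread_cond_wait(&c->cond, &c->lock);
        pthread_mutex_unlock(&c->lock);
}

/* Stands in for wakeme_after_rcu(): runs once the grace period has elapsed. */
static void *grace_period_thread(void *arg)
{
        complete(&((struct rcu_synchronize_sketch *)arg)->completion);
        return NULL;
}

int main(void)
{
        struct rcu_synchronize_sketch rcu = {
                .completion = { PTHREAD_MUTEX_INITIALIZER,
                                PTHREAD_COND_INITIALIZER, 0 },
        };
        pthread_t gp;

        pthread_create(&gp, NULL, grace_period_thread, &rcu);
        wait_for_completion(&rcu.completion);   /* __synchronize_srcu() analogue */
        pthread_join(gp, NULL);
        printf("grace period complete\n");
        return 0;
}
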
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index cc9ceca7bde1..069742d61c68 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -103,8 +103,7 @@ EXPORT_SYMBOL(__rcu_is_watching);
103static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) 103static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
104{ 104{
105 RCU_TRACE(reset_cpu_stall_ticks(rcp)); 105 RCU_TRACE(reset_cpu_stall_ticks(rcp));
106 if (rcp->rcucblist != NULL && 106 if (rcp->donetail != rcp->curtail) {
107 rcp->donetail != rcp->curtail) {
108 rcp->donetail = rcp->curtail; 107 rcp->donetail = rcp->curtail;
109 return 1; 108 return 1;
110 } 109 }
@@ -169,17 +168,6 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
169 unsigned long flags; 168 unsigned long flags;
170 RCU_TRACE(int cb_count = 0); 169 RCU_TRACE(int cb_count = 0);
171 170
172 /* If no RCU callbacks ready to invoke, just return. */
173 if (&rcp->rcucblist == rcp->donetail) {
174 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, 0, -1));
175 RCU_TRACE(trace_rcu_batch_end(rcp->name, 0,
176 !!ACCESS_ONCE(rcp->rcucblist),
177 need_resched(),
178 is_idle_task(current),
179 false));
180 return;
181 }
182
183 /* Move the ready-to-invoke callbacks to a local list. */ 171 /* Move the ready-to-invoke callbacks to a local list. */
184 local_irq_save(flags); 172 local_irq_save(flags);
185 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1)); 173 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1));
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 48d640ca1a05..233165da782f 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -91,8 +91,10 @@ static const char *tp_##sname##_varname __used __tracepoint_string = sname##_var
91 91
92#define RCU_STATE_INITIALIZER(sname, sabbr, cr) \ 92#define RCU_STATE_INITIALIZER(sname, sabbr, cr) \
93DEFINE_RCU_TPS(sname) \ 93DEFINE_RCU_TPS(sname) \
94DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, sname##_data); \
94struct rcu_state sname##_state = { \ 95struct rcu_state sname##_state = { \
95 .level = { &sname##_state.node[0] }, \ 96 .level = { &sname##_state.node[0] }, \
97 .rda = &sname##_data, \
96 .call = cr, \ 98 .call = cr, \
97 .fqs_state = RCU_GP_IDLE, \ 99 .fqs_state = RCU_GP_IDLE, \
98 .gpnum = 0UL - 300UL, \ 100 .gpnum = 0UL - 300UL, \
@@ -101,11 +103,9 @@ struct rcu_state sname##_state = { \
101 .orphan_nxttail = &sname##_state.orphan_nxtlist, \ 103 .orphan_nxttail = &sname##_state.orphan_nxtlist, \
102 .orphan_donetail = &sname##_state.orphan_donelist, \ 104 .orphan_donetail = &sname##_state.orphan_donelist, \
103 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ 105 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
104 .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \
105 .name = RCU_STATE_NAME(sname), \ 106 .name = RCU_STATE_NAME(sname), \
106 .abbr = sabbr, \ 107 .abbr = sabbr, \
107}; \ 108}
108DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, sname##_data)
109 109
110RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); 110RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
111RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); 111RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh);
@@ -152,6 +152,8 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active);
152 */ 152 */
153static int rcu_scheduler_fully_active __read_mostly; 153static int rcu_scheduler_fully_active __read_mostly;
154 154
155static void rcu_init_new_rnp(struct rcu_node *rnp_leaf);
156static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf);
155static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); 157static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
156static void invoke_rcu_core(void); 158static void invoke_rcu_core(void);
157static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); 159static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
@@ -160,6 +162,12 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
160static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO; 162static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO;
161module_param(kthread_prio, int, 0644); 163module_param(kthread_prio, int, 0644);
162 164
165/* Delay in jiffies for grace-period initialization delays. */
166static int gp_init_delay = IS_ENABLED(CONFIG_RCU_TORTURE_TEST_SLOW_INIT)
167 ? CONFIG_RCU_TORTURE_TEST_SLOW_INIT_DELAY
168 : 0;
169module_param(gp_init_delay, int, 0644);
170
163/* 171/*
164 * Track the rcutorture test sequence number and the update version 172 * Track the rcutorture test sequence number and the update version
165 * number within a given test. The rcutorture_testseq is incremented 173 * number within a given test. The rcutorture_testseq is incremented
@@ -173,6 +181,17 @@ unsigned long rcutorture_testseq;
173unsigned long rcutorture_vernum; 181unsigned long rcutorture_vernum;
174 182
175/* 183/*
184 * Compute the mask of online CPUs for the specified rcu_node structure.
185 * This will not be stable unless the rcu_node structure's ->lock is
186 * held, but the bit corresponding to the current CPU will be stable
187 * in most contexts.
188 */
189unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp)
190{
191 return ACCESS_ONCE(rnp->qsmaskinitnext);
192}
193
194/*
176 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s 195 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
177 * permit this function to be invoked without holding the root rcu_node 196 * permit this function to be invoked without holding the root rcu_node
178 * structure's ->lock, but of course results can be subject to change. 197 * structure's ->lock, but of course results can be subject to change.
@@ -292,10 +311,10 @@ void rcu_note_context_switch(void)
292EXPORT_SYMBOL_GPL(rcu_note_context_switch); 311EXPORT_SYMBOL_GPL(rcu_note_context_switch);
293 312
294/* 313/*
295 * Register a quiesecent state for all RCU flavors. If there is an 314 * Register a quiescent state for all RCU flavors. If there is an
296 * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight 315 * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight
297 * dyntick-idle quiescent state visible to other CPUs (but only for those 316 * dyntick-idle quiescent state visible to other CPUs (but only for those
298 * RCU flavors in desparate need of a quiescent state, which will normally 317 * RCU flavors in desperate need of a quiescent state, which will normally
299 * be none of them). Either way, do a lightweight quiescent state for 318 * be none of them). Either way, do a lightweight quiescent state for
300 * all RCU flavors. 319 * all RCU flavors.
301 */ 320 */
@@ -410,6 +429,15 @@ void rcu_bh_force_quiescent_state(void)
410EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); 429EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
411 430
412/* 431/*
432 * Force a quiescent state for RCU-sched.
433 */
434void rcu_sched_force_quiescent_state(void)
435{
436 force_quiescent_state(&rcu_sched_state);
437}
438EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state);
439
440/*
413 * Show the state of the grace-period kthreads. 441 * Show the state of the grace-period kthreads.
414 */ 442 */
415void show_rcu_gp_kthreads(void) 443void show_rcu_gp_kthreads(void)
@@ -483,15 +511,6 @@ void rcutorture_record_progress(unsigned long vernum)
483EXPORT_SYMBOL_GPL(rcutorture_record_progress); 511EXPORT_SYMBOL_GPL(rcutorture_record_progress);
484 512
485/* 513/*
486 * Force a quiescent state for RCU-sched.
487 */
488void rcu_sched_force_quiescent_state(void)
489{
490 force_quiescent_state(&rcu_sched_state);
491}
492EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state);
493
494/*
495 * Does the CPU have callbacks ready to be invoked? 514 * Does the CPU have callbacks ready to be invoked?
496 */ 515 */
497static int 516static int
@@ -954,7 +973,7 @@ bool rcu_lockdep_current_cpu_online(void)
954 preempt_disable(); 973 preempt_disable();
955 rdp = this_cpu_ptr(&rcu_sched_data); 974 rdp = this_cpu_ptr(&rcu_sched_data);
956 rnp = rdp->mynode; 975 rnp = rdp->mynode;
957 ret = (rdp->grpmask & rnp->qsmaskinit) || 976 ret = (rdp->grpmask & rcu_rnp_online_cpus(rnp)) ||
958 !rcu_scheduler_fully_active; 977 !rcu_scheduler_fully_active;
959 preempt_enable(); 978 preempt_enable();
960 return ret; 979 return ret;
@@ -1196,9 +1215,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
1196 } else { 1215 } else {
1197 j = jiffies; 1216 j = jiffies;
1198 gpa = ACCESS_ONCE(rsp->gp_activity); 1217 gpa = ACCESS_ONCE(rsp->gp_activity);
1199 pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld\n", 1218 pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n",
1200 rsp->name, j - gpa, j, gpa, 1219 rsp->name, j - gpa, j, gpa,
1201 jiffies_till_next_fqs); 1220 jiffies_till_next_fqs,
1221 rcu_get_root(rsp)->qsmask);
1202 /* In this case, the current CPU might be at fault. */ 1222 /* In this case, the current CPU might be at fault. */
1203 sched_show_task(current); 1223 sched_show_task(current);
1204 } 1224 }
@@ -1328,20 +1348,30 @@ void rcu_cpu_stall_reset(void)
1328} 1348}
1329 1349
1330/* 1350/*
1331 * Initialize the specified rcu_data structure's callback list to empty. 1351 * Initialize the specified rcu_data structure's default callback list
1352 * to empty. The default callback list is the one that is not used by
1353 * no-callbacks CPUs.
1332 */ 1354 */
1333static void init_callback_list(struct rcu_data *rdp) 1355static void init_default_callback_list(struct rcu_data *rdp)
1334{ 1356{
1335 int i; 1357 int i;
1336 1358
1337 if (init_nocb_callback_list(rdp))
1338 return;
1339 rdp->nxtlist = NULL; 1359 rdp->nxtlist = NULL;
1340 for (i = 0; i < RCU_NEXT_SIZE; i++) 1360 for (i = 0; i < RCU_NEXT_SIZE; i++)
1341 rdp->nxttail[i] = &rdp->nxtlist; 1361 rdp->nxttail[i] = &rdp->nxtlist;
1342} 1362}
1343 1363
1344/* 1364/*
1365 * Initialize the specified rcu_data structure's callback list to empty.
1366 */
1367static void init_callback_list(struct rcu_data *rdp)
1368{
1369 if (init_nocb_callback_list(rdp))
1370 return;
1371 init_default_callback_list(rdp);
1372}
1373
1374/*
1345 * Determine the value that ->completed will have at the end of the 1375 * Determine the value that ->completed will have at the end of the
1346 * next subsequent grace period. This is used to tag callbacks so that 1376 * next subsequent grace period. This is used to tag callbacks so that
1347 * a CPU can invoke callbacks in a timely fashion even if that CPU has 1377 * a CPU can invoke callbacks in a timely fashion even if that CPU has
@@ -1703,11 +1733,11 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
1703 */ 1733 */
1704static int rcu_gp_init(struct rcu_state *rsp) 1734static int rcu_gp_init(struct rcu_state *rsp)
1705{ 1735{
1736 unsigned long oldmask;
1706 struct rcu_data *rdp; 1737 struct rcu_data *rdp;
1707 struct rcu_node *rnp = rcu_get_root(rsp); 1738 struct rcu_node *rnp = rcu_get_root(rsp);
1708 1739
1709 ACCESS_ONCE(rsp->gp_activity) = jiffies; 1740 ACCESS_ONCE(rsp->gp_activity) = jiffies;
1710 rcu_bind_gp_kthread();
1711 raw_spin_lock_irq(&rnp->lock); 1741 raw_spin_lock_irq(&rnp->lock);
1712 smp_mb__after_unlock_lock(); 1742 smp_mb__after_unlock_lock();
1713 if (!ACCESS_ONCE(rsp->gp_flags)) { 1743 if (!ACCESS_ONCE(rsp->gp_flags)) {
@@ -1733,9 +1763,54 @@ static int rcu_gp_init(struct rcu_state *rsp)
1733 trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); 1763 trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start"));
1734 raw_spin_unlock_irq(&rnp->lock); 1764 raw_spin_unlock_irq(&rnp->lock);
1735 1765
1736 /* Exclude any concurrent CPU-hotplug operations. */ 1766 /*
1737 mutex_lock(&rsp->onoff_mutex); 1767 * Apply per-leaf buffered online and offline operations to the
1738 smp_mb__after_unlock_lock(); /* ->gpnum increment before GP! */ 1768 * rcu_node tree. Note that this new grace period need not wait
1769 * for subsequent online CPUs, and that quiescent-state forcing
1770 * will handle subsequent offline CPUs.
1771 */
1772 rcu_for_each_leaf_node(rsp, rnp) {
1773 raw_spin_lock_irq(&rnp->lock);
1774 smp_mb__after_unlock_lock();
1775 if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
1776 !rnp->wait_blkd_tasks) {
1777 /* Nothing to do on this leaf rcu_node structure. */
1778 raw_spin_unlock_irq(&rnp->lock);
1779 continue;
1780 }
1781
1782 /* Record old state, apply changes to ->qsmaskinit field. */
1783 oldmask = rnp->qsmaskinit;
1784 rnp->qsmaskinit = rnp->qsmaskinitnext;
1785
1786 /* If zero-ness of ->qsmaskinit changed, propagate up tree. */
1787 if (!oldmask != !rnp->qsmaskinit) {
1788 if (!oldmask) /* First online CPU for this rcu_node. */
1789 rcu_init_new_rnp(rnp);
1790 else if (rcu_preempt_has_tasks(rnp)) /* blocked tasks */
1791 rnp->wait_blkd_tasks = true;
1792 else /* Last offline CPU and can propagate. */
1793 rcu_cleanup_dead_rnp(rnp);
1794 }
1795
1796 /*
1797 * If all waited-on tasks from prior grace period are
1798 * done, and if all this rcu_node structure's CPUs are
1799 * still offline, propagate up the rcu_node tree and
1800 * clear ->wait_blkd_tasks. Otherwise, if one of this
1801 * rcu_node structure's CPUs has since come back online,
1802 * simply clear ->wait_blkd_tasks (but rcu_cleanup_dead_rnp()
1803 * checks for this, so just call it unconditionally).
1804 */
1805 if (rnp->wait_blkd_tasks &&
1806 (!rcu_preempt_has_tasks(rnp) ||
1807 rnp->qsmaskinit)) {
1808 rnp->wait_blkd_tasks = false;
1809 rcu_cleanup_dead_rnp(rnp);
1810 }
1811
1812 raw_spin_unlock_irq(&rnp->lock);
1813 }
1739 1814
1740 /* 1815 /*
1741 * Set the quiescent-state-needed bits in all the rcu_node 1816 * Set the quiescent-state-needed bits in all the rcu_node
@@ -1757,8 +1832,8 @@ static int rcu_gp_init(struct rcu_state *rsp)
1757 rcu_preempt_check_blocked_tasks(rnp); 1832 rcu_preempt_check_blocked_tasks(rnp);
1758 rnp->qsmask = rnp->qsmaskinit; 1833 rnp->qsmask = rnp->qsmaskinit;
1759 ACCESS_ONCE(rnp->gpnum) = rsp->gpnum; 1834 ACCESS_ONCE(rnp->gpnum) = rsp->gpnum;
1760 WARN_ON_ONCE(rnp->completed != rsp->completed); 1835 if (WARN_ON_ONCE(rnp->completed != rsp->completed))
1761 ACCESS_ONCE(rnp->completed) = rsp->completed; 1836 ACCESS_ONCE(rnp->completed) = rsp->completed;
1762 if (rnp == rdp->mynode) 1837 if (rnp == rdp->mynode)
1763 (void)__note_gp_changes(rsp, rnp, rdp); 1838 (void)__note_gp_changes(rsp, rnp, rdp);
1764 rcu_preempt_boost_start_gp(rnp); 1839 rcu_preempt_boost_start_gp(rnp);
@@ -1768,9 +1843,12 @@ static int rcu_gp_init(struct rcu_state *rsp)
1768 raw_spin_unlock_irq(&rnp->lock); 1843 raw_spin_unlock_irq(&rnp->lock);
1769 cond_resched_rcu_qs(); 1844 cond_resched_rcu_qs();
1770 ACCESS_ONCE(rsp->gp_activity) = jiffies; 1845 ACCESS_ONCE(rsp->gp_activity) = jiffies;
1846 if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_SLOW_INIT) &&
1847 gp_init_delay > 0 &&
1848 !(rsp->gpnum % (rcu_num_nodes * 10)))
1849 schedule_timeout_uninterruptible(gp_init_delay);
1771 } 1850 }
1772 1851
1773 mutex_unlock(&rsp->onoff_mutex);
1774 return 1; 1852 return 1;
1775} 1853}
1776 1854
@@ -1798,7 +1876,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
1798 fqs_state = RCU_FORCE_QS; 1876 fqs_state = RCU_FORCE_QS;
1799 } else { 1877 } else {
1800 /* Handle dyntick-idle and offline CPUs. */ 1878 /* Handle dyntick-idle and offline CPUs. */
1801 isidle = false; 1879 isidle = true;
1802 force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj); 1880 force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj);
1803 } 1881 }
1804 /* Clear flag to prevent immediate re-entry. */ 1882 /* Clear flag to prevent immediate re-entry. */
@@ -1852,6 +1930,8 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
1852 rcu_for_each_node_breadth_first(rsp, rnp) { 1930 rcu_for_each_node_breadth_first(rsp, rnp) {
1853 raw_spin_lock_irq(&rnp->lock); 1931 raw_spin_lock_irq(&rnp->lock);
1854 smp_mb__after_unlock_lock(); 1932 smp_mb__after_unlock_lock();
1933 WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
1934 WARN_ON_ONCE(rnp->qsmask);
1855 ACCESS_ONCE(rnp->completed) = rsp->gpnum; 1935 ACCESS_ONCE(rnp->completed) = rsp->gpnum;
1856 rdp = this_cpu_ptr(rsp->rda); 1936 rdp = this_cpu_ptr(rsp->rda);
1857 if (rnp == rdp->mynode) 1937 if (rnp == rdp->mynode)
@@ -1895,6 +1975,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
1895 struct rcu_state *rsp = arg; 1975 struct rcu_state *rsp = arg;
1896 struct rcu_node *rnp = rcu_get_root(rsp); 1976 struct rcu_node *rnp = rcu_get_root(rsp);
1897 1977
1978 rcu_bind_gp_kthread();
1898 for (;;) { 1979 for (;;) {
1899 1980
1900 /* Handle grace-period start. */ 1981 /* Handle grace-period start. */
@@ -2062,25 +2143,32 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
2062 * Similar to rcu_report_qs_rdp(), for which it is a helper function. 2143 * Similar to rcu_report_qs_rdp(), for which it is a helper function.
2063 * Allows quiescent states for a group of CPUs to be reported at one go 2144 * Allows quiescent states for a group of CPUs to be reported at one go
2064 * to the specified rcu_node structure, though all the CPUs in the group 2145 * to the specified rcu_node structure, though all the CPUs in the group
2065 * must be represented by the same rcu_node structure (which need not be 2146 * must be represented by the same rcu_node structure (which need not be a
2066 * a leaf rcu_node structure, though it often will be). That structure's 2147 * leaf rcu_node structure, though it often will be). The gps parameter
2067 * lock must be held upon entry, and it is released before return. 2148 * is the grace-period snapshot, which means that the quiescent states
2149 * are valid only if rnp->gpnum is equal to gps. That structure's lock
2150 * must be held upon entry, and it is released before return.
2068 */ 2151 */
2069static void 2152static void
2070rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, 2153rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
2071 struct rcu_node *rnp, unsigned long flags) 2154 struct rcu_node *rnp, unsigned long gps, unsigned long flags)
2072 __releases(rnp->lock) 2155 __releases(rnp->lock)
2073{ 2156{
2157 unsigned long oldmask = 0;
2074 struct rcu_node *rnp_c; 2158 struct rcu_node *rnp_c;
2075 2159
2076 /* Walk up the rcu_node hierarchy. */ 2160 /* Walk up the rcu_node hierarchy. */
2077 for (;;) { 2161 for (;;) {
2078 if (!(rnp->qsmask & mask)) { 2162 if (!(rnp->qsmask & mask) || rnp->gpnum != gps) {
2079 2163
2080 /* Our bit has already been cleared, so done. */ 2164 /*
2165 * Our bit has already been cleared, or the
2166 * relevant grace period is already over, so done.
2167 */
2081 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2168 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2082 return; 2169 return;
2083 } 2170 }
2171 WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */
2084 rnp->qsmask &= ~mask; 2172 rnp->qsmask &= ~mask;
2085 trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum, 2173 trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum,
2086 mask, rnp->qsmask, rnp->level, 2174 mask, rnp->qsmask, rnp->level,
@@ -2104,7 +2192,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
2104 rnp = rnp->parent; 2192 rnp = rnp->parent;
2105 raw_spin_lock_irqsave(&rnp->lock, flags); 2193 raw_spin_lock_irqsave(&rnp->lock, flags);
2106 smp_mb__after_unlock_lock(); 2194 smp_mb__after_unlock_lock();
2107 WARN_ON_ONCE(rnp_c->qsmask); 2195 oldmask = rnp_c->qsmask;
2108 } 2196 }
2109 2197
2110 /* 2198 /*
@@ -2116,6 +2204,46 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
2116} 2204}
2117 2205
2118/* 2206/*
2207 * Record a quiescent state for all tasks that were previously queued
2208 * on the specified rcu_node structure and that were blocking the current
2209 * RCU grace period. The caller must hold the specified rnp->lock with
2210 * irqs disabled, and this lock is released upon return, but irqs remain
2211 * disabled.
2212 */
2213static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp,
2214 struct rcu_node *rnp, unsigned long flags)
2215 __releases(rnp->lock)
2216{
2217 unsigned long gps;
2218 unsigned long mask;
2219 struct rcu_node *rnp_p;
2220
2221 if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p ||
2222 rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
2223 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2224 return; /* Still need more quiescent states! */
2225 }
2226
2227 rnp_p = rnp->parent;
2228 if (rnp_p == NULL) {
2229 /*
2230 * Only one rcu_node structure in the tree, so don't
2231 * try to report up to its nonexistent parent!
2232 */
2233 rcu_report_qs_rsp(rsp, flags);
2234 return;
2235 }
2236
2237 /* Report up the rest of the hierarchy, tracking current ->gpnum. */
2238 gps = rnp->gpnum;
2239 mask = rnp->grpmask;
2240 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
2241 raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */
2242 smp_mb__after_unlock_lock();
2243 rcu_report_qs_rnp(mask, rsp, rnp_p, gps, flags);
2244}
2245
2246/*
2119 * Record a quiescent state for the specified CPU to that CPU's rcu_data 2247 * Record a quiescent state for the specified CPU to that CPU's rcu_data
2120 * structure. This must be either called from the specified CPU, or 2248 * structure. This must be either called from the specified CPU, or
2121 * called when the specified CPU is known to be offline (and when it is 2249 * called when the specified CPU is known to be offline (and when it is
@@ -2163,7 +2291,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
2163 */ 2291 */
2164 needwake = rcu_accelerate_cbs(rsp, rnp, rdp); 2292 needwake = rcu_accelerate_cbs(rsp, rnp, rdp);
2165 2293
2166 rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */ 2294 rcu_report_qs_rnp(mask, rsp, rnp, rnp->gpnum, flags);
2295 /* ^^^ Released rnp->lock */
2167 if (needwake) 2296 if (needwake)
2168 rcu_gp_kthread_wake(rsp); 2297 rcu_gp_kthread_wake(rsp);
2169 } 2298 }
@@ -2256,8 +2385,12 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
2256 rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL]; 2385 rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL];
2257 } 2386 }
2258 2387
2259 /* Finally, initialize the rcu_data structure's list to empty. */ 2388 /*
2389 * Finally, initialize the rcu_data structure's list to empty and
2390 * disallow further callbacks on this CPU.
2391 */
2260 init_callback_list(rdp); 2392 init_callback_list(rdp);
2393 rdp->nxttail[RCU_NEXT_TAIL] = NULL;
2261} 2394}
2262 2395
2263/* 2396/*
@@ -2355,6 +2488,7 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
2355 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 2488 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
2356 smp_mb__after_unlock_lock(); /* GP memory ordering. */ 2489 smp_mb__after_unlock_lock(); /* GP memory ordering. */
2357 rnp->qsmaskinit &= ~mask; 2490 rnp->qsmaskinit &= ~mask;
2491 rnp->qsmask &= ~mask;
2358 if (rnp->qsmaskinit) { 2492 if (rnp->qsmaskinit) {
2359 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 2493 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
2360 return; 2494 return;
@@ -2364,6 +2498,26 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
2364} 2498}
2365 2499
2366/* 2500/*
2501 * The CPU is exiting the idle loop into the arch_cpu_idle_dead()
2502 * function. We now remove it from the rcu_node tree's ->qsmaskinit
2503 * bit masks.
2504 */
2505static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
2506{
2507 unsigned long flags;
2508 unsigned long mask;
2509 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
2510 struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
2511
2512 /* Remove outgoing CPU from mask in the leaf rcu_node structure. */
2513 mask = rdp->grpmask;
2514 raw_spin_lock_irqsave(&rnp->lock, flags);
2515 smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */
2516 rnp->qsmaskinitnext &= ~mask;
2517 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2518}
2519
2520/*
2367 * The CPU has been completely removed, and some other CPU is reporting 2521 * The CPU has been completely removed, and some other CPU is reporting
2368 * this fact from process context. Do the remainder of the cleanup, 2522 * this fact from process context. Do the remainder of the cleanup,
2369 * including orphaning the outgoing CPU's RCU callbacks, and also 2523 * including orphaning the outgoing CPU's RCU callbacks, and also
@@ -2379,29 +2533,15 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
2379 /* Adjust any no-longer-needed kthreads. */ 2533 /* Adjust any no-longer-needed kthreads. */
2380 rcu_boost_kthread_setaffinity(rnp, -1); 2534 rcu_boost_kthread_setaffinity(rnp, -1);
2381 2535
2382 /* Exclude any attempts to start a new grace period. */
2383 mutex_lock(&rsp->onoff_mutex);
2384 raw_spin_lock_irqsave(&rsp->orphan_lock, flags);
2385
2386 /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ 2536 /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
2537 raw_spin_lock_irqsave(&rsp->orphan_lock, flags);
2387 rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); 2538 rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp);
2388 rcu_adopt_orphan_cbs(rsp, flags); 2539 rcu_adopt_orphan_cbs(rsp, flags);
2389 raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); 2540 raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags);
2390 2541
2391 /* Remove outgoing CPU from mask in the leaf rcu_node structure. */
2392 raw_spin_lock_irqsave(&rnp->lock, flags);
2393 smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */
2394 rnp->qsmaskinit &= ~rdp->grpmask;
2395 if (rnp->qsmaskinit == 0 && !rcu_preempt_has_tasks(rnp))
2396 rcu_cleanup_dead_rnp(rnp);
2397 rcu_report_qs_rnp(rdp->grpmask, rsp, rnp, flags); /* Rlses rnp->lock. */
2398 WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, 2542 WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL,
2399 "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", 2543 "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n",
2400 cpu, rdp->qlen, rdp->nxtlist); 2544 cpu, rdp->qlen, rdp->nxtlist);
2401 init_callback_list(rdp);
2402 /* Disallow further callbacks on this CPU. */
2403 rdp->nxttail[RCU_NEXT_TAIL] = NULL;
2404 mutex_unlock(&rsp->onoff_mutex);
2405} 2545}
2406 2546
2407#else /* #ifdef CONFIG_HOTPLUG_CPU */ 2547#else /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -2414,6 +2554,10 @@ static void __maybe_unused rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
2414{ 2554{
2415} 2555}
2416 2556
2557static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
2558{
2559}
2560
2417static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) 2561static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
2418{ 2562{
2419} 2563}
@@ -2589,26 +2733,47 @@ static void force_qs_rnp(struct rcu_state *rsp,
2589 return; 2733 return;
2590 } 2734 }
2591 if (rnp->qsmask == 0) { 2735 if (rnp->qsmask == 0) {
2592 rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ 2736 if (rcu_state_p == &rcu_sched_state ||
2593 continue; 2737 rsp != rcu_state_p ||
2738 rcu_preempt_blocked_readers_cgp(rnp)) {
2739 /*
2740 * No point in scanning bits because they
2741 * are all zero. But we might need to
2742 * priority-boost blocked readers.
2743 */
2744 rcu_initiate_boost(rnp, flags);
2745 /* rcu_initiate_boost() releases rnp->lock */
2746 continue;
2747 }
2748 if (rnp->parent &&
2749 (rnp->parent->qsmask & rnp->grpmask)) {
2750 /*
2751 * Race between grace-period
2752 * initialization and task exiting RCU
2753 * read-side critical section: Report.
2754 */
2755 rcu_report_unblock_qs_rnp(rsp, rnp, flags);
2756 /* rcu_report_unblock_qs_rnp() rlses ->lock */
2757 continue;
2758 }
2594 } 2759 }
2595 cpu = rnp->grplo; 2760 cpu = rnp->grplo;
2596 bit = 1; 2761 bit = 1;
2597 for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { 2762 for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
2598 if ((rnp->qsmask & bit) != 0) { 2763 if ((rnp->qsmask & bit) != 0) {
2599 if ((rnp->qsmaskinit & bit) != 0) 2764 if ((rnp->qsmaskinit & bit) == 0)
2600 *isidle = false; 2765 *isidle = false; /* Pending hotplug. */
2601 if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) 2766 if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj))
2602 mask |= bit; 2767 mask |= bit;
2603 } 2768 }
2604 } 2769 }
2605 if (mask != 0) { 2770 if (mask != 0) {
2606 2771 /* Idle/offline CPUs, report (releases rnp->lock). */
2607 /* rcu_report_qs_rnp() releases rnp->lock. */ 2772 rcu_report_qs_rnp(mask, rsp, rnp, rnp->gpnum, flags);
2608 rcu_report_qs_rnp(mask, rsp, rnp, flags); 2773 } else {
2609 continue; 2774 /* Nothing to do here, so just drop the lock. */
2775 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2610 } 2776 }
2611 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2612 } 2777 }
2613} 2778}
2614 2779
@@ -2741,7 +2906,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
2741 * If called from an extended quiescent state, invoke the RCU 2906 * If called from an extended quiescent state, invoke the RCU
2742 * core in order to force a re-evaluation of RCU's idleness. 2907 * core in order to force a re-evaluation of RCU's idleness.
2743 */ 2908 */
2744 if (!rcu_is_watching() && cpu_online(smp_processor_id())) 2909 if (!rcu_is_watching())
2745 invoke_rcu_core(); 2910 invoke_rcu_core();
2746 2911
2747 /* If interrupts were disabled or CPU offline, don't invoke RCU core. */ 2912 /* If interrupts were disabled or CPU offline, don't invoke RCU core. */
@@ -2827,11 +2992,22 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
2827 2992
2828 if (cpu != -1) 2993 if (cpu != -1)
2829 rdp = per_cpu_ptr(rsp->rda, cpu); 2994 rdp = per_cpu_ptr(rsp->rda, cpu);
2830 offline = !__call_rcu_nocb(rdp, head, lazy, flags); 2995 if (likely(rdp->mynode)) {
2831 WARN_ON_ONCE(offline); 2996 /* Post-boot, so this should be for a no-CBs CPU. */
2832 /* _call_rcu() is illegal on offline CPU; leak the callback. */ 2997 offline = !__call_rcu_nocb(rdp, head, lazy, flags);
2833 local_irq_restore(flags); 2998 WARN_ON_ONCE(offline);
2834 return; 2999 /* Offline CPU, _call_rcu() illegal, leak callback. */
3000 local_irq_restore(flags);
3001 return;
3002 }
3003 /*
3004 * Very early boot, before rcu_init(). Initialize if needed
3005 * and then drop through to queue the callback.
3006 */
3007 BUG_ON(cpu != -1);
3008 WARN_ON_ONCE(!rcu_is_watching());
3009 if (!likely(rdp->nxtlist))
3010 init_default_callback_list(rdp);
2835 } 3011 }
2836 ACCESS_ONCE(rdp->qlen) = rdp->qlen + 1; 3012 ACCESS_ONCE(rdp->qlen) = rdp->qlen + 1;
2837 if (lazy) 3013 if (lazy)
@@ -2954,7 +3130,7 @@ void synchronize_sched(void)
2954 "Illegal synchronize_sched() in RCU-sched read-side critical section"); 3130 "Illegal synchronize_sched() in RCU-sched read-side critical section");
2955 if (rcu_blocking_is_gp()) 3131 if (rcu_blocking_is_gp())
2956 return; 3132 return;
2957 if (rcu_expedited) 3133 if (rcu_gp_is_expedited())
2958 synchronize_sched_expedited(); 3134 synchronize_sched_expedited();
2959 else 3135 else
2960 wait_rcu_gp(call_rcu_sched); 3136 wait_rcu_gp(call_rcu_sched);
@@ -2981,7 +3157,7 @@ void synchronize_rcu_bh(void)
2981 "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section"); 3157 "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section");
2982 if (rcu_blocking_is_gp()) 3158 if (rcu_blocking_is_gp())
2983 return; 3159 return;
2984 if (rcu_expedited) 3160 if (rcu_gp_is_expedited())
2985 synchronize_rcu_bh_expedited(); 3161 synchronize_rcu_bh_expedited();
2986 else 3162 else
2987 wait_rcu_gp(call_rcu_bh); 3163 wait_rcu_gp(call_rcu_bh);
@@ -3518,6 +3694,28 @@ void rcu_barrier_sched(void)
3518EXPORT_SYMBOL_GPL(rcu_barrier_sched); 3694EXPORT_SYMBOL_GPL(rcu_barrier_sched);
3519 3695
3520/* 3696/*
3697 * Propagate ->qsinitmask bits up the rcu_node tree to account for the
3698 * first CPU in a given leaf rcu_node structure coming online. The caller
 3699 * must hold the corresponding leaf rcu_node ->lock with interrupts
3700 * disabled.
3701 */
3702static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
3703{
3704 long mask;
3705 struct rcu_node *rnp = rnp_leaf;
3706
3707 for (;;) {
3708 mask = rnp->grpmask;
3709 rnp = rnp->parent;
3710 if (rnp == NULL)
3711 return;
3712 raw_spin_lock(&rnp->lock); /* Interrupts already disabled. */
3713 rnp->qsmaskinit |= mask;
3714 raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */
3715 }
3716}
3717
3718/*
3521 * Do boot-time initialization of a CPU's per-CPU RCU data. 3719 * Do boot-time initialization of a CPU's per-CPU RCU data.
3522 */ 3720 */
3523static void __init 3721static void __init
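
The new rcu_init_new_rnp() above propagates a leaf's ->grpmask bit into every ancestor's ->qsmaskinit when the first CPU of that leaf comes online. A minimal user-space sketch of that walk-up-to-root pattern follows; the types are illustrative and the per-node locking done by the kernel version is only noted in a comment.

#include <stdio.h>

struct node {
	struct node *parent;
	unsigned long grpmask;	/* our single bit in the parent's mask */
	unsigned long qsmaskinit;
};

/* Set this node's bit in every ancestor, mirroring rcu_init_new_rnp(). */
static void propagate_online(struct node *leaf)
{
	unsigned long mask;
	struct node *np = leaf;

	for (;;) {
		mask = np->grpmask;
		np = np->parent;
		if (!np)
			return;
		np->qsmaskinit |= mask;	/* kernel code holds np->lock here */
	}
}

int main(void)
{
	struct node root = { 0 };
	struct node leaf = { .parent = &root, .grpmask = 0x4 };

	propagate_online(&leaf);
	printf("root qsmaskinit: %#lx\n", root.qsmaskinit);
	return 0;
}
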
@@ -3553,49 +3751,37 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
3553 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 3751 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
3554 struct rcu_node *rnp = rcu_get_root(rsp); 3752 struct rcu_node *rnp = rcu_get_root(rsp);
3555 3753
3556 /* Exclude new grace periods. */
3557 mutex_lock(&rsp->onoff_mutex);
3558
3559 /* Set up local state, ensuring consistent view of global state. */ 3754 /* Set up local state, ensuring consistent view of global state. */
3560 raw_spin_lock_irqsave(&rnp->lock, flags); 3755 raw_spin_lock_irqsave(&rnp->lock, flags);
3561 rdp->beenonline = 1; /* We have now been online. */ 3756 rdp->beenonline = 1; /* We have now been online. */
3562 rdp->qlen_last_fqs_check = 0; 3757 rdp->qlen_last_fqs_check = 0;
3563 rdp->n_force_qs_snap = rsp->n_force_qs; 3758 rdp->n_force_qs_snap = rsp->n_force_qs;
3564 rdp->blimit = blimit; 3759 rdp->blimit = blimit;
3565 init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ 3760 if (!rdp->nxtlist)
3761 init_callback_list(rdp); /* Re-enable callbacks on this CPU. */
3566 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; 3762 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
3567 rcu_sysidle_init_percpu_data(rdp->dynticks); 3763 rcu_sysidle_init_percpu_data(rdp->dynticks);
3568 atomic_set(&rdp->dynticks->dynticks, 3764 atomic_set(&rdp->dynticks->dynticks,
3569 (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); 3765 (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
3570 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 3766 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
3571 3767
3572 /* Add CPU to rcu_node bitmasks. */ 3768 /*
3769 * Add CPU to leaf rcu_node pending-online bitmask. Any needed
3770 * propagation up the rcu_node tree will happen at the beginning
3771 * of the next grace period.
3772 */
3573 rnp = rdp->mynode; 3773 rnp = rdp->mynode;
3574 mask = rdp->grpmask; 3774 mask = rdp->grpmask;
3575 do { 3775 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
3576 /* Exclude any attempts to start a new GP on small systems. */ 3776 smp_mb__after_unlock_lock();
3577 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 3777 rnp->qsmaskinitnext |= mask;
3578 rnp->qsmaskinit |= mask; 3778 rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */
3579 mask = rnp->grpmask; 3779 rdp->completed = rnp->completed;
3580 if (rnp == rdp->mynode) { 3780 rdp->passed_quiesce = false;
3581 /* 3781 rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
3582 * If there is a grace period in progress, we will 3782 rdp->qs_pending = false;
3583 * set up to wait for it next time we run the 3783 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
3584 * RCU core code. 3784 raw_spin_unlock_irqrestore(&rnp->lock, flags);
3585 */
3586 rdp->gpnum = rnp->completed;
3587 rdp->completed = rnp->completed;
3588 rdp->passed_quiesce = 0;
3589 rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
3590 rdp->qs_pending = 0;
3591 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
3592 }
3593 raw_spin_unlock(&rnp->lock); /* irqs already disabled. */
3594 rnp = rnp->parent;
3595 } while (rnp != NULL && !(rnp->qsmaskinit & mask));
3596 local_irq_restore(flags);
3597
3598 mutex_unlock(&rsp->onoff_mutex);
3599} 3785}
3600 3786
3601static void rcu_prepare_cpu(int cpu) 3787static void rcu_prepare_cpu(int cpu)
@@ -3609,15 +3795,14 @@ static void rcu_prepare_cpu(int cpu)
3609/* 3795/*
3610 * Handle CPU online/offline notification events. 3796 * Handle CPU online/offline notification events.
3611 */ 3797 */
3612static int rcu_cpu_notify(struct notifier_block *self, 3798int rcu_cpu_notify(struct notifier_block *self,
3613 unsigned long action, void *hcpu) 3799 unsigned long action, void *hcpu)
3614{ 3800{
3615 long cpu = (long)hcpu; 3801 long cpu = (long)hcpu;
3616 struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu); 3802 struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
3617 struct rcu_node *rnp = rdp->mynode; 3803 struct rcu_node *rnp = rdp->mynode;
3618 struct rcu_state *rsp; 3804 struct rcu_state *rsp;
3619 3805
3620 trace_rcu_utilization(TPS("Start CPU hotplug"));
3621 switch (action) { 3806 switch (action) {
3622 case CPU_UP_PREPARE: 3807 case CPU_UP_PREPARE:
3623 case CPU_UP_PREPARE_FROZEN: 3808 case CPU_UP_PREPARE_FROZEN:
@@ -3637,6 +3822,11 @@ static int rcu_cpu_notify(struct notifier_block *self,
3637 for_each_rcu_flavor(rsp) 3822 for_each_rcu_flavor(rsp)
3638 rcu_cleanup_dying_cpu(rsp); 3823 rcu_cleanup_dying_cpu(rsp);
3639 break; 3824 break;
3825 case CPU_DYING_IDLE:
3826 for_each_rcu_flavor(rsp) {
3827 rcu_cleanup_dying_idle_cpu(cpu, rsp);
3828 }
3829 break;
3640 case CPU_DEAD: 3830 case CPU_DEAD:
3641 case CPU_DEAD_FROZEN: 3831 case CPU_DEAD_FROZEN:
3642 case CPU_UP_CANCELED: 3832 case CPU_UP_CANCELED:
@@ -3649,7 +3839,6 @@ static int rcu_cpu_notify(struct notifier_block *self,
3649 default: 3839 default:
3650 break; 3840 break;
3651 } 3841 }
3652 trace_rcu_utilization(TPS("End CPU hotplug"));
3653 return NOTIFY_OK; 3842 return NOTIFY_OK;
3654} 3843}
3655 3844
@@ -3660,11 +3849,12 @@ static int rcu_pm_notify(struct notifier_block *self,
3660 case PM_HIBERNATION_PREPARE: 3849 case PM_HIBERNATION_PREPARE:
3661 case PM_SUSPEND_PREPARE: 3850 case PM_SUSPEND_PREPARE:
3662 if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */ 3851 if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */
3663 rcu_expedited = 1; 3852 rcu_expedite_gp();
3664 break; 3853 break;
3665 case PM_POST_HIBERNATION: 3854 case PM_POST_HIBERNATION:
3666 case PM_POST_SUSPEND: 3855 case PM_POST_SUSPEND:
3667 rcu_expedited = 0; 3856 if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */
3857 rcu_unexpedite_gp();
3668 break; 3858 break;
3669 default: 3859 default:
3670 break; 3860 break;
@@ -3734,30 +3924,26 @@ void rcu_scheduler_starting(void)
3734 * Compute the per-level fanout, either using the exact fanout specified 3924 * Compute the per-level fanout, either using the exact fanout specified
3735 * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT. 3925 * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT.
3736 */ 3926 */
3737#ifdef CONFIG_RCU_FANOUT_EXACT
3738static void __init rcu_init_levelspread(struct rcu_state *rsp)
3739{
3740 int i;
3741
3742 rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf;
3743 for (i = rcu_num_lvls - 2; i >= 0; i--)
3744 rsp->levelspread[i] = CONFIG_RCU_FANOUT;
3745}
3746#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
3747static void __init rcu_init_levelspread(struct rcu_state *rsp) 3927static void __init rcu_init_levelspread(struct rcu_state *rsp)
3748{ 3928{
3749 int ccur;
3750 int cprv;
3751 int i; 3929 int i;
3752 3930
3753 cprv = nr_cpu_ids; 3931 if (IS_ENABLED(CONFIG_RCU_FANOUT_EXACT)) {
3754 for (i = rcu_num_lvls - 1; i >= 0; i--) { 3932 rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf;
3755 ccur = rsp->levelcnt[i]; 3933 for (i = rcu_num_lvls - 2; i >= 0; i--)
3756 rsp->levelspread[i] = (cprv + ccur - 1) / ccur; 3934 rsp->levelspread[i] = CONFIG_RCU_FANOUT;
3757 cprv = ccur; 3935 } else {
3936 int ccur;
3937 int cprv;
3938
3939 cprv = nr_cpu_ids;
3940 for (i = rcu_num_lvls - 1; i >= 0; i--) {
3941 ccur = rsp->levelcnt[i];
3942 rsp->levelspread[i] = (cprv + ccur - 1) / ccur;
3943 cprv = ccur;
3944 }
3758 } 3945 }
3759} 3946}
3760#endif /* #else #ifdef CONFIG_RCU_FANOUT_EXACT */
3761 3947
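
With CONFIG_RCU_FANOUT_EXACT now handled by IS_ENABLED() inside a single rcu_init_levelspread(), the balanced path still derives each level's spread as a ceiling division of the count one level down. A small sketch of that arithmetic, using made-up level counts rather than the kernel's geometry code:

#include <stdio.h>

#define MAX_LVLS 4

/* Balanced spread: levelspread[i] = ceil(levels-below / levelcnt[i]). */
static void init_levelspread(int *levelspread, const int *levelcnt,
			     int num_lvls, int nr_cpus)
{
	int cprv = nr_cpus;
	int i;

	for (i = num_lvls - 1; i >= 0; i--) {
		int ccur = levelcnt[i];

		levelspread[i] = (cprv + ccur - 1) / ccur; /* ceiling division */
		cprv = ccur;
	}
}

int main(void)
{
	int levelcnt[MAX_LVLS] = { 1, 4, 64 };	/* hypothetical 3-level tree */
	int levelspread[MAX_LVLS];
	int i;

	init_levelspread(levelspread, levelcnt, 3, 1024);
	for (i = 0; i < 3; i++)
		printf("level %d: spread %d\n", i, levelspread[i]);
	return 0;
}
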
3762/* 3948/*
3763 * Helper function for rcu_init() that initializes one rcu_state structure. 3949 * Helper function for rcu_init() that initializes one rcu_state structure.
@@ -3833,7 +4019,6 @@ static void __init rcu_init_one(struct rcu_state *rsp,
3833 } 4019 }
3834 } 4020 }
3835 4021
3836 rsp->rda = rda;
3837 init_waitqueue_head(&rsp->gp_wq); 4022 init_waitqueue_head(&rsp->gp_wq);
3838 rnp = rsp->level[rcu_num_lvls - 1]; 4023 rnp = rsp->level[rcu_num_lvls - 1];
3839 for_each_possible_cpu(i) { 4024 for_each_possible_cpu(i) {
@@ -3926,6 +4111,8 @@ void __init rcu_init(void)
3926{ 4111{
3927 int cpu; 4112 int cpu;
3928 4113
4114 rcu_early_boot_tests();
4115
3929 rcu_bootup_announce(); 4116 rcu_bootup_announce();
3930 rcu_init_geometry(); 4117 rcu_init_geometry();
3931 rcu_init_one(&rcu_bh_state, &rcu_bh_data); 4118 rcu_init_one(&rcu_bh_state, &rcu_bh_data);
@@ -3942,8 +4129,6 @@ void __init rcu_init(void)
3942 pm_notifier(rcu_pm_notify, 0); 4129 pm_notifier(rcu_pm_notify, 0);
3943 for_each_online_cpu(cpu) 4130 for_each_online_cpu(cpu)
3944 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); 4131 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
3945
3946 rcu_early_boot_tests();
3947} 4132}
3948 4133
3949#include "tree_plugin.h" 4134#include "tree_plugin.h"
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 119de399eb2f..a69d3dab2ec4 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -141,12 +141,20 @@ struct rcu_node {
141 /* complete (only for PREEMPT_RCU). */ 141 /* complete (only for PREEMPT_RCU). */
142 unsigned long qsmaskinit; 142 unsigned long qsmaskinit;
143 /* Per-GP initial value for qsmask & expmask. */ 143 /* Per-GP initial value for qsmask & expmask. */
144 /* Initialized from ->qsmaskinitnext at the */
145 /* beginning of each grace period. */
146 unsigned long qsmaskinitnext;
147 /* Online CPUs for next grace period. */
144 unsigned long grpmask; /* Mask to apply to parent qsmask. */ 148 unsigned long grpmask; /* Mask to apply to parent qsmask. */
145 /* Only one bit will be set in this mask. */ 149 /* Only one bit will be set in this mask. */
146 int grplo; /* lowest-numbered CPU or group here. */ 150 int grplo; /* lowest-numbered CPU or group here. */
147 int grphi; /* highest-numbered CPU or group here. */ 151 int grphi; /* highest-numbered CPU or group here. */
148 u8 grpnum; /* CPU/group number for next level up. */ 152 u8 grpnum; /* CPU/group number for next level up. */
149 u8 level; /* root is at level 0. */ 153 u8 level; /* root is at level 0. */
154 bool wait_blkd_tasks;/* Necessary to wait for blocked tasks to */
155 /* exit RCU read-side critical sections */
156 /* before propagating offline up the */
157 /* rcu_node tree? */
150 struct rcu_node *parent; 158 struct rcu_node *parent;
151 struct list_head blkd_tasks; 159 struct list_head blkd_tasks;
152 /* Tasks blocked in RCU read-side critical */ 160 /* Tasks blocked in RCU read-side critical */
@@ -448,8 +456,6 @@ struct rcu_state {
448 long qlen; /* Total number of callbacks. */ 456 long qlen; /* Total number of callbacks. */
449 /* End of fields guarded by orphan_lock. */ 457 /* End of fields guarded by orphan_lock. */
450 458
451 struct mutex onoff_mutex; /* Coordinate hotplug & GPs. */
452
453 struct mutex barrier_mutex; /* Guards barrier fields. */ 459 struct mutex barrier_mutex; /* Guards barrier fields. */
454 atomic_t barrier_cpu_count; /* # CPUs waiting on. */ 460 atomic_t barrier_cpu_count; /* # CPUs waiting on. */
455 struct completion barrier_completion; /* Wake at barrier end. */ 461 struct completion barrier_completion; /* Wake at barrier end. */
@@ -559,6 +565,7 @@ static void rcu_prepare_kthreads(int cpu);
559static void rcu_cleanup_after_idle(void); 565static void rcu_cleanup_after_idle(void);
560static void rcu_prepare_for_idle(void); 566static void rcu_prepare_for_idle(void);
561static void rcu_idle_count_callbacks_posted(void); 567static void rcu_idle_count_callbacks_posted(void);
568static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
562static void print_cpu_stall_info_begin(void); 569static void print_cpu_stall_info_begin(void);
563static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); 570static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);
564static void print_cpu_stall_info_end(void); 571static void print_cpu_stall_info_end(void);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 0a571e9a0f1d..8c0ec0f5a027 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -58,38 +58,33 @@ static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */
58 */ 58 */
59static void __init rcu_bootup_announce_oddness(void) 59static void __init rcu_bootup_announce_oddness(void)
60{ 60{
61#ifdef CONFIG_RCU_TRACE 61 if (IS_ENABLED(CONFIG_RCU_TRACE))
62 pr_info("\tRCU debugfs-based tracing is enabled.\n"); 62 pr_info("\tRCU debugfs-based tracing is enabled.\n");
63#endif 63 if ((IS_ENABLED(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) ||
64#if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32) 64 (!IS_ENABLED(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32))
65 pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n", 65 pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n",
66 CONFIG_RCU_FANOUT); 66 CONFIG_RCU_FANOUT);
67#endif 67 if (IS_ENABLED(CONFIG_RCU_FANOUT_EXACT))
68#ifdef CONFIG_RCU_FANOUT_EXACT 68 pr_info("\tHierarchical RCU autobalancing is disabled.\n");
69 pr_info("\tHierarchical RCU autobalancing is disabled.\n"); 69 if (IS_ENABLED(CONFIG_RCU_FAST_NO_HZ))
70#endif 70 pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n");
71#ifdef CONFIG_RCU_FAST_NO_HZ 71 if (IS_ENABLED(CONFIG_PROVE_RCU))
72 pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n"); 72 pr_info("\tRCU lockdep checking is enabled.\n");
73#endif 73 if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_RUNNABLE))
74#ifdef CONFIG_PROVE_RCU 74 pr_info("\tRCU torture testing starts during boot.\n");
75 pr_info("\tRCU lockdep checking is enabled.\n"); 75 if (IS_ENABLED(CONFIG_RCU_CPU_STALL_INFO))
76#endif 76 pr_info("\tAdditional per-CPU info printed with stalls.\n");
77#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE 77 if (NUM_RCU_LVL_4 != 0)
78 pr_info("\tRCU torture testing starts during boot.\n"); 78 pr_info("\tFour-level hierarchy is enabled.\n");
79#endif 79 if (CONFIG_RCU_FANOUT_LEAF != 16)
80#if defined(CONFIG_RCU_CPU_STALL_INFO) 80 pr_info("\tBuild-time adjustment of leaf fanout to %d.\n",
81 pr_info("\tAdditional per-CPU info printed with stalls.\n"); 81 CONFIG_RCU_FANOUT_LEAF);
82#endif
83#if NUM_RCU_LVL_4 != 0
84 pr_info("\tFour-level hierarchy is enabled.\n");
85#endif
86 if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF) 82 if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF)
87 pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); 83 pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
88 if (nr_cpu_ids != NR_CPUS) 84 if (nr_cpu_ids != NR_CPUS)
89 pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); 85 pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
90#ifdef CONFIG_RCU_BOOST 86 if (IS_ENABLED(CONFIG_RCU_BOOST))
91 pr_info("\tRCU kthread priority: %d.\n", kthread_prio); 87 pr_info("\tRCU kthread priority: %d.\n", kthread_prio);
92#endif
93} 88}
94 89
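
The rcu_bootup_announce_oddness() rewrite above replaces #ifdef blocks with IS_ENABLED() tests, which expand to a compile-time constant 0 or 1 so the dead branch is discarded while still being parsed and type-checked. A short illustration of the same idiom outside RCU; CONFIG_FOO_DEBUG is a made-up option used only to show the pattern.

#include <linux/kconfig.h>
#include <linux/printk.h>

/* CONFIG_FOO_DEBUG is hypothetical; the point is the IS_ENABLED() idiom. */
static void foo_announce(void)
{
	if (IS_ENABLED(CONFIG_FOO_DEBUG))
		pr_info("foo: debug instrumentation is enabled.\n");
	if (IS_ENABLED(CONFIG_64BIT))
		pr_info("foo: running on a 64-bit build.\n");
}
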
95#ifdef CONFIG_PREEMPT_RCU 90#ifdef CONFIG_PREEMPT_RCU
@@ -180,7 +175,7 @@ static void rcu_preempt_note_context_switch(void)
180 * But first, note that the current CPU must still be 175 * But first, note that the current CPU must still be
181 * on line! 176 * on line!
182 */ 177 */
183 WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0); 178 WARN_ON_ONCE((rdp->grpmask & rcu_rnp_online_cpus(rnp)) == 0);
184 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); 179 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
185 if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) { 180 if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) {
186 list_add(&t->rcu_node_entry, rnp->gp_tasks->prev); 181 list_add(&t->rcu_node_entry, rnp->gp_tasks->prev);
@@ -233,43 +228,6 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
233} 228}
234 229
235/* 230/*
236 * Record a quiescent state for all tasks that were previously queued
237 * on the specified rcu_node structure and that were blocking the current
238 * RCU grace period. The caller must hold the specified rnp->lock with
239 * irqs disabled, and this lock is released upon return, but irqs remain
240 * disabled.
241 */
242static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
243 __releases(rnp->lock)
244{
245 unsigned long mask;
246 struct rcu_node *rnp_p;
247
248 if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
249 raw_spin_unlock_irqrestore(&rnp->lock, flags);
250 return; /* Still need more quiescent states! */
251 }
252
253 rnp_p = rnp->parent;
254 if (rnp_p == NULL) {
255 /*
256 * Either there is only one rcu_node in the tree,
257 * or tasks were kicked up to root rcu_node due to
258 * CPUs going offline.
259 */
260 rcu_report_qs_rsp(&rcu_preempt_state, flags);
261 return;
262 }
263
264 /* Report up the rest of the hierarchy. */
265 mask = rnp->grpmask;
266 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
267 raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */
268 smp_mb__after_unlock_lock();
269 rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags);
270}
271
272/*
273 * Advance a ->blkd_tasks-list pointer to the next entry, instead 231 * Advance a ->blkd_tasks-list pointer to the next entry, instead
274 * returning NULL if at the end of the list. 232 * returning NULL if at the end of the list.
275 */ 233 */
@@ -300,7 +258,6 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
300 */ 258 */
301void rcu_read_unlock_special(struct task_struct *t) 259void rcu_read_unlock_special(struct task_struct *t)
302{ 260{
303 bool empty;
304 bool empty_exp; 261 bool empty_exp;
305 bool empty_norm; 262 bool empty_norm;
306 bool empty_exp_now; 263 bool empty_exp_now;
@@ -334,7 +291,13 @@ void rcu_read_unlock_special(struct task_struct *t)
334 } 291 }
335 292
336 /* Hardware IRQ handlers cannot block, complain if they get here. */ 293 /* Hardware IRQ handlers cannot block, complain if they get here. */
337 if (WARN_ON_ONCE(in_irq() || in_serving_softirq())) { 294 if (in_irq() || in_serving_softirq()) {
295 lockdep_rcu_suspicious(__FILE__, __LINE__,
296 "rcu_read_unlock() from irq or softirq with blocking in critical section!!!\n");
297 pr_alert("->rcu_read_unlock_special: %#x (b: %d, nq: %d)\n",
298 t->rcu_read_unlock_special.s,
299 t->rcu_read_unlock_special.b.blocked,
300 t->rcu_read_unlock_special.b.need_qs);
338 local_irq_restore(flags); 301 local_irq_restore(flags);
339 return; 302 return;
340 } 303 }
@@ -356,7 +319,6 @@ void rcu_read_unlock_special(struct task_struct *t)
356 break; 319 break;
357 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 320 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
358 } 321 }
359 empty = !rcu_preempt_has_tasks(rnp);
360 empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); 322 empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
361 empty_exp = !rcu_preempted_readers_exp(rnp); 323 empty_exp = !rcu_preempted_readers_exp(rnp);
362 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ 324 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
@@ -377,14 +339,6 @@ void rcu_read_unlock_special(struct task_struct *t)
377#endif /* #ifdef CONFIG_RCU_BOOST */ 339#endif /* #ifdef CONFIG_RCU_BOOST */
378 340
379 /* 341 /*
380 * If this was the last task on the list, go see if we
381 * need to propagate ->qsmaskinit bit clearing up the
382 * rcu_node tree.
383 */
384 if (!empty && !rcu_preempt_has_tasks(rnp))
385 rcu_cleanup_dead_rnp(rnp);
386
387 /*
388 * If this was the last task on the current list, and if 342 * If this was the last task on the current list, and if
389 * we aren't waiting on any CPUs, report the quiescent state. 343 * we aren't waiting on any CPUs, report the quiescent state.
390 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock, 344 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock,
@@ -399,7 +353,8 @@ void rcu_read_unlock_special(struct task_struct *t)
399 rnp->grplo, 353 rnp->grplo,
400 rnp->grphi, 354 rnp->grphi,
401 !!rnp->gp_tasks); 355 !!rnp->gp_tasks);
402 rcu_report_unblock_qs_rnp(rnp, flags); 356 rcu_report_unblock_qs_rnp(&rcu_preempt_state,
357 rnp, flags);
403 } else { 358 } else {
404 raw_spin_unlock_irqrestore(&rnp->lock, flags); 359 raw_spin_unlock_irqrestore(&rnp->lock, flags);
405 } 360 }
@@ -520,10 +475,6 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
520 WARN_ON_ONCE(rnp->qsmask); 475 WARN_ON_ONCE(rnp->qsmask);
521} 476}
522 477
523#ifdef CONFIG_HOTPLUG_CPU
524
525#endif /* #ifdef CONFIG_HOTPLUG_CPU */
526
527/* 478/*
528 * Check for a quiescent state from the current CPU. When a task blocks, 479 * Check for a quiescent state from the current CPU. When a task blocks,
529 * the task is recorded in the corresponding CPU's rcu_node structure, 480 * the task is recorded in the corresponding CPU's rcu_node structure,
@@ -585,7 +536,7 @@ void synchronize_rcu(void)
585 "Illegal synchronize_rcu() in RCU read-side critical section"); 536 "Illegal synchronize_rcu() in RCU read-side critical section");
586 if (!rcu_scheduler_active) 537 if (!rcu_scheduler_active)
587 return; 538 return;
588 if (rcu_expedited) 539 if (rcu_gp_is_expedited())
589 synchronize_rcu_expedited(); 540 synchronize_rcu_expedited();
590 else 541 else
591 wait_rcu_gp(call_rcu); 542 wait_rcu_gp(call_rcu);
@@ -630,9 +581,6 @@ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
630 * recursively up the tree. (Calm down, calm down, we do the recursion 581 * recursively up the tree. (Calm down, calm down, we do the recursion
631 * iteratively!) 582 * iteratively!)
632 * 583 *
633 * Most callers will set the "wake" flag, but the task initiating the
634 * expedited grace period need not wake itself.
635 *
636 * Caller must hold sync_rcu_preempt_exp_mutex. 584 * Caller must hold sync_rcu_preempt_exp_mutex.
637 */ 585 */
638static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, 586static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
@@ -667,29 +615,85 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
667 615
668/* 616/*
669 * Snapshot the tasks blocking the newly started preemptible-RCU expedited 617 * Snapshot the tasks blocking the newly started preemptible-RCU expedited
670 * grace period for the specified rcu_node structure. If there are no such 618 * grace period for the specified rcu_node structure, phase 1. If there
671 * tasks, report it up the rcu_node hierarchy. 619 * are such tasks, set the ->expmask bits up the rcu_node tree and also
620 * set the ->expmask bits on the leaf rcu_node structures to tell phase 2
621 * that work is needed here.
672 * 622 *
673 * Caller must hold sync_rcu_preempt_exp_mutex and must exclude 623 * Caller must hold sync_rcu_preempt_exp_mutex.
674 * CPU hotplug operations.
675 */ 624 */
676static void 625static void
677sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) 626sync_rcu_preempt_exp_init1(struct rcu_state *rsp, struct rcu_node *rnp)
678{ 627{
679 unsigned long flags; 628 unsigned long flags;
680 int must_wait = 0; 629 unsigned long mask;
630 struct rcu_node *rnp_up;
681 631
682 raw_spin_lock_irqsave(&rnp->lock, flags); 632 raw_spin_lock_irqsave(&rnp->lock, flags);
683 smp_mb__after_unlock_lock(); 633 smp_mb__after_unlock_lock();
634 WARN_ON_ONCE(rnp->expmask);
635 WARN_ON_ONCE(rnp->exp_tasks);
684 if (!rcu_preempt_has_tasks(rnp)) { 636 if (!rcu_preempt_has_tasks(rnp)) {
637 /* No blocked tasks, nothing to do. */
685 raw_spin_unlock_irqrestore(&rnp->lock, flags); 638 raw_spin_unlock_irqrestore(&rnp->lock, flags);
686 } else { 639 return;
640 }
641 /* Call for Phase 2 and propagate ->expmask bits up the tree. */
642 rnp->expmask = 1;
643 rnp_up = rnp;
644 while (rnp_up->parent) {
645 mask = rnp_up->grpmask;
646 rnp_up = rnp_up->parent;
647 if (rnp_up->expmask & mask)
648 break;
649 raw_spin_lock(&rnp_up->lock); /* irqs already off */
650 smp_mb__after_unlock_lock();
651 rnp_up->expmask |= mask;
652 raw_spin_unlock(&rnp_up->lock); /* irqs still off */
653 }
654 raw_spin_unlock_irqrestore(&rnp->lock, flags);
655}
656
657/*
658 * Snapshot the tasks blocking the newly started preemptible-RCU expedited
659 * grace period for the specified rcu_node structure, phase 2. If the
660 * leaf rcu_node structure has its ->expmask field set, check for tasks.
661 * If there are some, clear ->expmask and set ->exp_tasks accordingly,
662 * then initiate RCU priority boosting. Otherwise, clear ->expmask and
663 * invoke rcu_report_exp_rnp() to clear out the upper-level ->expmask bits,
664 * enabling rcu_read_unlock_special() to do the bit-clearing.
665 *
666 * Caller must hold sync_rcu_preempt_exp_mutex.
667 */
668static void
669sync_rcu_preempt_exp_init2(struct rcu_state *rsp, struct rcu_node *rnp)
670{
671 unsigned long flags;
672
673 raw_spin_lock_irqsave(&rnp->lock, flags);
674 smp_mb__after_unlock_lock();
675 if (!rnp->expmask) {
676 /* Phase 1 didn't do anything, so Phase 2 doesn't either. */
677 raw_spin_unlock_irqrestore(&rnp->lock, flags);
678 return;
679 }
680
681 /* Phase 1 is over. */
682 rnp->expmask = 0;
683
684 /*
685 * If there are still blocked tasks, set up ->exp_tasks so that
686 * rcu_read_unlock_special() will wake us and then boost them.
687 */
688 if (rcu_preempt_has_tasks(rnp)) {
687 rnp->exp_tasks = rnp->blkd_tasks.next; 689 rnp->exp_tasks = rnp->blkd_tasks.next;
688 rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ 690 rcu_initiate_boost(rnp, flags); /* releases rnp->lock */
689 must_wait = 1; 691 return;
690 } 692 }
691 if (!must_wait) 693
692 rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */ 694 /* No longer any blocked tasks, so undo bit setting. */
695 raw_spin_unlock_irqrestore(&rnp->lock, flags);
696 rcu_report_exp_rnp(rsp, rnp, false);
693} 697}
694 698
695/** 699/**
@@ -706,7 +710,6 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
706 */ 710 */
707void synchronize_rcu_expedited(void) 711void synchronize_rcu_expedited(void)
708{ 712{
709 unsigned long flags;
710 struct rcu_node *rnp; 713 struct rcu_node *rnp;
711 struct rcu_state *rsp = &rcu_preempt_state; 714 struct rcu_state *rsp = &rcu_preempt_state;
712 unsigned long snap; 715 unsigned long snap;
@@ -757,19 +760,16 @@ void synchronize_rcu_expedited(void)
757 /* force all RCU readers onto ->blkd_tasks lists. */ 760 /* force all RCU readers onto ->blkd_tasks lists. */
758 synchronize_sched_expedited(); 761 synchronize_sched_expedited();
759 762
760 /* Initialize ->expmask for all non-leaf rcu_node structures. */ 763 /*
761 rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { 764 * Snapshot current state of ->blkd_tasks lists into ->expmask.
762 raw_spin_lock_irqsave(&rnp->lock, flags); 765 * Phase 1 sets bits and phase 2 permits rcu_read_unlock_special()
763 smp_mb__after_unlock_lock(); 766 * to start clearing them. Doing this in one phase leads to
764 rnp->expmask = rnp->qsmaskinit; 767 * strange races between setting and clearing bits, so just say "no"!
765 raw_spin_unlock_irqrestore(&rnp->lock, flags); 768 */
766 } 769 rcu_for_each_leaf_node(rsp, rnp)
767 770 sync_rcu_preempt_exp_init1(rsp, rnp);
768 /* Snapshot current state of ->blkd_tasks lists. */
769 rcu_for_each_leaf_node(rsp, rnp) 771 rcu_for_each_leaf_node(rsp, rnp)
770 sync_rcu_preempt_exp_init(rsp, rnp); 772 sync_rcu_preempt_exp_init2(rsp, rnp);
771 if (NUM_RCU_NODES > 1)
772 sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp));
773 773
774 put_online_cpus(); 774 put_online_cpus();
775 775
@@ -859,8 +859,6 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
859 return 0; 859 return 0;
860} 860}
861 861
862#ifdef CONFIG_HOTPLUG_CPU
863
864/* 862/*
865 * Because there is no preemptible RCU, there can be no readers blocked. 863 * Because there is no preemptible RCU, there can be no readers blocked.
866 */ 864 */
@@ -869,8 +867,6 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
869 return false; 867 return false;
870} 868}
871 869
872#endif /* #ifdef CONFIG_HOTPLUG_CPU */
873
874/* 870/*
875 * Because preemptible RCU does not exist, we never have to check for 871 * Because preemptible RCU does not exist, we never have to check for
876 * tasks blocked within RCU read-side critical sections. 872 * tasks blocked within RCU read-side critical sections.
@@ -1170,7 +1166,7 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1170 * Returns zero if all is well, a negated errno otherwise. 1166 * Returns zero if all is well, a negated errno otherwise.
1171 */ 1167 */
1172static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, 1168static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1173 struct rcu_node *rnp) 1169 struct rcu_node *rnp)
1174{ 1170{
1175 int rnp_index = rnp - &rsp->node[0]; 1171 int rnp_index = rnp - &rsp->node[0];
1176 unsigned long flags; 1172 unsigned long flags;
@@ -1180,7 +1176,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1180 if (&rcu_preempt_state != rsp) 1176 if (&rcu_preempt_state != rsp)
1181 return 0; 1177 return 0;
1182 1178
1183 if (!rcu_scheduler_fully_active || rnp->qsmaskinit == 0) 1179 if (!rcu_scheduler_fully_active || rcu_rnp_online_cpus(rnp) == 0)
1184 return 0; 1180 return 0;
1185 1181
1186 rsp->boost = 1; 1182 rsp->boost = 1;
@@ -1273,7 +1269,7 @@ static void rcu_cpu_kthread(unsigned int cpu)
1273static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) 1269static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1274{ 1270{
1275 struct task_struct *t = rnp->boost_kthread_task; 1271 struct task_struct *t = rnp->boost_kthread_task;
1276 unsigned long mask = rnp->qsmaskinit; 1272 unsigned long mask = rcu_rnp_online_cpus(rnp);
1277 cpumask_var_t cm; 1273 cpumask_var_t cm;
1278 int cpu; 1274 int cpu;
1279 1275
@@ -1945,7 +1941,8 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
1945 rhp = ACCESS_ONCE(rdp->nocb_follower_head); 1941 rhp = ACCESS_ONCE(rdp->nocb_follower_head);
1946 1942
1947 /* Having no rcuo kthread but CBs after scheduler starts is bad! */ 1943 /* Having no rcuo kthread but CBs after scheduler starts is bad! */
1948 if (!ACCESS_ONCE(rdp->nocb_kthread) && rhp) { 1944 if (!ACCESS_ONCE(rdp->nocb_kthread) && rhp &&
1945 rcu_scheduler_fully_active) {
1949 /* RCU callback enqueued before CPU first came online??? */ 1946 /* RCU callback enqueued before CPU first came online??? */
1950 pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n", 1947 pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n",
1951 cpu, rhp->func); 1948 cpu, rhp->func);
@@ -2392,18 +2389,8 @@ void __init rcu_init_nohz(void)
2392 pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); 2389 pr_info("\tPoll for callbacks from no-CBs CPUs.\n");
2393 2390
2394 for_each_rcu_flavor(rsp) { 2391 for_each_rcu_flavor(rsp) {
2395 for_each_cpu(cpu, rcu_nocb_mask) { 2392 for_each_cpu(cpu, rcu_nocb_mask)
2396 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 2393 init_nocb_callback_list(per_cpu_ptr(rsp->rda, cpu));
2397
2398 /*
2399 * If there are early callbacks, they will need
2400 * to be moved to the nocb lists.
2401 */
2402 WARN_ON_ONCE(rdp->nxttail[RCU_NEXT_TAIL] !=
2403 &rdp->nxtlist &&
2404 rdp->nxttail[RCU_NEXT_TAIL] != NULL);
2405 init_nocb_callback_list(rdp);
2406 }
2407 rcu_organize_nocb_kthreads(rsp); 2394 rcu_organize_nocb_kthreads(rsp);
2408 } 2395 }
2409} 2396}
@@ -2540,6 +2527,16 @@ static bool init_nocb_callback_list(struct rcu_data *rdp)
2540 if (!rcu_is_nocb_cpu(rdp->cpu)) 2527 if (!rcu_is_nocb_cpu(rdp->cpu))
2541 return false; 2528 return false;
2542 2529
2530 /* If there are early-boot callbacks, move them to nocb lists. */
2531 if (rdp->nxtlist) {
2532 rdp->nocb_head = rdp->nxtlist;
2533 rdp->nocb_tail = rdp->nxttail[RCU_NEXT_TAIL];
2534 atomic_long_set(&rdp->nocb_q_count, rdp->qlen);
2535 atomic_long_set(&rdp->nocb_q_count_lazy, rdp->qlen_lazy);
2536 rdp->nxtlist = NULL;
2537 rdp->qlen = 0;
2538 rdp->qlen_lazy = 0;
2539 }
2543 rdp->nxttail[RCU_NEXT_TAIL] = NULL; 2540 rdp->nxttail[RCU_NEXT_TAIL] = NULL;
2544 return true; 2541 return true;
2545} 2542}
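
The new early-boot branch in init_nocb_callback_list() splices a conventional ->nxtlist (singly linked, tracked by a tail pointer-to-pointer) onto the no-CBs queue. A user-space sketch of that head/tail handoff; the structures are illustrative and the kernel's atomics, lazy counts, and locking are omitted.

#include <stdio.h>
#include <stddef.h>

struct cb {
	struct cb *next;
	int id;
};

struct queue {
	struct cb *head;
	struct cb **tail;	/* points at the last ->next pointer */
	long len;
};

/* Move everything from @src to @dst, leaving @src empty. */
static void splice_callbacks(struct queue *dst, struct queue *src)
{
	if (!src->head)
		return;
	*dst->tail = src->head;
	dst->tail = src->tail;
	dst->len += src->len;
	src->head = NULL;
	src->tail = &src->head;
	src->len = 0;
}

int main(void)
{
	struct cb a = { .id = 1 };
	struct cb b = { .id = 2, .next = NULL };
	struct queue boot = { .head = &a, .tail = &b.next, .len = 2 };
	struct queue nocb = { .head = NULL, .len = 0 };
	struct cb *p;

	a.next = &b;
	nocb.tail = &nocb.head;
	splice_callbacks(&nocb, &boot);
	for (p = nocb.head; p; p = p->next)
		printf("callback %d\n", p->id);
	return 0;
}
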
@@ -2763,7 +2760,8 @@ static void rcu_sysidle_exit(int irq)
2763 2760
2764/* 2761/*
2765 * Check to see if the current CPU is idle. Note that usermode execution 2762 * Check to see if the current CPU is idle. Note that usermode execution
2766 * does not count as idle. The caller must have disabled interrupts. 2763 * does not count as idle. The caller must have disabled interrupts,
2764 * and must be running on tick_do_timer_cpu.
2767 */ 2765 */
2768static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, 2766static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
2769 unsigned long *maxj) 2767 unsigned long *maxj)
@@ -2784,8 +2782,8 @@ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
2784 if (!*isidle || rdp->rsp != rcu_state_p || 2782 if (!*isidle || rdp->rsp != rcu_state_p ||
2785 cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu) 2783 cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu)
2786 return; 2784 return;
2787 if (rcu_gp_in_progress(rdp->rsp)) 2785 /* Verify affinity of current kthread. */
2788 WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu); 2786 WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu);
2789 2787
2790 /* Pick up current idle and NMI-nesting counter and check. */ 2788 /* Pick up current idle and NMI-nesting counter and check. */
2791 cur = atomic_read(&rdtp->dynticks_idle); 2789 cur = atomic_read(&rdtp->dynticks_idle);
@@ -3068,11 +3066,10 @@ static void rcu_bind_gp_kthread(void)
3068 return; 3066 return;
3069#ifdef CONFIG_NO_HZ_FULL_SYSIDLE 3067#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
3070 cpu = tick_do_timer_cpu; 3068 cpu = tick_do_timer_cpu;
3071 if (cpu >= 0 && cpu < nr_cpu_ids && raw_smp_processor_id() != cpu) 3069 if (cpu >= 0 && cpu < nr_cpu_ids)
3072 set_cpus_allowed_ptr(current, cpumask_of(cpu)); 3070 set_cpus_allowed_ptr(current, cpumask_of(cpu));
3073#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ 3071#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
3074 if (!is_housekeeping_cpu(raw_smp_processor_id())) 3072 housekeeping_affine(current);
3075 housekeeping_affine(current);
3076#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ 3073#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
3077} 3074}
3078 3075
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
index fbb6240509ea..f92361efd0f5 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -283,8 +283,8 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
283 seq_puts(m, "\n"); 283 seq_puts(m, "\n");
284 level = rnp->level; 284 level = rnp->level;
285 } 285 }
286 seq_printf(m, "%lx/%lx %c%c>%c %d:%d ^%d ", 286 seq_printf(m, "%lx/%lx->%lx %c%c>%c %d:%d ^%d ",
287 rnp->qsmask, rnp->qsmaskinit, 287 rnp->qsmask, rnp->qsmaskinit, rnp->qsmaskinitnext,
288 ".G"[rnp->gp_tasks != NULL], 288 ".G"[rnp->gp_tasks != NULL],
289 ".E"[rnp->exp_tasks != NULL], 289 ".E"[rnp->exp_tasks != NULL],
290 ".T"[!list_empty(&rnp->blkd_tasks)], 290 ".T"[!list_empty(&rnp->blkd_tasks)],
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index e0d31a345ee6..1f133350da01 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -62,6 +62,63 @@ MODULE_ALIAS("rcupdate");
62 62
63module_param(rcu_expedited, int, 0); 63module_param(rcu_expedited, int, 0);
64 64
65#ifndef CONFIG_TINY_RCU
66
67static atomic_t rcu_expedited_nesting =
68 ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 1 : 0);
69
70/*
71 * Should normal grace-period primitives be expedited? Intended for
72 * use within RCU. Note that this function takes the rcu_expedited
73 * sysfs/boot variable into account as well as the rcu_expedite_gp()
74 * nesting. So looping on rcu_unexpedite_gp() until rcu_gp_is_expedited()
75 * returns false is a -really- bad idea.
76 */
77bool rcu_gp_is_expedited(void)
78{
79 return rcu_expedited || atomic_read(&rcu_expedited_nesting);
80}
81EXPORT_SYMBOL_GPL(rcu_gp_is_expedited);
82
83/**
84 * rcu_expedite_gp - Expedite future RCU grace periods
85 *
86 * After a call to this function, future calls to synchronize_rcu() and
 87 * friends act as if the corresponding synchronize_rcu_expedited() function
88 * had instead been called.
89 */
90void rcu_expedite_gp(void)
91{
92 atomic_inc(&rcu_expedited_nesting);
93}
94EXPORT_SYMBOL_GPL(rcu_expedite_gp);
95
96/**
97 * rcu_unexpedite_gp - Cancel prior rcu_expedite_gp() invocation
98 *
99 * Undo a prior call to rcu_expedite_gp(). If all prior calls to
100 * rcu_expedite_gp() are undone by a subsequent call to rcu_unexpedite_gp(),
101 * and if the rcu_expedited sysfs/boot parameter is not set, then all
102 * subsequent calls to synchronize_rcu() and friends will return to
103 * their normal non-expedited behavior.
104 */
105void rcu_unexpedite_gp(void)
106{
107 atomic_dec(&rcu_expedited_nesting);
108}
109EXPORT_SYMBOL_GPL(rcu_unexpedite_gp);
110
111#endif /* #ifndef CONFIG_TINY_RCU */
112
113/*
114 * Inform RCU of the end of the in-kernel boot sequence.
115 */
116void rcu_end_inkernel_boot(void)
117{
118 if (IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT))
119 rcu_unexpedite_gp();
120}
121
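
Because rcu_expedite_gp()/rcu_unexpedite_gp() maintain a nesting count rather than a single boolean, independent users compose without clobbering each other. A hedged usage sketch; the reconfiguration path is hypothetical, while the two RCU calls are the ones added above and declared for this series in linux/rcupdate.h.

#include <linux/rcupdate.h>

/* Hypothetical latency-critical reconfiguration path. */
static void foo_fast_reconfigure(void)
{
	rcu_expedite_gp();	/* nestable: bumps rcu_expedited_nesting */

	synchronize_rcu();	/* behaves like synchronize_rcu_expedited() */

	rcu_unexpedite_gp();	/* undo our request; other holders still count */
}
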
65#ifdef CONFIG_PREEMPT_RCU 122#ifdef CONFIG_PREEMPT_RCU
66 123
67/* 124/*
@@ -199,16 +256,13 @@ EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
199 256
200#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 257#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
201 258
202struct rcu_synchronize { 259/**
203 struct rcu_head head; 260 * wakeme_after_rcu() - Callback function to awaken a task after grace period
204 struct completion completion; 261 * @head: Pointer to rcu_head member within rcu_synchronize structure
205}; 262 *
206 263 * Awaken the corresponding task now that a grace period has elapsed.
207/*
208 * Awaken the corresponding synchronize_rcu() instance now that a
209 * grace period has elapsed.
210 */ 264 */
211static void wakeme_after_rcu(struct rcu_head *head) 265void wakeme_after_rcu(struct rcu_head *head)
212{ 266{
213 struct rcu_synchronize *rcu; 267 struct rcu_synchronize *rcu;
214 268
diff --git a/kernel/reboot.c b/kernel/reboot.c
index 5925f5ae8dff..d20c85d9f8c0 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -387,8 +387,9 @@ void ctrl_alt_del(void)
387} 387}
388 388
389char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; 389char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
390static const char reboot_cmd[] = "/sbin/reboot";
390 391
391static int __orderly_poweroff(bool force) 392static int run_cmd(const char *cmd)
392{ 393{
393 char **argv; 394 char **argv;
394 static char *envp[] = { 395 static char *envp[] = {
@@ -397,8 +398,7 @@ static int __orderly_poweroff(bool force)
397 NULL 398 NULL
398 }; 399 };
399 int ret; 400 int ret;
400 401 argv = argv_split(GFP_KERNEL, cmd, NULL);
401 argv = argv_split(GFP_KERNEL, poweroff_cmd, NULL);
402 if (argv) { 402 if (argv) {
403 ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); 403 ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
404 argv_free(argv); 404 argv_free(argv);
@@ -406,8 +406,33 @@ static int __orderly_poweroff(bool force)
406 ret = -ENOMEM; 406 ret = -ENOMEM;
407 } 407 }
408 408
409 return ret;
410}
411
412static int __orderly_reboot(void)
413{
414 int ret;
415
416 ret = run_cmd(reboot_cmd);
417
418 if (ret) {
419 pr_warn("Failed to start orderly reboot: forcing the issue\n");
420 emergency_sync();
421 kernel_restart(NULL);
422 }
423
424 return ret;
425}
426
427static int __orderly_poweroff(bool force)
428{
429 int ret;
430
431 ret = run_cmd(poweroff_cmd);
432
409 if (ret && force) { 433 if (ret && force) {
410 pr_warn("Failed to start orderly shutdown: forcing the issue\n"); 434 pr_warn("Failed to start orderly shutdown: forcing the issue\n");
435
411 /* 436 /*
412 * I guess this should try to kick off some daemon to sync and 437 * I guess this should try to kick off some daemon to sync and
413 * poweroff asap. Or not even bother syncing if we're doing an 438 * poweroff asap. Or not even bother syncing if we're doing an
@@ -436,15 +461,33 @@ static DECLARE_WORK(poweroff_work, poweroff_work_func);
436 * This may be called from any context to trigger a system shutdown. 461 * This may be called from any context to trigger a system shutdown.
437 * If the orderly shutdown fails, it will force an immediate shutdown. 462 * If the orderly shutdown fails, it will force an immediate shutdown.
438 */ 463 */
439int orderly_poweroff(bool force) 464void orderly_poweroff(bool force)
440{ 465{
441 if (force) /* do not override the pending "true" */ 466 if (force) /* do not override the pending "true" */
442 poweroff_force = true; 467 poweroff_force = true;
443 schedule_work(&poweroff_work); 468 schedule_work(&poweroff_work);
444 return 0;
445} 469}
446EXPORT_SYMBOL_GPL(orderly_poweroff); 470EXPORT_SYMBOL_GPL(orderly_poweroff);
447 471
472static void reboot_work_func(struct work_struct *work)
473{
474 __orderly_reboot();
475}
476
477static DECLARE_WORK(reboot_work, reboot_work_func);
478
479/**
480 * orderly_reboot - Trigger an orderly system reboot
481 *
482 * This may be called from any context to trigger a system reboot.
483 * If the orderly reboot fails, it will force an immediate reboot.
484 */
485void orderly_reboot(void)
486{
487 schedule_work(&reboot_work);
488}
489EXPORT_SYMBOL_GPL(orderly_reboot);
490
448static int __init reboot_setup(char *str) 491static int __init reboot_setup(char *str)
449{ 492{
450 for (;;) { 493 for (;;) {
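
orderly_reboot() mirrors orderly_poweroff(): it schedules a workqueue item that runs /sbin/reboot via run_cmd() and falls back to emergency_sync()/kernel_restart() on failure, so callers need not sleep themselves. A hedged caller sketch; the watchdog handler is hypothetical, while the two exports are the ones shown in this diff (note that orderly_poweroff() now returns void).

#include <linux/reboot.h>

/* Hypothetical firmware-watchdog expiry handler. */
static void foo_watchdog_expired(bool critical)
{
	if (critical)
		orderly_poweroff(true);	/* force if userspace cannot help */
	else
		orderly_reboot();	/* userspace reboot, forced on failure */
}
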
diff --git a/kernel/resource.c b/kernel/resource.c
index 19f2357dfda3..90552aab5f2d 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1034,8 +1034,6 @@ resource_size_t resource_alignment(struct resource *res)
1034 * 1034 *
1035 * request_region creates a new busy region. 1035 * request_region creates a new busy region.
1036 * 1036 *
1037 * check_region returns non-zero if the area is already busy.
1038 *
1039 * release_region releases a matching busy region. 1037 * release_region releases a matching busy region.
1040 */ 1038 */
1041 1039
@@ -1098,36 +1096,6 @@ struct resource * __request_region(struct resource *parent,
1098EXPORT_SYMBOL(__request_region); 1096EXPORT_SYMBOL(__request_region);
1099 1097
1100/** 1098/**
1101 * __check_region - check if a resource region is busy or free
1102 * @parent: parent resource descriptor
1103 * @start: resource start address
1104 * @n: resource region size
1105 *
1106 * Returns 0 if the region is free at the moment it is checked,
1107 * returns %-EBUSY if the region is busy.
1108 *
1109 * NOTE:
1110 * This function is deprecated because its use is racy.
1111 * Even if it returns 0, a subsequent call to request_region()
1112 * may fail because another driver etc. just allocated the region.
1113 * Do NOT use it. It will be removed from the kernel.
1114 */
1115int __check_region(struct resource *parent, resource_size_t start,
1116 resource_size_t n)
1117{
1118 struct resource * res;
1119
1120 res = __request_region(parent, start, n, "check-region", 0);
1121 if (!res)
1122 return -EBUSY;
1123
1124 release_resource(res);
1125 free_resource(res);
1126 return 0;
1127}
1128EXPORT_SYMBOL(__check_region);
1129
1130/**
1131 * __release_region - release a previously reserved resource region 1099 * __release_region - release a previously reserved resource region
1132 * @parent: parent resource descriptor 1100 * @parent: parent resource descriptor
1133 * @start: resource start address 1101 * @start: resource start address
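
__check_region() is removed because check-then-request is inherently racy: another driver can claim the range between the check and the later request. The non-racy pattern is to request the region directly and key off the result. A hedged probe sketch using the long-standing request_region()/release_region() helpers; the device name and I/O addresses are made up.

#include <linux/ioport.h>
#include <linux/errno.h>

#define FOO_IO_BASE	0x300	/* hypothetical legacy I/O base */
#define FOO_IO_SIZE	8

static int foo_probe_ports(void)
{
	/* Claim the range atomically; no separate "is it busy?" step. */
	if (!request_region(FOO_IO_BASE, FOO_IO_SIZE, "foo"))
		return -EBUSY;

	/* ... talk to the hardware ... */

	release_region(FOO_IO_BASE, FOO_IO_SIZE);
	return 0;
}
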
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f0f831e8a345..f9123a82cbb6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -306,6 +306,9 @@ __read_mostly int scheduler_running;
306 */ 306 */
307int sysctl_sched_rt_runtime = 950000; 307int sysctl_sched_rt_runtime = 950000;
308 308
309/* cpus with isolated domains */
310cpumask_var_t cpu_isolated_map;
311
309/* 312/*
310 * this_rq_lock - lock this runqueue and disable interrupts. 313 * this_rq_lock - lock this runqueue and disable interrupts.
311 */ 314 */
@@ -690,6 +693,23 @@ static inline bool got_nohz_idle_kick(void)
690bool sched_can_stop_tick(void) 693bool sched_can_stop_tick(void)
691{ 694{
692 /* 695 /*
696 * FIFO realtime policy runs the highest priority task. Other runnable
697 * tasks are of a lower priority. The scheduler tick does nothing.
698 */
699 if (current->policy == SCHED_FIFO)
700 return true;
701
702 /*
703 * Round-robin realtime tasks time slice with other tasks at the same
704 * realtime priority. Is this task the only one at this priority?
705 */
706 if (current->policy == SCHED_RR) {
707 struct sched_rt_entity *rt_se = &current->rt;
708
709 return rt_se->run_list.prev == rt_se->run_list.next;
710 }
711
712 /*
 693         * More than one running task needs preemption.                  713
694 * nr_running update is assumed to be visible 714 * nr_running update is assumed to be visible
695 * after IPI is sent from wakers. 715 * after IPI is sent from wakers.
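
The SCHED_RR branch added to sched_can_stop_tick() asks whether the current task is the only entity queued at its priority: for a node on a circular doubly linked list, prev == next exactly when the only other element on the ring is the list head. A stand-alone sketch of that property; it uses a hand-rolled list rather than <linux/list.h>, purely for illustration.

#include <stdio.h>

struct list_node {
	struct list_node *prev, *next;
};

static void list_init(struct list_node *head)
{
	head->prev = head->next = head;
}

static void list_add_tail_node(struct list_node *head, struct list_node *n)
{
	n->prev = head->prev;
	n->next = head;
	head->prev->next = n;
	head->prev = n;
}

/* True when @n is the only entry on the ring besides the head. */
static int only_entry(const struct list_node *n)
{
	return n->prev == n->next;
}

int main(void)
{
	struct list_node head, a, b;

	list_init(&head);
	list_add_tail_node(&head, &a);
	printf("a alone: %d\n", only_entry(&a));	/* 1: prev == next == &head */
	list_add_tail_node(&head, &b);
	printf("a alone: %d\n", only_entry(&a));	/* 0 */
	return 0;
}
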
@@ -996,6 +1016,13 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
996 rq_clock_skip_update(rq, true); 1016 rq_clock_skip_update(rq, true);
997} 1017}
998 1018
1019static ATOMIC_NOTIFIER_HEAD(task_migration_notifier);
1020
1021void register_task_migration_notifier(struct notifier_block *n)
1022{
1023 atomic_notifier_chain_register(&task_migration_notifier, n);
1024}
1025
999#ifdef CONFIG_SMP 1026#ifdef CONFIG_SMP
1000void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 1027void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1001{ 1028{
@@ -1026,10 +1053,18 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1026 trace_sched_migrate_task(p, new_cpu); 1053 trace_sched_migrate_task(p, new_cpu);
1027 1054
1028 if (task_cpu(p) != new_cpu) { 1055 if (task_cpu(p) != new_cpu) {
1056 struct task_migration_notifier tmn;
1057
1029 if (p->sched_class->migrate_task_rq) 1058 if (p->sched_class->migrate_task_rq)
1030 p->sched_class->migrate_task_rq(p, new_cpu); 1059 p->sched_class->migrate_task_rq(p, new_cpu);
1031 p->se.nr_migrations++; 1060 p->se.nr_migrations++;
1032 perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0); 1061 perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
1062
1063 tmn.task = p;
1064 tmn.from_cpu = task_cpu(p);
1065 tmn.to_cpu = new_cpu;
1066
1067 atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn);
1033 } 1068 }
1034 1069
1035 __set_task_cpu(p, new_cpu); 1070 __set_task_cpu(p, new_cpu);
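
register_task_migration_notifier() and the task_migration_notifier payload fired from set_task_cpu() above give interested subsystems a hook at the moment a task changes CPUs. A hedged consumer sketch: the task/from_cpu/to_cpu fields are the ones introduced in this series, the notifier declaration is assumed to live in linux/sched.h alongside them, and the "foo" consumer itself is hypothetical.

#include <linux/sched.h>
#include <linux/notifier.h>
#include <linux/printk.h>
#include <linux/init.h>

static int foo_migration_cb(struct notifier_block *nb,
			    unsigned long unused, void *data)
{
	struct task_migration_notifier *tmn = data;

	pr_debug("task %d moving from CPU %d to CPU %d\n",
		 tmn->task->pid, tmn->from_cpu, tmn->to_cpu);
	return NOTIFY_DONE;
}

static struct notifier_block foo_migration_nb = {
	.notifier_call = foo_migration_cb,
};

static int __init foo_init(void)
{
	register_task_migration_notifier(&foo_migration_nb);
	return 0;
}
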
@@ -2818,7 +2853,7 @@ asmlinkage __visible void __sched schedule_user(void)
2818 * we find a better solution. 2853 * we find a better solution.
2819 * 2854 *
2820 * NB: There are buggy callers of this function. Ideally we 2855 * NB: There are buggy callers of this function. Ideally we
2821 * should warn if prev_state != IN_USER, but that will trigger 2856 * should warn if prev_state != CONTEXT_USER, but that will trigger
2822 * too frequently to make sense yet. 2857 * too frequently to make sense yet.
2823 */ 2858 */
2824 enum ctx_state prev_state = exception_enter(); 2859 enum ctx_state prev_state = exception_enter();
@@ -3034,6 +3069,8 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
3034 } else { 3069 } else {
3035 if (dl_prio(oldprio)) 3070 if (dl_prio(oldprio))
3036 p->dl.dl_boosted = 0; 3071 p->dl.dl_boosted = 0;
3072 if (rt_prio(oldprio))
3073 p->rt.timeout = 0;
3037 p->sched_class = &fair_sched_class; 3074 p->sched_class = &fair_sched_class;
3038 } 3075 }
3039 3076
@@ -5318,36 +5355,13 @@ static int sched_cpu_active(struct notifier_block *nfb,
5318static int sched_cpu_inactive(struct notifier_block *nfb, 5355static int sched_cpu_inactive(struct notifier_block *nfb,
5319 unsigned long action, void *hcpu) 5356 unsigned long action, void *hcpu)
5320{ 5357{
5321 unsigned long flags;
5322 long cpu = (long)hcpu;
5323 struct dl_bw *dl_b;
5324
5325 switch (action & ~CPU_TASKS_FROZEN) { 5358 switch (action & ~CPU_TASKS_FROZEN) {
5326 case CPU_DOWN_PREPARE: 5359 case CPU_DOWN_PREPARE:
5327 set_cpu_active(cpu, false); 5360 set_cpu_active((long)hcpu, false);
5328
5329 /* explicitly allow suspend */
5330 if (!(action & CPU_TASKS_FROZEN)) {
5331 bool overflow;
5332 int cpus;
5333
5334 rcu_read_lock_sched();
5335 dl_b = dl_bw_of(cpu);
5336
5337 raw_spin_lock_irqsave(&dl_b->lock, flags);
5338 cpus = dl_bw_cpus(cpu);
5339 overflow = __dl_overflow(dl_b, cpus, 0, 0);
5340 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
5341
5342 rcu_read_unlock_sched();
5343
5344 if (overflow)
5345 return notifier_from_errno(-EBUSY);
5346 }
5347 return NOTIFY_OK; 5361 return NOTIFY_OK;
5362 default:
5363 return NOTIFY_DONE;
5348 } 5364 }
5349
5350 return NOTIFY_DONE;
5351} 5365}
5352 5366
5353static int __init migration_init(void) 5367static int __init migration_init(void)
@@ -5428,17 +5442,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5428 break; 5442 break;
5429 } 5443 }
5430 5444
5431 /*
5432 * Even though we initialize ->capacity to something semi-sane,
5433 * we leave capacity_orig unset. This allows us to detect if
5434 * domain iteration is still funny without causing /0 traps.
5435 */
5436 if (!group->sgc->capacity_orig) {
5437 printk(KERN_CONT "\n");
5438 printk(KERN_ERR "ERROR: domain->cpu_capacity not set\n");
5439 break;
5440 }
5441
5442 if (!cpumask_weight(sched_group_cpus(group))) { 5445 if (!cpumask_weight(sched_group_cpus(group))) {
5443 printk(KERN_CONT "\n"); 5446 printk(KERN_CONT "\n");
5444 printk(KERN_ERR "ERROR: empty group\n"); 5447 printk(KERN_ERR "ERROR: empty group\n");
@@ -5811,9 +5814,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
5811 update_top_cache_domain(cpu); 5814 update_top_cache_domain(cpu);
5812} 5815}
5813 5816
5814/* cpus with isolated domains */
5815static cpumask_var_t cpu_isolated_map;
5816
5817/* Setup the mask of cpus configured for isolated domains */ 5817/* Setup the mask of cpus configured for isolated domains */
5818static int __init isolated_cpu_setup(char *str) 5818static int __init isolated_cpu_setup(char *str)
5819{ 5819{
@@ -5922,7 +5922,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5922 * die on a /0 trap. 5922 * die on a /0 trap.
5923 */ 5923 */
5924 sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); 5924 sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
5925 sg->sgc->capacity_orig = sg->sgc->capacity;
5926 5925
5927 /* 5926 /*
5928 * Make sure the first group of this domain contains the 5927 * Make sure the first group of this domain contains the
@@ -6233,6 +6232,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
6233 */ 6232 */
6234 6233
6235 if (sd->flags & SD_SHARE_CPUCAPACITY) { 6234 if (sd->flags & SD_SHARE_CPUCAPACITY) {
6235 sd->flags |= SD_PREFER_SIBLING;
6236 sd->imbalance_pct = 110; 6236 sd->imbalance_pct = 110;
6237 sd->smt_gain = 1178; /* ~15% */ 6237 sd->smt_gain = 1178; /* ~15% */
6238 6238
@@ -6998,7 +6998,6 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
6998 */ 6998 */
6999 6999
7000 case CPU_ONLINE: 7000 case CPU_ONLINE:
7001 case CPU_DOWN_FAILED:
7002 cpuset_update_active_cpus(true); 7001 cpuset_update_active_cpus(true);
7003 break; 7002 break;
7004 default: 7003 default:
@@ -7010,8 +7009,30 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
7010static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, 7009static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
7011 void *hcpu) 7010 void *hcpu)
7012{ 7011{
7013 switch (action) { 7012 unsigned long flags;
7013 long cpu = (long)hcpu;
7014 struct dl_bw *dl_b;
7015
7016 switch (action & ~CPU_TASKS_FROZEN) {
7014 case CPU_DOWN_PREPARE: 7017 case CPU_DOWN_PREPARE:
7018 /* explicitly allow suspend */
7019 if (!(action & CPU_TASKS_FROZEN)) {
7020 bool overflow;
7021 int cpus;
7022
7023 rcu_read_lock_sched();
7024 dl_b = dl_bw_of(cpu);
7025
7026 raw_spin_lock_irqsave(&dl_b->lock, flags);
7027 cpus = dl_bw_cpus(cpu);
7028 overflow = __dl_overflow(dl_b, cpus, 0, 0);
7029 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
7030
7031 rcu_read_unlock_sched();
7032
7033 if (overflow)
7034 return notifier_from_errno(-EBUSY);
7035 }
7015 cpuset_update_active_cpus(false); 7036 cpuset_update_active_cpus(false);
7016 break; 7037 break;
7017 case CPU_DOWN_PREPARE_FROZEN: 7038 case CPU_DOWN_PREPARE_FROZEN:
@@ -7156,8 +7177,8 @@ void __init sched_init(void)
7156 rq->calc_load_active = 0; 7177 rq->calc_load_active = 0;
7157 rq->calc_load_update = jiffies + LOAD_FREQ; 7178 rq->calc_load_update = jiffies + LOAD_FREQ;
7158 init_cfs_rq(&rq->cfs); 7179 init_cfs_rq(&rq->cfs);
7159 init_rt_rq(&rq->rt, rq); 7180 init_rt_rq(&rq->rt);
7160 init_dl_rq(&rq->dl, rq); 7181 init_dl_rq(&rq->dl);
7161#ifdef CONFIG_FAIR_GROUP_SCHED 7182#ifdef CONFIG_FAIR_GROUP_SCHED
7162 root_task_group.shares = ROOT_TASK_GROUP_LOAD; 7183 root_task_group.shares = ROOT_TASK_GROUP_LOAD;
7163 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 7184 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
@@ -7197,7 +7218,7 @@ void __init sched_init(void)
7197#ifdef CONFIG_SMP 7218#ifdef CONFIG_SMP
7198 rq->sd = NULL; 7219 rq->sd = NULL;
7199 rq->rd = NULL; 7220 rq->rd = NULL;
7200 rq->cpu_capacity = SCHED_CAPACITY_SCALE; 7221 rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
7201 rq->post_schedule = 0; 7222 rq->post_schedule = 0;
7202 rq->active_balance = 0; 7223 rq->active_balance = 0;
7203 rq->next_balance = jiffies; 7224 rq->next_balance = jiffies;
@@ -7796,7 +7817,7 @@ static int sched_rt_global_constraints(void)
7796} 7817}
7797#endif /* CONFIG_RT_GROUP_SCHED */ 7818#endif /* CONFIG_RT_GROUP_SCHED */
7798 7819
7799static int sched_dl_global_constraints(void) 7820static int sched_dl_global_validate(void)
7800{ 7821{
7801 u64 runtime = global_rt_runtime(); 7822 u64 runtime = global_rt_runtime();
7802 u64 period = global_rt_period(); 7823 u64 period = global_rt_period();
@@ -7897,11 +7918,11 @@ int sched_rt_handler(struct ctl_table *table, int write,
7897 if (ret) 7918 if (ret)
7898 goto undo; 7919 goto undo;
7899 7920
7900 ret = sched_rt_global_constraints(); 7921 ret = sched_dl_global_validate();
7901 if (ret) 7922 if (ret)
7902 goto undo; 7923 goto undo;
7903 7924
7904 ret = sched_dl_global_constraints(); 7925 ret = sched_rt_global_constraints();
7905 if (ret) 7926 if (ret)
7906 goto undo; 7927 goto undo;
7907 7928
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 3fa8fa6d9403..5e95145088fd 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -69,7 +69,7 @@ void init_dl_bw(struct dl_bw *dl_b)
69 dl_b->total_bw = 0; 69 dl_b->total_bw = 0;
70} 70}
71 71
72void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq) 72void init_dl_rq(struct dl_rq *dl_rq)
73{ 73{
74 dl_rq->rb_root = RB_ROOT; 74 dl_rq->rb_root = RB_ROOT;
75 75
@@ -218,6 +218,52 @@ static inline void set_post_schedule(struct rq *rq)
218 rq->post_schedule = has_pushable_dl_tasks(rq); 218 rq->post_schedule = has_pushable_dl_tasks(rq);
219} 219}
220 220
221static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq);
222
223static void dl_task_offline_migration(struct rq *rq, struct task_struct *p)
224{
225 struct rq *later_rq = NULL;
226 bool fallback = false;
227
228 later_rq = find_lock_later_rq(p, rq);
229
230 if (!later_rq) {
231 int cpu;
232
233 /*
234 * If we cannot preempt any rq, fall back to pick any
235 * online cpu.
236 */
237 fallback = true;
238 cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p));
239 if (cpu >= nr_cpu_ids) {
240 /*
241 * Fail to find any suitable cpu.
242 * The task will never come back!
243 */
244 BUG_ON(dl_bandwidth_enabled());
245
246 /*
247 * If admission control is disabled we
248 * try a little harder to let the task
249 * run.
250 */
251 cpu = cpumask_any(cpu_active_mask);
252 }
253 later_rq = cpu_rq(cpu);
254 double_lock_balance(rq, later_rq);
255 }
256
257 deactivate_task(rq, p, 0);
258 set_task_cpu(p, later_rq->cpu);
259 activate_task(later_rq, p, ENQUEUE_REPLENISH);
260
261 if (!fallback)
262 resched_curr(later_rq);
263
264 double_unlock_balance(rq, later_rq);
265}
266
221#else 267#else
222 268
223static inline 269static inline
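
dl_task_offline_migration(), added above, first tries find_lock_later_rq() and otherwise falls back to any active CPU in the task's affinity mask, then to any active CPU at all when admission control is disabled. The selection cascade alone, modelled with plain bitmasks in userspace (function and variable names are illustrative):

/* Hedged sketch of the fallback CPU selection in dl_task_offline_migration().
 * CPU masks are plain bitmasks; -1 stands for "no later rq could be locked". */
#include <stdio.h>

#define NR_CPUS 8

static int first_cpu_in(unsigned int mask)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		if (mask & (1u << cpu))
			return cpu;
	return NR_CPUS;			/* mirrors "cpu >= nr_cpu_ids" */
}

static int pick_dest_cpu(int later_cpu, unsigned int active, unsigned int allowed)
{
	int cpu;

	if (later_cpu >= 0)
		return later_cpu;	/* a later rq could be locked: use it */

	/* Fallback: any active CPU the task is allowed to run on. */
	cpu = first_cpu_in(active & allowed);
	if (cpu < NR_CPUS)
		return cpu;

	/* Last resort (admission control disabled): any active CPU. */
	return first_cpu_in(active);
}

int main(void)
{
	unsigned int active = 0x0F;	/* CPUs 0-3 online */

	printf("preferred rq found -> CPU %d\n", pick_dest_cpu(2, active, 0xFF));
	printf("affinity {4,5}     -> CPU %d\n", pick_dest_cpu(-1, active, 0x30));
	printf("affinity {1,3}     -> CPU %d\n", pick_dest_cpu(-1, active, 0x0A));
	return 0;
}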
@@ -514,7 +560,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
514 unsigned long flags; 560 unsigned long flags;
515 struct rq *rq; 561 struct rq *rq;
516 562
517 rq = task_rq_lock(current, &flags); 563 rq = task_rq_lock(p, &flags);
518 564
519 /* 565 /*
520 * We need to take care of several possible races here: 566 * We need to take care of several possible races here:
@@ -536,6 +582,17 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
536 sched_clock_tick(); 582 sched_clock_tick();
537 update_rq_clock(rq); 583 update_rq_clock(rq);
538 584
585#ifdef CONFIG_SMP
586 /*
587 * If we find that the rq the task was on is no longer
588 * available, we need to select a new rq.
589 */
590 if (unlikely(!rq->online)) {
591 dl_task_offline_migration(rq, p);
592 goto unlock;
593 }
594#endif
595
539 /* 596 /*
540 * If the throttle happened during sched-out; like: 597 * If the throttle happened during sched-out; like:
541 * 598 *
@@ -569,7 +626,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
569 push_dl_task(rq); 626 push_dl_task(rq);
570#endif 627#endif
571unlock: 628unlock:
572 task_rq_unlock(rq, current, &flags); 629 task_rq_unlock(rq, p, &flags);
573 630
574 return HRTIMER_NORESTART; 631 return HRTIMER_NORESTART;
575} 632}
@@ -914,6 +971,12 @@ static void yield_task_dl(struct rq *rq)
914 } 971 }
915 update_rq_clock(rq); 972 update_rq_clock(rq);
916 update_curr_dl(rq); 973 update_curr_dl(rq);
974 /*
975 * Tell update_rq_clock() that we've just updated,
976 * so we don't do microscopic update in schedule()
977 * and double the fastpath cost.
978 */
979 rq_clock_skip_update(rq, true);
917} 980}
918 981
919#ifdef CONFIG_SMP 982#ifdef CONFIG_SMP
@@ -1659,14 +1722,6 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
1659{ 1722{
1660 int check_resched = 1; 1723 int check_resched = 1;
1661 1724
1662 /*
1663 * If p is throttled, don't consider the possibility
1664 * of preempting rq->curr, the check will be done right
1665 * after its runtime will get replenished.
1666 */
1667 if (unlikely(p->dl.dl_throttled))
1668 return;
1669
1670 if (task_on_rq_queued(p) && rq->curr != p) { 1725 if (task_on_rq_queued(p) && rq->curr != p) {
1671#ifdef CONFIG_SMP 1726#ifdef CONFIG_SMP
1672 if (p->nr_cpus_allowed > 1 && rq->dl.overloaded && 1727 if (p->nr_cpus_allowed > 1 && rq->dl.overloaded &&
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 8baaf858d25c..a245c1fc6f0a 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -71,7 +71,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
71 if (!se) { 71 if (!se) {
72 struct sched_avg *avg = &cpu_rq(cpu)->avg; 72 struct sched_avg *avg = &cpu_rq(cpu)->avg;
73 P(avg->runnable_avg_sum); 73 P(avg->runnable_avg_sum);
74 P(avg->runnable_avg_period); 74 P(avg->avg_period);
75 return; 75 return;
76 } 76 }
77 77
@@ -94,8 +94,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
94 P(se->load.weight); 94 P(se->load.weight);
95#ifdef CONFIG_SMP 95#ifdef CONFIG_SMP
96 P(se->avg.runnable_avg_sum); 96 P(se->avg.runnable_avg_sum);
97 P(se->avg.runnable_avg_period); 97 P(se->avg.running_avg_sum);
98 P(se->avg.avg_period);
98 P(se->avg.load_avg_contrib); 99 P(se->avg.load_avg_contrib);
100 P(se->avg.utilization_avg_contrib);
99 P(se->avg.decay_count); 101 P(se->avg.decay_count);
100#endif 102#endif
101#undef PN 103#undef PN
@@ -214,6 +216,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
214 cfs_rq->runnable_load_avg); 216 cfs_rq->runnable_load_avg);
215 SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg", 217 SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg",
216 cfs_rq->blocked_load_avg); 218 cfs_rq->blocked_load_avg);
219 SEQ_printf(m, " .%-30s: %ld\n", "utilization_load_avg",
220 cfs_rq->utilization_load_avg);
217#ifdef CONFIG_FAIR_GROUP_SCHED 221#ifdef CONFIG_FAIR_GROUP_SCHED
218 SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib", 222 SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib",
219 cfs_rq->tg_load_contrib); 223 cfs_rq->tg_load_contrib);
@@ -636,8 +640,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
636 P(se.load.weight); 640 P(se.load.weight);
637#ifdef CONFIG_SMP 641#ifdef CONFIG_SMP
638 P(se.avg.runnable_avg_sum); 642 P(se.avg.runnable_avg_sum);
639 P(se.avg.runnable_avg_period); 643 P(se.avg.running_avg_sum);
644 P(se.avg.avg_period);
640 P(se.avg.load_avg_contrib); 645 P(se.avg.load_avg_contrib);
646 P(se.avg.utilization_avg_contrib);
641 P(se.avg.decay_count); 647 P(se.avg.decay_count);
642#endif 648#endif
643 P(policy); 649 P(policy);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7ce18f3c097a..ffeaa4105e48 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -670,6 +670,7 @@ static int select_idle_sibling(struct task_struct *p, int cpu);
670static unsigned long task_h_load(struct task_struct *p); 670static unsigned long task_h_load(struct task_struct *p);
671 671
672static inline void __update_task_entity_contrib(struct sched_entity *se); 672static inline void __update_task_entity_contrib(struct sched_entity *se);
673static inline void __update_task_entity_utilization(struct sched_entity *se);
673 674
674/* Give new task start runnable values to heavy its load in infant time */ 675/* Give new task start runnable values to heavy its load in infant time */
675void init_task_runnable_average(struct task_struct *p) 676void init_task_runnable_average(struct task_struct *p)
@@ -677,9 +678,10 @@ void init_task_runnable_average(struct task_struct *p)
677 u32 slice; 678 u32 slice;
678 679
679 slice = sched_slice(task_cfs_rq(p), &p->se) >> 10; 680 slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;
680 p->se.avg.runnable_avg_sum = slice; 681 p->se.avg.runnable_avg_sum = p->se.avg.running_avg_sum = slice;
681 p->se.avg.runnable_avg_period = slice; 682 p->se.avg.avg_period = slice;
682 __update_task_entity_contrib(&p->se); 683 __update_task_entity_contrib(&p->se);
684 __update_task_entity_utilization(&p->se);
683} 685}
684#else 686#else
685void init_task_runnable_average(struct task_struct *p) 687void init_task_runnable_average(struct task_struct *p)
@@ -1196,9 +1198,11 @@ static void task_numa_assign(struct task_numa_env *env,
1196static bool load_too_imbalanced(long src_load, long dst_load, 1198static bool load_too_imbalanced(long src_load, long dst_load,
1197 struct task_numa_env *env) 1199 struct task_numa_env *env)
1198{ 1200{
1199 long imb, old_imb;
1200 long orig_src_load, orig_dst_load;
1201 long src_capacity, dst_capacity; 1201 long src_capacity, dst_capacity;
1202 long orig_src_load;
1203 long load_a, load_b;
1204 long moved_load;
1205 long imb;
1202 1206
1203 /* 1207 /*
1204 * The load is corrected for the CPU capacity available on each node. 1208 * The load is corrected for the CPU capacity available on each node.
@@ -1211,30 +1215,39 @@ static bool load_too_imbalanced(long src_load, long dst_load,
1211 dst_capacity = env->dst_stats.compute_capacity; 1215 dst_capacity = env->dst_stats.compute_capacity;
1212 1216
1213 /* We care about the slope of the imbalance, not the direction. */ 1217 /* We care about the slope of the imbalance, not the direction. */
1214 if (dst_load < src_load) 1218 load_a = dst_load;
1215 swap(dst_load, src_load); 1219 load_b = src_load;
1220 if (load_a < load_b)
1221 swap(load_a, load_b);
1216 1222
1217 /* Is the difference below the threshold? */ 1223 /* Is the difference below the threshold? */
1218 imb = dst_load * src_capacity * 100 - 1224 imb = load_a * src_capacity * 100 -
1219 src_load * dst_capacity * env->imbalance_pct; 1225 load_b * dst_capacity * env->imbalance_pct;
1220 if (imb <= 0) 1226 if (imb <= 0)
1221 return false; 1227 return false;
1222 1228
1223 /* 1229 /*
1224 * The imbalance is above the allowed threshold. 1230 * The imbalance is above the allowed threshold.
1225 * Compare it with the old imbalance. 1231 * Allow a move that brings us closer to a balanced situation,
1232 * without moving things past the point of balance.
1226 */ 1233 */
1227 orig_src_load = env->src_stats.load; 1234 orig_src_load = env->src_stats.load;
1228 orig_dst_load = env->dst_stats.load;
1229 1235
1230 if (orig_dst_load < orig_src_load) 1236 /*
1231 swap(orig_dst_load, orig_src_load); 1237 * In a task swap, there will be one load moving from src to dst,
1232 1238 * and another moving back. This is the net sum of both moves.
1233 old_imb = orig_dst_load * src_capacity * 100 - 1239 * A simple task move will always have a positive value.
1234 orig_src_load * dst_capacity * env->imbalance_pct; 1240 * Allow the move if it brings the system closer to a balanced
1241 * situation, without crossing over the balance point.
1242 */
1243 moved_load = orig_src_load - src_load;
1235 1244
1236 /* Would this change make things worse? */ 1245 if (moved_load > 0)
1237 return (imb > old_imb); 1246 /* Moving src -> dst. Did we overshoot balance? */
1247 return src_load * dst_capacity < dst_load * src_capacity;
1248 else
1249 /* Moving dst -> src. Did we overshoot balance? */
1250 return dst_load * src_capacity < src_load * dst_capacity;
1238} 1251}
1239 1252
1240/* 1253/*
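
The rewritten load_too_imbalanced() keeps the capacity-scaled threshold test but replaces the comparison against the old imbalance with an overshoot check: a move is refused only when it would push the pair past the balance point. A standalone sketch of that arithmetic with made-up load and capacity numbers (125 mirrors a typical imbalance_pct, but the values are only illustrative):

/* Hedged sketch of the new load_too_imbalanced() decision. */
#include <stdio.h>
#include <stdbool.h>

static bool load_too_imbalanced(long src_load, long dst_load,
				long orig_src_load,
				long src_capacity, long dst_capacity,
				int imbalance_pct)
{
	long load_a = dst_load, load_b = src_load;
	long imb, moved_load;

	/* Slope of the imbalance, independent of direction. */
	if (load_a < load_b) {
		long tmp = load_a; load_a = load_b; load_b = tmp;
	}

	imb = load_a * src_capacity * 100 -
	      load_b * dst_capacity * imbalance_pct;
	if (imb <= 0)
		return false;		/* within the allowed threshold */

	/* Net load the proposed move takes away from the source node. */
	moved_load = orig_src_load - src_load;

	if (moved_load > 0)
		/* src -> dst: refuse only if we overshoot the balance point. */
		return src_load * dst_capacity < dst_load * src_capacity;
	else
		/* dst -> src: same check, mirrored. */
		return dst_load * src_capacity < src_load * dst_capacity;
}

int main(void)
{
	/* Equal capacities, 125% threshold (illustrative numbers). */
	printf("still imbalanced, same direction: %s\n",
	       load_too_imbalanced(1000, 600, 1200, 1024, 1024, 125) ? "veto" : "allow");
	printf("overshoots the balance point:     %s\n",
	       load_too_imbalanced(300, 1300, 1200, 1024, 1024, 125) ? "veto" : "allow");
	printf("difference within threshold:      %s\n",
	       load_too_imbalanced(500, 520, 520, 1024, 1024, 125) ? "veto" : "allow");
	return 0;
}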
@@ -1609,9 +1622,11 @@ static void update_task_scan_period(struct task_struct *p,
1609 /* 1622 /*
1610 * If there were no record hinting faults then either the task is 1623 * If there were no record hinting faults then either the task is
1611 * completely idle or all activity is areas that are not of interest 1624 * completely idle or all activity is areas that are not of interest
1612 * to automatic numa balancing. Scan slower 1625 * to automatic numa balancing. Related to that, if there were failed
1626 * migration then it implies we are migrating too quickly or the local
1627 * node is overloaded. In either case, scan slower
1613 */ 1628 */
1614 if (local + shared == 0) { 1629 if (local + shared == 0 || p->numa_faults_locality[2]) {
1615 p->numa_scan_period = min(p->numa_scan_period_max, 1630 p->numa_scan_period = min(p->numa_scan_period_max,
1616 p->numa_scan_period << 1); 1631 p->numa_scan_period << 1);
1617 1632
@@ -1673,7 +1688,7 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
1673 *period = now - p->last_task_numa_placement; 1688 *period = now - p->last_task_numa_placement;
1674 } else { 1689 } else {
1675 delta = p->se.avg.runnable_avg_sum; 1690 delta = p->se.avg.runnable_avg_sum;
1676 *period = p->se.avg.runnable_avg_period; 1691 *period = p->se.avg.avg_period;
1677 } 1692 }
1678 1693
1679 p->last_sum_exec_runtime = runtime; 1694 p->last_sum_exec_runtime = runtime;
@@ -1763,6 +1778,8 @@ static int preferred_group_nid(struct task_struct *p, int nid)
1763 } 1778 }
1764 } 1779 }
1765 /* Next round, evaluate the nodes within max_group. */ 1780 /* Next round, evaluate the nodes within max_group. */
1781 if (!max_faults)
1782 break;
1766 nodes = max_group; 1783 nodes = max_group;
1767 } 1784 }
1768 return nid; 1785 return nid;
@@ -2080,6 +2097,8 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
2080 2097
2081 if (migrated) 2098 if (migrated)
2082 p->numa_pages_migrated += pages; 2099 p->numa_pages_migrated += pages;
2100 if (flags & TNF_MIGRATE_FAIL)
2101 p->numa_faults_locality[2] += pages;
2083 2102
2084 p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages; 2103 p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
2085 p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages; 2104 p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
@@ -2161,8 +2180,10 @@ void task_numa_work(struct callback_head *work)
2161 vma = mm->mmap; 2180 vma = mm->mmap;
2162 } 2181 }
2163 for (; vma; vma = vma->vm_next) { 2182 for (; vma; vma = vma->vm_next) {
2164 if (!vma_migratable(vma) || !vma_policy_mof(vma)) 2183 if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
2184 is_vm_hugetlb_page(vma)) {
2165 continue; 2185 continue;
2186 }
2166 2187
2167 /* 2188 /*
2168 * Shared library pages mapped by multiple processes are not 2189 * Shared library pages mapped by multiple processes are not
@@ -2497,13 +2518,15 @@ static u32 __compute_runnable_contrib(u64 n)
2497 * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) 2518 * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
2498 * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] 2519 * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
2499 */ 2520 */
2500static __always_inline int __update_entity_runnable_avg(u64 now, 2521static __always_inline int __update_entity_runnable_avg(u64 now, int cpu,
2501 struct sched_avg *sa, 2522 struct sched_avg *sa,
2502 int runnable) 2523 int runnable,
2524 int running)
2503{ 2525{
2504 u64 delta, periods; 2526 u64 delta, periods;
2505 u32 runnable_contrib; 2527 u32 runnable_contrib;
2506 int delta_w, decayed = 0; 2528 int delta_w, decayed = 0;
2529 unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu);
2507 2530
2508 delta = now - sa->last_runnable_update; 2531 delta = now - sa->last_runnable_update;
2509 /* 2532 /*
@@ -2525,7 +2548,7 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
2525 sa->last_runnable_update = now; 2548 sa->last_runnable_update = now;
2526 2549
2527 /* delta_w is the amount already accumulated against our next period */ 2550 /* delta_w is the amount already accumulated against our next period */
2528 delta_w = sa->runnable_avg_period % 1024; 2551 delta_w = sa->avg_period % 1024;
2529 if (delta + delta_w >= 1024) { 2552 if (delta + delta_w >= 1024) {
2530 /* period roll-over */ 2553 /* period roll-over */
2531 decayed = 1; 2554 decayed = 1;
@@ -2538,7 +2561,10 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
2538 delta_w = 1024 - delta_w; 2561 delta_w = 1024 - delta_w;
2539 if (runnable) 2562 if (runnable)
2540 sa->runnable_avg_sum += delta_w; 2563 sa->runnable_avg_sum += delta_w;
2541 sa->runnable_avg_period += delta_w; 2564 if (running)
2565 sa->running_avg_sum += delta_w * scale_freq
2566 >> SCHED_CAPACITY_SHIFT;
2567 sa->avg_period += delta_w;
2542 2568
2543 delta -= delta_w; 2569 delta -= delta_w;
2544 2570
@@ -2548,20 +2574,28 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
2548 2574
2549 sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum, 2575 sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
2550 periods + 1); 2576 periods + 1);
2551 sa->runnable_avg_period = decay_load(sa->runnable_avg_period, 2577 sa->running_avg_sum = decay_load(sa->running_avg_sum,
2578 periods + 1);
2579 sa->avg_period = decay_load(sa->avg_period,
2552 periods + 1); 2580 periods + 1);
2553 2581
2554 /* Efficiently calculate \sum (1..n_period) 1024*y^i */ 2582 /* Efficiently calculate \sum (1..n_period) 1024*y^i */
2555 runnable_contrib = __compute_runnable_contrib(periods); 2583 runnable_contrib = __compute_runnable_contrib(periods);
2556 if (runnable) 2584 if (runnable)
2557 sa->runnable_avg_sum += runnable_contrib; 2585 sa->runnable_avg_sum += runnable_contrib;
2558 sa->runnable_avg_period += runnable_contrib; 2586 if (running)
2587 sa->running_avg_sum += runnable_contrib * scale_freq
2588 >> SCHED_CAPACITY_SHIFT;
2589 sa->avg_period += runnable_contrib;
2559 } 2590 }
2560 2591
2561 /* Remainder of delta accrued against u_0` */ 2592 /* Remainder of delta accrued against u_0` */
2562 if (runnable) 2593 if (runnable)
2563 sa->runnable_avg_sum += delta; 2594 sa->runnable_avg_sum += delta;
2564 sa->runnable_avg_period += delta; 2595 if (running)
2596 sa->running_avg_sum += delta * scale_freq
2597 >> SCHED_CAPACITY_SHIFT;
2598 sa->avg_period += delta;
2565 2599
2566 return decayed; 2600 return decayed;
2567} 2601}
@@ -2578,6 +2612,8 @@ static inline u64 __synchronize_entity_decay(struct sched_entity *se)
2578 return 0; 2612 return 0;
2579 2613
2580 se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); 2614 se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
2615 se->avg.utilization_avg_contrib =
2616 decay_load(se->avg.utilization_avg_contrib, decays);
2581 2617
2582 return decays; 2618 return decays;
2583} 2619}
@@ -2613,7 +2649,7 @@ static inline void __update_tg_runnable_avg(struct sched_avg *sa,
2613 2649
2614 /* The fraction of a cpu used by this cfs_rq */ 2650 /* The fraction of a cpu used by this cfs_rq */
2615 contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT, 2651 contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT,
2616 sa->runnable_avg_period + 1); 2652 sa->avg_period + 1);
2617 contrib -= cfs_rq->tg_runnable_contrib; 2653 contrib -= cfs_rq->tg_runnable_contrib;
2618 2654
2619 if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) { 2655 if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
@@ -2666,7 +2702,8 @@ static inline void __update_group_entity_contrib(struct sched_entity *se)
2666 2702
2667static inline void update_rq_runnable_avg(struct rq *rq, int runnable) 2703static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
2668{ 2704{
2669 __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable); 2705 __update_entity_runnable_avg(rq_clock_task(rq), cpu_of(rq), &rq->avg,
2706 runnable, runnable);
2670 __update_tg_runnable_avg(&rq->avg, &rq->cfs); 2707 __update_tg_runnable_avg(&rq->avg, &rq->cfs);
2671} 2708}
2672#else /* CONFIG_FAIR_GROUP_SCHED */ 2709#else /* CONFIG_FAIR_GROUP_SCHED */
@@ -2684,7 +2721,7 @@ static inline void __update_task_entity_contrib(struct sched_entity *se)
2684 2721
2685 /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ 2722 /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
2686 contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight); 2723 contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
2687 contrib /= (se->avg.runnable_avg_period + 1); 2724 contrib /= (se->avg.avg_period + 1);
2688 se->avg.load_avg_contrib = scale_load(contrib); 2725 se->avg.load_avg_contrib = scale_load(contrib);
2689} 2726}
2690 2727
@@ -2703,6 +2740,30 @@ static long __update_entity_load_avg_contrib(struct sched_entity *se)
2703 return se->avg.load_avg_contrib - old_contrib; 2740 return se->avg.load_avg_contrib - old_contrib;
2704} 2741}
2705 2742
2743
2744static inline void __update_task_entity_utilization(struct sched_entity *se)
2745{
2746 u32 contrib;
2747
2748 /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
2749 contrib = se->avg.running_avg_sum * scale_load_down(SCHED_LOAD_SCALE);
2750 contrib /= (se->avg.avg_period + 1);
2751 se->avg.utilization_avg_contrib = scale_load(contrib);
2752}
2753
2754static long __update_entity_utilization_avg_contrib(struct sched_entity *se)
2755{
2756 long old_contrib = se->avg.utilization_avg_contrib;
2757
2758 if (entity_is_task(se))
2759 __update_task_entity_utilization(se);
2760 else
2761 se->avg.utilization_avg_contrib =
2762 group_cfs_rq(se)->utilization_load_avg;
2763
2764 return se->avg.utilization_avg_contrib - old_contrib;
2765}
2766
2706static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq, 2767static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
2707 long load_contrib) 2768 long load_contrib)
2708{ 2769{
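
__update_task_entity_utilization() above converts the frequency-scaled running time into a contribution on the SCHED_LOAD_SCALE range, exactly as load_avg_contrib is derived from runnable_avg_sum. A tiny sketch of that division with illustrative sums (scale_load()/scale_load_down() are treated as no-ops here):

/* Hedged sketch: utilization contribution = running time / elapsed period,
 * expressed on the SCHED_LOAD_SCALE (1024) scale. */
#include <stdio.h>
#include <stdint.h>

#define SCHED_LOAD_SCALE 1024UL

static unsigned long utilization_avg_contrib(uint32_t running_avg_sum,
					     uint32_t avg_period)
{
	uint32_t contrib;

	/* same ordering as the kernel helper to stay within 32 bits */
	contrib = running_avg_sum * SCHED_LOAD_SCALE;
	contrib /= avg_period + 1;
	return contrib;
}

int main(void)
{
	/* A task running for ~23 ms out of ~47 ms of (decayed) history
	 * uses roughly half a CPU. */
	printf("utilization_avg_contrib = %lu / %lu\n",
	       utilization_avg_contrib(23 * 1024, 47 * 1024), SCHED_LOAD_SCALE);
	return 0;
}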
@@ -2719,7 +2780,8 @@ static inline void update_entity_load_avg(struct sched_entity *se,
2719 int update_cfs_rq) 2780 int update_cfs_rq)
2720{ 2781{
2721 struct cfs_rq *cfs_rq = cfs_rq_of(se); 2782 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2722 long contrib_delta; 2783 long contrib_delta, utilization_delta;
2784 int cpu = cpu_of(rq_of(cfs_rq));
2723 u64 now; 2785 u64 now;
2724 2786
2725 /* 2787 /*
@@ -2731,18 +2793,22 @@ static inline void update_entity_load_avg(struct sched_entity *se,
2731 else 2793 else
2732 now = cfs_rq_clock_task(group_cfs_rq(se)); 2794 now = cfs_rq_clock_task(group_cfs_rq(se));
2733 2795
2734 if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq)) 2796 if (!__update_entity_runnable_avg(now, cpu, &se->avg, se->on_rq,
2797 cfs_rq->curr == se))
2735 return; 2798 return;
2736 2799
2737 contrib_delta = __update_entity_load_avg_contrib(se); 2800 contrib_delta = __update_entity_load_avg_contrib(se);
2801 utilization_delta = __update_entity_utilization_avg_contrib(se);
2738 2802
2739 if (!update_cfs_rq) 2803 if (!update_cfs_rq)
2740 return; 2804 return;
2741 2805
2742 if (se->on_rq) 2806 if (se->on_rq) {
2743 cfs_rq->runnable_load_avg += contrib_delta; 2807 cfs_rq->runnable_load_avg += contrib_delta;
2744 else 2808 cfs_rq->utilization_load_avg += utilization_delta;
2809 } else {
2745 subtract_blocked_load_contrib(cfs_rq, -contrib_delta); 2810 subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
2811 }
2746} 2812}
2747 2813
2748/* 2814/*
@@ -2817,6 +2883,7 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
2817 } 2883 }
2818 2884
2819 cfs_rq->runnable_load_avg += se->avg.load_avg_contrib; 2885 cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
2886 cfs_rq->utilization_load_avg += se->avg.utilization_avg_contrib;
2820 /* we force update consideration on load-balancer moves */ 2887 /* we force update consideration on load-balancer moves */
2821 update_cfs_rq_blocked_load(cfs_rq, !wakeup); 2888 update_cfs_rq_blocked_load(cfs_rq, !wakeup);
2822} 2889}
@@ -2835,6 +2902,7 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
2835 update_cfs_rq_blocked_load(cfs_rq, !sleep); 2902 update_cfs_rq_blocked_load(cfs_rq, !sleep);
2836 2903
2837 cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib; 2904 cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
2905 cfs_rq->utilization_load_avg -= se->avg.utilization_avg_contrib;
2838 if (sleep) { 2906 if (sleep) {
2839 cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; 2907 cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
2840 se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); 2908 se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
@@ -3172,6 +3240,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
3172 */ 3240 */
3173 update_stats_wait_end(cfs_rq, se); 3241 update_stats_wait_end(cfs_rq, se);
3174 __dequeue_entity(cfs_rq, se); 3242 __dequeue_entity(cfs_rq, se);
3243 update_entity_load_avg(se, 1);
3175 } 3244 }
3176 3245
3177 update_stats_curr_start(cfs_rq, se); 3246 update_stats_curr_start(cfs_rq, se);
@@ -4298,6 +4367,11 @@ static unsigned long capacity_of(int cpu)
4298 return cpu_rq(cpu)->cpu_capacity; 4367 return cpu_rq(cpu)->cpu_capacity;
4299} 4368}
4300 4369
4370static unsigned long capacity_orig_of(int cpu)
4371{
4372 return cpu_rq(cpu)->cpu_capacity_orig;
4373}
4374
4301static unsigned long cpu_avg_load_per_task(int cpu) 4375static unsigned long cpu_avg_load_per_task(int cpu)
4302{ 4376{
4303 struct rq *rq = cpu_rq(cpu); 4377 struct rq *rq = cpu_rq(cpu);
@@ -4711,6 +4785,33 @@ next:
4711done: 4785done:
4712 return target; 4786 return target;
4713} 4787}
4788/*
4789 * get_cpu_usage returns the amount of capacity of a CPU that is used by CFS
4790 * tasks. The unit of the return value must be the one of capacity so we can
4791 * compare the usage with the capacity of the CPU that is available for CFS
4792 * task (ie cpu_capacity).
4793 * cfs.utilization_load_avg is the sum of running time of runnable tasks on a
4794 * CPU. It represents the amount of utilization of a CPU in the range
4795 * [0..SCHED_LOAD_SCALE]. The usage of a CPU can't be higher than the full
4796 * capacity of the CPU because it's about the running time on this CPU.
4797 * Nevertheless, cfs.utilization_load_avg can be higher than SCHED_LOAD_SCALE
4798 * because of unfortunate rounding in avg_period and running_load_avg or just
4799 * after migrating tasks until the average stabilizes with the new running
4800 * time. So we need to check that the usage stays into the range
4801 * [0..cpu_capacity_orig] and cap if necessary.
4802 * Without capping the usage, a group could be seen as overloaded (CPU0 usage
4803 * at 121% + CPU1 usage at 80%) whereas CPU1 has 20% of available capacity
4804 */
4805static int get_cpu_usage(int cpu)
4806{
4807 unsigned long usage = cpu_rq(cpu)->cfs.utilization_load_avg;
4808 unsigned long capacity = capacity_orig_of(cpu);
4809
4810 if (usage >= SCHED_LOAD_SCALE)
4811 return capacity;
4812
4813 return (usage * capacity) >> SCHED_LOAD_SHIFT;
4814}
4714 4815
4715/* 4816/*
4716 * select_task_rq_fair: Select target runqueue for the waking task in domains 4817 * select_task_rq_fair: Select target runqueue for the waking task in domains
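
get_cpu_usage(), added above, makes cfs.utilization_load_avg comparable with cpu_capacity by scaling it to the original capacity and clamping it, since rounding and fresh migrations can transiently push the raw sum past SCHED_LOAD_SCALE. The clamp in isolation, with an assumed 800-capacity CPU:

/* Hedged sketch of get_cpu_usage(): scale the CFS utilization sum to the
 * CPU's original capacity and cap it so transient values above 100% cannot
 * make a CPU look bigger than it is. */
#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL
#define SCHED_LOAD_SHIFT 10

static unsigned long get_cpu_usage(unsigned long utilization_load_avg,
				   unsigned long capacity_orig)
{
	if (utilization_load_avg >= SCHED_LOAD_SCALE)
		return capacity_orig;		/* cap at full capacity */

	return (utilization_load_avg * capacity_orig) >> SCHED_LOAD_SHIFT;
}

int main(void)
{
	unsigned long cap = 800;	/* e.g. a small CPU in an asymmetric system */

	printf("50%% utilized     -> usage %lu of %lu\n", get_cpu_usage(512, cap), cap);
	printf("121%% (transient) -> usage %lu of %lu\n", get_cpu_usage(1239, cap), cap);
	return 0;
}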
@@ -5837,12 +5938,12 @@ struct sg_lb_stats {
5837 unsigned long sum_weighted_load; /* Weighted load of group's tasks */ 5938 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
5838 unsigned long load_per_task; 5939 unsigned long load_per_task;
5839 unsigned long group_capacity; 5940 unsigned long group_capacity;
5941 unsigned long group_usage; /* Total usage of the group */
5840 unsigned int sum_nr_running; /* Nr tasks running in the group */ 5942 unsigned int sum_nr_running; /* Nr tasks running in the group */
5841 unsigned int group_capacity_factor;
5842 unsigned int idle_cpus; 5943 unsigned int idle_cpus;
5843 unsigned int group_weight; 5944 unsigned int group_weight;
5844 enum group_type group_type; 5945 enum group_type group_type;
5845 int group_has_free_capacity; 5946 int group_no_capacity;
5846#ifdef CONFIG_NUMA_BALANCING 5947#ifdef CONFIG_NUMA_BALANCING
5847 unsigned int nr_numa_running; 5948 unsigned int nr_numa_running;
5848 unsigned int nr_preferred_running; 5949 unsigned int nr_preferred_running;
@@ -5913,16 +6014,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
5913 return load_idx; 6014 return load_idx;
5914} 6015}
5915 6016
5916static unsigned long default_scale_capacity(struct sched_domain *sd, int cpu)
5917{
5918 return SCHED_CAPACITY_SCALE;
5919}
5920
5921unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
5922{
5923 return default_scale_capacity(sd, cpu);
5924}
5925
5926static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu) 6017static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu)
5927{ 6018{
5928 if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) 6019 if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))
@@ -5939,7 +6030,7 @@ unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
5939static unsigned long scale_rt_capacity(int cpu) 6030static unsigned long scale_rt_capacity(int cpu)
5940{ 6031{
5941 struct rq *rq = cpu_rq(cpu); 6032 struct rq *rq = cpu_rq(cpu);
5942 u64 total, available, age_stamp, avg; 6033 u64 total, used, age_stamp, avg;
5943 s64 delta; 6034 s64 delta;
5944 6035
5945 /* 6036 /*
@@ -5955,19 +6046,12 @@ static unsigned long scale_rt_capacity(int cpu)
5955 6046
5956 total = sched_avg_period() + delta; 6047 total = sched_avg_period() + delta;
5957 6048
5958 if (unlikely(total < avg)) { 6049 used = div_u64(avg, total);
5959 /* Ensures that capacity won't end up being negative */
5960 available = 0;
5961 } else {
5962 available = total - avg;
5963 }
5964
5965 if (unlikely((s64)total < SCHED_CAPACITY_SCALE))
5966 total = SCHED_CAPACITY_SCALE;
5967 6050
5968 total >>= SCHED_CAPACITY_SHIFT; 6051 if (likely(used < SCHED_CAPACITY_SCALE))
6052 return SCHED_CAPACITY_SCALE - used;
5969 6053
5970 return div_u64(available, total); 6054 return 1;
5971} 6055}
5972 6056
5973static void update_cpu_capacity(struct sched_domain *sd, int cpu) 6057static void update_cpu_capacity(struct sched_domain *sd, int cpu)
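
The simplified scale_rt_capacity() computes the share of the averaging window consumed by RT tasks and IRQs and subtracts it from full scale, flooring at 1 instead of the old clamp-to-zero path. A sketch under the simplifying assumption that raw times are passed in; the kernel keeps rq->rt_avg pre-scaled by capacity, so its division looks slightly different:

/* Hedged sketch of the simplified scale_rt_capacity(). */
#include <stdio.h>
#include <stdint.h>

#define SCHED_CAPACITY_SCALE 1024ULL

static unsigned long scale_rt_capacity(uint64_t rt_time_ns, uint64_t window_ns)
{
	uint64_t used = rt_time_ns * SCHED_CAPACITY_SCALE / window_ns;

	if (used < SCHED_CAPACITY_SCALE)
		return SCHED_CAPACITY_SCALE - used;

	return 1;		/* never report zero capacity */
}

int main(void)
{
	/* 250 ms of RT/IRQ time in a 1 s window leaves ~75% for CFS. */
	printf("cfs share = %lu/1024\n",
	       scale_rt_capacity(250000000ULL, 1000000000ULL));
	/* Pathological: RT time exceeds the window, capacity floors at 1. */
	printf("cfs share = %lu/1024\n",
	       scale_rt_capacity(1200000000ULL, 1000000000ULL));
	return 0;
}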
@@ -5982,14 +6066,7 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
5982 6066
5983 capacity >>= SCHED_CAPACITY_SHIFT; 6067 capacity >>= SCHED_CAPACITY_SHIFT;
5984 6068
5985 sdg->sgc->capacity_orig = capacity; 6069 cpu_rq(cpu)->cpu_capacity_orig = capacity;
5986
5987 if (sched_feat(ARCH_CAPACITY))
5988 capacity *= arch_scale_freq_capacity(sd, cpu);
5989 else
5990 capacity *= default_scale_capacity(sd, cpu);
5991
5992 capacity >>= SCHED_CAPACITY_SHIFT;
5993 6070
5994 capacity *= scale_rt_capacity(cpu); 6071 capacity *= scale_rt_capacity(cpu);
5995 capacity >>= SCHED_CAPACITY_SHIFT; 6072 capacity >>= SCHED_CAPACITY_SHIFT;
@@ -6005,7 +6082,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
6005{ 6082{
6006 struct sched_domain *child = sd->child; 6083 struct sched_domain *child = sd->child;
6007 struct sched_group *group, *sdg = sd->groups; 6084 struct sched_group *group, *sdg = sd->groups;
6008 unsigned long capacity, capacity_orig; 6085 unsigned long capacity;
6009 unsigned long interval; 6086 unsigned long interval;
6010 6087
6011 interval = msecs_to_jiffies(sd->balance_interval); 6088 interval = msecs_to_jiffies(sd->balance_interval);
@@ -6017,7 +6094,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
6017 return; 6094 return;
6018 } 6095 }
6019 6096
6020 capacity_orig = capacity = 0; 6097 capacity = 0;
6021 6098
6022 if (child->flags & SD_OVERLAP) { 6099 if (child->flags & SD_OVERLAP) {
6023 /* 6100 /*
@@ -6037,19 +6114,15 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
6037 * Use capacity_of(), which is set irrespective of domains 6114 * Use capacity_of(), which is set irrespective of domains
6038 * in update_cpu_capacity(). 6115 * in update_cpu_capacity().
6039 * 6116 *
6040 * This avoids capacity/capacity_orig from being 0 and 6117 * This avoids capacity from being 0 and
6041 * causing divide-by-zero issues on boot. 6118 * causing divide-by-zero issues on boot.
6042 *
6043 * Runtime updates will correct capacity_orig.
6044 */ 6119 */
6045 if (unlikely(!rq->sd)) { 6120 if (unlikely(!rq->sd)) {
6046 capacity_orig += capacity_of(cpu);
6047 capacity += capacity_of(cpu); 6121 capacity += capacity_of(cpu);
6048 continue; 6122 continue;
6049 } 6123 }
6050 6124
6051 sgc = rq->sd->groups->sgc; 6125 sgc = rq->sd->groups->sgc;
6052 capacity_orig += sgc->capacity_orig;
6053 capacity += sgc->capacity; 6126 capacity += sgc->capacity;
6054 } 6127 }
6055 } else { 6128 } else {
@@ -6060,39 +6133,24 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
6060 6133
6061 group = child->groups; 6134 group = child->groups;
6062 do { 6135 do {
6063 capacity_orig += group->sgc->capacity_orig;
6064 capacity += group->sgc->capacity; 6136 capacity += group->sgc->capacity;
6065 group = group->next; 6137 group = group->next;
6066 } while (group != child->groups); 6138 } while (group != child->groups);
6067 } 6139 }
6068 6140
6069 sdg->sgc->capacity_orig = capacity_orig;
6070 sdg->sgc->capacity = capacity; 6141 sdg->sgc->capacity = capacity;
6071} 6142}
6072 6143
6073/* 6144/*
6074 * Try and fix up capacity for tiny siblings, this is needed when 6145 * Check whether the capacity of the rq has been noticeably reduced by side
6075 * things like SD_ASYM_PACKING need f_b_g to select another sibling 6146 * activity. The imbalance_pct is used for the threshold.
6076 * which on its own isn't powerful enough. 6147 * Return true is the capacity is reduced
6077 *
6078 * See update_sd_pick_busiest() and check_asym_packing().
6079 */ 6148 */
6080static inline int 6149static inline int
6081fix_small_capacity(struct sched_domain *sd, struct sched_group *group) 6150check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
6082{ 6151{
6083 /* 6152 return ((rq->cpu_capacity * sd->imbalance_pct) <
6084 * Only siblings can have significantly less than SCHED_CAPACITY_SCALE 6153 (rq->cpu_capacity_orig * 100));
6085 */
6086 if (!(sd->flags & SD_SHARE_CPUCAPACITY))
6087 return 0;
6088
6089 /*
6090 * If ~90% of the cpu_capacity is still there, we're good.
6091 */
6092 if (group->sgc->capacity * 32 > group->sgc->capacity_orig * 29)
6093 return 1;
6094
6095 return 0;
6096} 6154}
6097 6155
6098/* 6156/*
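
check_cpu_capacity() replaces fix_small_capacity(): it flags a runqueue whose capacity left for CFS has dropped noticeably below cpu_capacity_orig, using the domain's imbalance_pct as the threshold. The predicate on its own (117 is a typical imbalance_pct, used here purely as an example):

/* Hedged sketch of check_cpu_capacity(): with imbalance_pct = 117 a CPU is
 * flagged once RT/IRQ pressure has eaten roughly 15% or more of its
 * original capacity. */
#include <stdio.h>
#include <stdbool.h>

static bool check_cpu_capacity(unsigned long cpu_capacity,
			       unsigned long cpu_capacity_orig,
			       unsigned int imbalance_pct)
{
	return cpu_capacity * imbalance_pct < cpu_capacity_orig * 100;
}

int main(void)
{
	printf("950/1024 left: %s\n",
	       check_cpu_capacity(950, 1024, 117) ? "reduced" : "fine");
	printf("700/1024 left: %s\n",
	       check_cpu_capacity(700, 1024, 117) ? "reduced" : "fine");
	return 0;
}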
@@ -6130,37 +6188,56 @@ static inline int sg_imbalanced(struct sched_group *group)
6130} 6188}
6131 6189
6132/* 6190/*
6133 * Compute the group capacity factor. 6191 * group_has_capacity returns true if the group has spare capacity that could
6134 * 6192 * be used by some tasks.
6135 * Avoid the issue where N*frac(smt_capacity) >= 1 creates 'phantom' cores by 6193 * We consider that a group has spare capacity if the * number of task is
6136 * first dividing out the smt factor and computing the actual number of cores 6194 * smaller than the number of CPUs or if the usage is lower than the available
6137 * and limit unit capacity with that. 6195 * capacity for CFS tasks.
6196 * For the latter, we use a threshold to stabilize the state, to take into
6197 * account the variance of the tasks' load and to return true if the available
6198 * capacity in meaningful for the load balancer.
6199 * As an example, an available capacity of 1% can appear but it doesn't make
6200 * any benefit for the load balance.
6138 */ 6201 */
6139static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *group) 6202static inline bool
6203group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
6140{ 6204{
6141 unsigned int capacity_factor, smt, cpus; 6205 if (sgs->sum_nr_running < sgs->group_weight)
6142 unsigned int capacity, capacity_orig; 6206 return true;
6143 6207
6144 capacity = group->sgc->capacity; 6208 if ((sgs->group_capacity * 100) >
6145 capacity_orig = group->sgc->capacity_orig; 6209 (sgs->group_usage * env->sd->imbalance_pct))
6146 cpus = group->group_weight; 6210 return true;
6211
6212 return false;
6213}
6147 6214
6148 /* smt := ceil(cpus / capacity), assumes: 1 < smt_capacity < 2 */ 6215/*
6149 smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, capacity_orig); 6216 * group_is_overloaded returns true if the group has more tasks than it can
6150 capacity_factor = cpus / smt; /* cores */ 6217 * handle.
6218 * group_is_overloaded is not equals to !group_has_capacity because a group
6219 * with the exact right number of tasks, has no more spare capacity but is not
6220 * overloaded so both group_has_capacity and group_is_overloaded return
6221 * false.
6222 */
6223static inline bool
6224group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
6225{
6226 if (sgs->sum_nr_running <= sgs->group_weight)
6227 return false;
6151 6228
6152 capacity_factor = min_t(unsigned, 6229 if ((sgs->group_capacity * 100) <
6153 capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE)); 6230 (sgs->group_usage * env->sd->imbalance_pct))
6154 if (!capacity_factor) 6231 return true;
6155 capacity_factor = fix_small_capacity(env->sd, group);
6156 6232
6157 return capacity_factor; 6233 return false;
6158} 6234}
6159 6235
6160static enum group_type 6236static enum group_type group_classify(struct lb_env *env,
6161group_classify(struct sched_group *group, struct sg_lb_stats *sgs) 6237 struct sched_group *group,
6238 struct sg_lb_stats *sgs)
6162{ 6239{
6163 if (sgs->sum_nr_running > sgs->group_capacity_factor) 6240 if (sgs->group_no_capacity)
6164 return group_overloaded; 6241 return group_overloaded;
6165 6242
6166 if (sg_imbalanced(group)) 6243 if (sg_imbalanced(group))
@@ -6198,6 +6275,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
6198 load = source_load(i, load_idx); 6275 load = source_load(i, load_idx);
6199 6276
6200 sgs->group_load += load; 6277 sgs->group_load += load;
6278 sgs->group_usage += get_cpu_usage(i);
6201 sgs->sum_nr_running += rq->cfs.h_nr_running; 6279 sgs->sum_nr_running += rq->cfs.h_nr_running;
6202 6280
6203 if (rq->nr_running > 1) 6281 if (rq->nr_running > 1)
@@ -6220,11 +6298,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,
6220 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; 6298 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
6221 6299
6222 sgs->group_weight = group->group_weight; 6300 sgs->group_weight = group->group_weight;
6223 sgs->group_capacity_factor = sg_capacity_factor(env, group);
6224 sgs->group_type = group_classify(group, sgs);
6225 6301
6226 if (sgs->group_capacity_factor > sgs->sum_nr_running) 6302 sgs->group_no_capacity = group_is_overloaded(env, sgs);
6227 sgs->group_has_free_capacity = 1; 6303 sgs->group_type = group_classify(env, group, sgs);
6228} 6304}
6229 6305
6230/** 6306/**
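
The two predicates introduced above replace group_capacity_factor: spare capacity means fewer tasks than CPUs or usage comfortably below capacity, while overload requires both more tasks than CPUs and usage above capacity, so a group with exactly the right number of fully busy tasks reports neither. A direct userspace transcription with sample numbers:

/* Hedged sketch of the two predicates that replace group_capacity_factor. */
#include <stdio.h>
#include <stdbool.h>

struct sg_stats {
	unsigned int sum_nr_running;
	unsigned int group_weight;	/* number of CPUs in the group */
	unsigned long group_capacity;	/* capacity left for CFS */
	unsigned long group_usage;	/* summed get_cpu_usage() */
};

static bool group_has_capacity(const struct sg_stats *sgs, unsigned int imb_pct)
{
	if (sgs->sum_nr_running < sgs->group_weight)
		return true;
	return sgs->group_capacity * 100 > sgs->group_usage * imb_pct;
}

static bool group_is_overloaded(const struct sg_stats *sgs, unsigned int imb_pct)
{
	if (sgs->sum_nr_running <= sgs->group_weight)
		return false;
	return sgs->group_capacity * 100 < sgs->group_usage * imb_pct;
}

int main(void)
{
	/* 2 CPUs, 2 busy tasks, usage right at capacity: neither predicate. */
	struct sg_stats just_right = { 2, 2, 2048, 2048 };
	/* 2 CPUs, 3 tasks, usage well above capacity: overloaded. */
	struct sg_stats busy = { 3, 2, 1600, 2000 };

	printf("just_right: has_capacity=%d overloaded=%d\n",
	       group_has_capacity(&just_right, 117),
	       group_is_overloaded(&just_right, 117));
	printf("busy:       has_capacity=%d overloaded=%d\n",
	       group_has_capacity(&busy, 117),
	       group_is_overloaded(&busy, 117));
	return 0;
}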
@@ -6346,18 +6422,19 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
6346 6422
6347 /* 6423 /*
6348 * In case the child domain prefers tasks go to siblings 6424 * In case the child domain prefers tasks go to siblings
6349 * first, lower the sg capacity factor to one so that we'll try 6425 * first, lower the sg capacity so that we'll try
6350 * and move all the excess tasks away. We lower the capacity 6426 * and move all the excess tasks away. We lower the capacity
6351 * of a group only if the local group has the capacity to fit 6427 * of a group only if the local group has the capacity to fit
6352 * these excess tasks, i.e. nr_running < group_capacity_factor. The 6428 * these excess tasks. The extra check prevents the case where
6353 * extra check prevents the case where you always pull from the 6429 * you always pull from the heaviest group when it is already
6354 * heaviest group when it is already under-utilized (possible 6430 * under-utilized (possible with a large weight task outweighs
6355 * with a large weight task outweighs the tasks on the system). 6431 * the tasks on the system).
6356 */ 6432 */
6357 if (prefer_sibling && sds->local && 6433 if (prefer_sibling && sds->local &&
6358 sds->local_stat.group_has_free_capacity) { 6434 group_has_capacity(env, &sds->local_stat) &&
6359 sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U); 6435 (sgs->sum_nr_running > 1)) {
6360 sgs->group_type = group_classify(sg, sgs); 6436 sgs->group_no_capacity = 1;
6437 sgs->group_type = group_overloaded;
6361 } 6438 }
6362 6439
6363 if (update_sd_pick_busiest(env, sds, sg, sgs)) { 6440 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
@@ -6537,11 +6614,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
6537 */ 6614 */
6538 if (busiest->group_type == group_overloaded && 6615 if (busiest->group_type == group_overloaded &&
6539 local->group_type == group_overloaded) { 6616 local->group_type == group_overloaded) {
6540 load_above_capacity = 6617 load_above_capacity = busiest->sum_nr_running *
6541 (busiest->sum_nr_running - busiest->group_capacity_factor); 6618 SCHED_LOAD_SCALE;
6542 6619 if (load_above_capacity > busiest->group_capacity)
6543 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_CAPACITY_SCALE); 6620 load_above_capacity -= busiest->group_capacity;
6544 load_above_capacity /= busiest->group_capacity; 6621 else
6622 load_above_capacity = ~0UL;
6545 } 6623 }
6546 6624
6547 /* 6625 /*
@@ -6604,6 +6682,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
6604 local = &sds.local_stat; 6682 local = &sds.local_stat;
6605 busiest = &sds.busiest_stat; 6683 busiest = &sds.busiest_stat;
6606 6684
6685 /* ASYM feature bypasses nice load balance check */
6607 if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && 6686 if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
6608 check_asym_packing(env, &sds)) 6687 check_asym_packing(env, &sds))
6609 return sds.busiest; 6688 return sds.busiest;
@@ -6624,8 +6703,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
6624 goto force_balance; 6703 goto force_balance;
6625 6704
6626 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ 6705 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
6627 if (env->idle == CPU_NEWLY_IDLE && local->group_has_free_capacity && 6706 if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) &&
6628 !busiest->group_has_free_capacity) 6707 busiest->group_no_capacity)
6629 goto force_balance; 6708 goto force_balance;
6630 6709
6631 /* 6710 /*
@@ -6684,7 +6763,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
6684 int i; 6763 int i;
6685 6764
6686 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { 6765 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
6687 unsigned long capacity, capacity_factor, wl; 6766 unsigned long capacity, wl;
6688 enum fbq_type rt; 6767 enum fbq_type rt;
6689 6768
6690 rq = cpu_rq(i); 6769 rq = cpu_rq(i);
@@ -6713,9 +6792,6 @@ static struct rq *find_busiest_queue(struct lb_env *env,
6713 continue; 6792 continue;
6714 6793
6715 capacity = capacity_of(i); 6794 capacity = capacity_of(i);
6716 capacity_factor = DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE);
6717 if (!capacity_factor)
6718 capacity_factor = fix_small_capacity(env->sd, group);
6719 6795
6720 wl = weighted_cpuload(i); 6796 wl = weighted_cpuload(i);
6721 6797
@@ -6723,7 +6799,9 @@ static struct rq *find_busiest_queue(struct lb_env *env,
6723 * When comparing with imbalance, use weighted_cpuload() 6799 * When comparing with imbalance, use weighted_cpuload()
6724 * which is not scaled with the cpu capacity. 6800 * which is not scaled with the cpu capacity.
6725 */ 6801 */
6726 if (capacity_factor && rq->nr_running == 1 && wl > env->imbalance) 6802
6803 if (rq->nr_running == 1 && wl > env->imbalance &&
6804 !check_cpu_capacity(rq, env->sd))
6727 continue; 6805 continue;
6728 6806
6729 /* 6807 /*
@@ -6771,6 +6849,19 @@ static int need_active_balance(struct lb_env *env)
6771 return 1; 6849 return 1;
6772 } 6850 }
6773 6851
6852 /*
6853 * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
6854 * It's worth migrating the task if the src_cpu's capacity is reduced
6855 * because of other sched_class or IRQs if more capacity stays
6856 * available on dst_cpu.
6857 */
6858 if ((env->idle != CPU_NOT_IDLE) &&
6859 (env->src_rq->cfs.h_nr_running == 1)) {
6860 if ((check_cpu_capacity(env->src_rq, sd)) &&
6861 (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
6862 return 1;
6863 }
6864
6774 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); 6865 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
6775} 6866}
6776 6867
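
need_active_balance() gains a case where an idle destination may pull the lone CFS task off a source CPU whose capacity is being eaten by RT or IRQ work, provided the destination really offers imbalance_pct percent more capacity. A sketch of that condition with assumed capacities (the helper name is illustrative):

/* Hedged sketch of the extra need_active_balance() case: an idle dst CPU may
 * steal the single CFS task of a src CPU whose capacity is visibly reduced,
 * as long as dst offers imbalance_pct percent more capacity than src. */
#include <stdio.h>
#include <stdbool.h>

static bool migrate_lone_task(bool dst_idle, unsigned int src_cfs_nr_running,
			      unsigned long src_capacity,
			      unsigned long src_capacity_orig,
			      unsigned long dst_capacity,
			      unsigned int imbalance_pct)
{
	if (!dst_idle || src_cfs_nr_running != 1)
		return false;

	/* src must be noticeably clipped by RT/IRQ pressure ... */
	if (src_capacity * imbalance_pct >= src_capacity_orig * 100)
		return false;

	/* ... and dst must actually offer meaningfully more room. */
	return src_capacity * imbalance_pct < dst_capacity * 100;
}

int main(void)
{
	/* src down to 600/1024 because of RT load, dst untouched at 1024. */
	printf("%s\n", migrate_lone_task(true, 1, 600, 1024, 1024, 117) ?
	       "active balance" : "leave it");
	/* dst equally clipped: not worth an active migration. */
	printf("%s\n", migrate_lone_task(true, 1, 600, 1024, 620, 117) ?
	       "active balance" : "leave it");
	return 0;
}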
@@ -6870,6 +6961,9 @@ redo:
6870 6961
6871 schedstat_add(sd, lb_imbalance[idle], env.imbalance); 6962 schedstat_add(sd, lb_imbalance[idle], env.imbalance);
6872 6963
6964 env.src_cpu = busiest->cpu;
6965 env.src_rq = busiest;
6966
6873 ld_moved = 0; 6967 ld_moved = 0;
6874 if (busiest->nr_running > 1) { 6968 if (busiest->nr_running > 1) {
6875 /* 6969 /*
@@ -6879,8 +6973,6 @@ redo:
6879 * correctly treated as an imbalance. 6973 * correctly treated as an imbalance.
6880 */ 6974 */
6881 env.flags |= LBF_ALL_PINNED; 6975 env.flags |= LBF_ALL_PINNED;
6882 env.src_cpu = busiest->cpu;
6883 env.src_rq = busiest;
6884 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); 6976 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
6885 6977
6886more_balance: 6978more_balance:
@@ -7580,22 +7672,25 @@ end:
7580 7672
7581/* 7673/*
7582 * Current heuristic for kicking the idle load balancer in the presence 7674 * Current heuristic for kicking the idle load balancer in the presence
7583 * of an idle cpu is the system. 7675 * of an idle cpu in the system.
7584 * - This rq has more than one task. 7676 * - This rq has more than one task.
7585 * - At any scheduler domain level, this cpu's scheduler group has multiple 7677 * - This rq has at least one CFS task and the capacity of the CPU is
7586 * busy cpu's exceeding the group's capacity. 7678 * significantly reduced because of RT tasks or IRQs.
7679 * - At parent of LLC scheduler domain level, this cpu's scheduler group has
7680 * multiple busy cpu.
7587 * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler 7681 * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
7588 * domain span are idle. 7682 * domain span are idle.
7589 */ 7683 */
7590static inline int nohz_kick_needed(struct rq *rq) 7684static inline bool nohz_kick_needed(struct rq *rq)
7591{ 7685{
7592 unsigned long now = jiffies; 7686 unsigned long now = jiffies;
7593 struct sched_domain *sd; 7687 struct sched_domain *sd;
7594 struct sched_group_capacity *sgc; 7688 struct sched_group_capacity *sgc;
7595 int nr_busy, cpu = rq->cpu; 7689 int nr_busy, cpu = rq->cpu;
7690 bool kick = false;
7596 7691
7597 if (unlikely(rq->idle_balance)) 7692 if (unlikely(rq->idle_balance))
7598 return 0; 7693 return false;
7599 7694
7600 /* 7695 /*
7601 * We may be recently in ticked or tickless idle mode. At the first 7696 * We may be recently in ticked or tickless idle mode. At the first
@@ -7609,38 +7704,46 @@ static inline int nohz_kick_needed(struct rq *rq)
7609 * balancing. 7704 * balancing.
7610 */ 7705 */
7611 if (likely(!atomic_read(&nohz.nr_cpus))) 7706 if (likely(!atomic_read(&nohz.nr_cpus)))
7612 return 0; 7707 return false;
7613 7708
7614 if (time_before(now, nohz.next_balance)) 7709 if (time_before(now, nohz.next_balance))
7615 return 0; 7710 return false;
7616 7711
7617 if (rq->nr_running >= 2) 7712 if (rq->nr_running >= 2)
7618 goto need_kick; 7713 return true;
7619 7714
7620 rcu_read_lock(); 7715 rcu_read_lock();
7621 sd = rcu_dereference(per_cpu(sd_busy, cpu)); 7716 sd = rcu_dereference(per_cpu(sd_busy, cpu));
7622
7623 if (sd) { 7717 if (sd) {
7624 sgc = sd->groups->sgc; 7718 sgc = sd->groups->sgc;
7625 nr_busy = atomic_read(&sgc->nr_busy_cpus); 7719 nr_busy = atomic_read(&sgc->nr_busy_cpus);
7626 7720
7627 if (nr_busy > 1) 7721 if (nr_busy > 1) {
7628 goto need_kick_unlock; 7722 kick = true;
7723 goto unlock;
7724 }
7725
7629 } 7726 }
7630 7727
7631 sd = rcu_dereference(per_cpu(sd_asym, cpu)); 7728 sd = rcu_dereference(rq->sd);
7729 if (sd) {
7730 if ((rq->cfs.h_nr_running >= 1) &&
7731 check_cpu_capacity(rq, sd)) {
7732 kick = true;
7733 goto unlock;
7734 }
7735 }
7632 7736
7737 sd = rcu_dereference(per_cpu(sd_asym, cpu));
7633 if (sd && (cpumask_first_and(nohz.idle_cpus_mask, 7738 if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
7634 sched_domain_span(sd)) < cpu)) 7739 sched_domain_span(sd)) < cpu)) {
7635 goto need_kick_unlock; 7740 kick = true;
7636 7741 goto unlock;
7637 rcu_read_unlock(); 7742 }
7638 return 0;
7639 7743
7640need_kick_unlock: 7744unlock:
7641 rcu_read_unlock(); 7745 rcu_read_unlock();
7642need_kick: 7746 return kick;
7643 return 1;
7644} 7747}
7645#else 7748#else
7646static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } 7749static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
@@ -7656,14 +7759,16 @@ static void run_rebalance_domains(struct softirq_action *h)
7656 enum cpu_idle_type idle = this_rq->idle_balance ? 7759 enum cpu_idle_type idle = this_rq->idle_balance ?
7657 CPU_IDLE : CPU_NOT_IDLE; 7760 CPU_IDLE : CPU_NOT_IDLE;
7658 7761
7659 rebalance_domains(this_rq, idle);
7660
7661 /* 7762 /*
7662 * If this cpu has a pending nohz_balance_kick, then do the 7763 * If this cpu has a pending nohz_balance_kick, then do the
7663 * balancing on behalf of the other idle cpus whose ticks are 7764 * balancing on behalf of the other idle cpus whose ticks are
7664 * stopped. 7765 * stopped. Do nohz_idle_balance *before* rebalance_domains to
7766 * give the idle cpus a chance to load balance. Else we may
7767 * load balance only within the local sched_domain hierarchy
7768 * and abort nohz_idle_balance altogether if we pull some load.
7665 */ 7769 */
7666 nohz_idle_balance(this_rq, idle); 7770 nohz_idle_balance(this_rq, idle);
7771 rebalance_domains(this_rq, idle);
7667} 7772}
7668 7773
7669/* 7774/*
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 90284d117fe6..91e33cd485f6 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -56,6 +56,19 @@ SCHED_FEAT(NONTASK_CAPACITY, true)
56 */ 56 */
57SCHED_FEAT(TTWU_QUEUE, true) 57SCHED_FEAT(TTWU_QUEUE, true)
58 58
59#ifdef HAVE_RT_PUSH_IPI
60/*
61 * In order to avoid a thundering herd attack of CPUs that are
62 * lowering their priorities at the same time, and there being
63 * a single CPU that has an RT task that can migrate and is waiting
64 * to run, where the other CPUs will try to take that CPUs
65 * rq lock and possibly create a large contention, sending an
66 * IPI to that CPU and let that CPU push the RT task to where
67 * it should go may be a better scenario.
68 */
69SCHED_FEAT(RT_PUSH_IPI, true)
70#endif
71
59SCHED_FEAT(FORCE_SD_OVERLAP, false) 72SCHED_FEAT(FORCE_SD_OVERLAP, false)
60SCHED_FEAT(RT_RUNTIME_SHARE, true) 73SCHED_FEAT(RT_RUNTIME_SHARE, true)
61SCHED_FEAT(LB_MIN, false) 74SCHED_FEAT(LB_MIN, false)
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 80014a178342..deef1caa94c6 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -158,8 +158,7 @@ static void cpuidle_idle_call(void)
158 * is used from another cpu as a broadcast timer, this call may 158 * is used from another cpu as a broadcast timer, this call may
159 * fail if it is not available 159 * fail if it is not available
160 */ 160 */
161 if (broadcast && 161 if (broadcast && tick_broadcast_enter())
162 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu))
163 goto use_default; 162 goto use_default;
164 163
165 /* Take note of the planned idle state. */ 164 /* Take note of the planned idle state. */
@@ -176,7 +175,7 @@ static void cpuidle_idle_call(void)
176 idle_set_state(this_rq(), NULL); 175 idle_set_state(this_rq(), NULL);
177 176
178 if (broadcast) 177 if (broadcast)
179 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); 178 tick_broadcast_exit();
180 179
181 /* 180 /*
182 * Give the governor an opportunity to reflect on the outcome 181 * Give the governor an opportunity to reflect on the outcome
@@ -210,6 +209,8 @@ use_default:
210 goto exit_idle; 209 goto exit_idle;
211} 210}
212 211
212DEFINE_PER_CPU(bool, cpu_dead_idle);
213
213/* 214/*
214 * Generic idle loop implementation 215 * Generic idle loop implementation
215 * 216 *
@@ -234,8 +235,13 @@ static void cpu_idle_loop(void)
234 check_pgt_cache(); 235 check_pgt_cache();
235 rmb(); 236 rmb();
236 237
237 if (cpu_is_offline(smp_processor_id())) 238 if (cpu_is_offline(smp_processor_id())) {
239 rcu_cpu_notify(NULL, CPU_DYING_IDLE,
240 (void *)(long)smp_processor_id());
241 smp_mb(); /* all activity before dead. */
242 this_cpu_write(cpu_dead_idle, true);
238 arch_cpu_idle_dead(); 243 arch_cpu_idle_dead();
244 }
239 245
240 local_irq_disable(); 246 local_irq_disable();
241 arch_cpu_idle_enter(); 247 arch_cpu_idle_enter();
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index f4d4b077eba0..575da76a3874 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -6,6 +6,7 @@
6#include "sched.h" 6#include "sched.h"
7 7
8#include <linux/slab.h> 8#include <linux/slab.h>
9#include <linux/irq_work.h>
9 10
10int sched_rr_timeslice = RR_TIMESLICE; 11int sched_rr_timeslice = RR_TIMESLICE;
11 12
@@ -59,7 +60,11 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
59 raw_spin_unlock(&rt_b->rt_runtime_lock); 60 raw_spin_unlock(&rt_b->rt_runtime_lock);
60} 61}
61 62
62void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) 63#ifdef CONFIG_SMP
64static void push_irq_work_func(struct irq_work *work);
65#endif
66
67void init_rt_rq(struct rt_rq *rt_rq)
63{ 68{
64 struct rt_prio_array *array; 69 struct rt_prio_array *array;
65 int i; 70 int i;
@@ -78,7 +83,14 @@ void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
78 rt_rq->rt_nr_migratory = 0; 83 rt_rq->rt_nr_migratory = 0;
79 rt_rq->overloaded = 0; 84 rt_rq->overloaded = 0;
80 plist_head_init(&rt_rq->pushable_tasks); 85 plist_head_init(&rt_rq->pushable_tasks);
86
87#ifdef HAVE_RT_PUSH_IPI
88 rt_rq->push_flags = 0;
89 rt_rq->push_cpu = nr_cpu_ids;
90 raw_spin_lock_init(&rt_rq->push_lock);
91 init_irq_work(&rt_rq->push_work, push_irq_work_func);
81#endif 92#endif
93#endif /* CONFIG_SMP */
82 /* We start is dequeued state, because no RT tasks are queued */ 94 /* We start is dequeued state, because no RT tasks are queued */
83 rt_rq->rt_queued = 0; 95 rt_rq->rt_queued = 0;
84 96
@@ -193,7 +205,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
193 if (!rt_se) 205 if (!rt_se)
194 goto err_free_rq; 206 goto err_free_rq;
195 207
196 init_rt_rq(rt_rq, cpu_rq(i)); 208 init_rt_rq(rt_rq);
197 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; 209 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
198 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); 210 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
199 } 211 }
@@ -1778,6 +1790,164 @@ static void push_rt_tasks(struct rq *rq)
1778 ; 1790 ;
1779} 1791}
1780 1792
1793#ifdef HAVE_RT_PUSH_IPI
1794/*
1795 * The search for the next cpu always starts at rq->cpu and ends
1796 * when we reach rq->cpu again. It will never return rq->cpu.
1797 * This returns the next cpu to check, or nr_cpu_ids if the loop
1798 * is complete.
1799 *
1800 * rq->rt.push_cpu holds the last cpu returned by this function,
1801 * or if this is the first instance, it must hold rq->cpu.
1802 */
1803static int rto_next_cpu(struct rq *rq)
1804{
1805 int prev_cpu = rq->rt.push_cpu;
1806 int cpu;
1807
1808 cpu = cpumask_next(prev_cpu, rq->rd->rto_mask);
1809
1810 /*
1811 * If the previous cpu is less than the rq's CPU, then it already
1812 * passed the end of the mask, and has started from the beginning.
1813 * We end if the next CPU is greater or equal to rq's CPU.
1814 */
1815 if (prev_cpu < rq->cpu) {
1816 if (cpu >= rq->cpu)
1817 return nr_cpu_ids;
1818
1819 } else if (cpu >= nr_cpu_ids) {
1820 /*
1821 * We passed the end of the mask, start at the beginning.
1822 * If the result is greater or equal to the rq's CPU, then
1823 * the loop is finished.
1824 */
1825 cpu = cpumask_first(rq->rd->rto_mask);
1826 if (cpu >= rq->cpu)
1827 return nr_cpu_ids;
1828 }
1829 rq->rt.push_cpu = cpu;
1830
1831 /* Return cpu to let the caller know if the loop is finished or not */
1832 return cpu;
1833}
1834
1835static int find_next_push_cpu(struct rq *rq)
1836{
1837 struct rq *next_rq;
1838 int cpu;
1839
1840 while (1) {
1841 cpu = rto_next_cpu(rq);
1842 if (cpu >= nr_cpu_ids)
1843 break;
1844 next_rq = cpu_rq(cpu);
1845
1846 /* Make sure the next rq can push to this rq */
1847 if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr)
1848 break;
1849 }
1850
1851 return cpu;
1852}
1853
1854#define RT_PUSH_IPI_EXECUTING 1
1855#define RT_PUSH_IPI_RESTART 2
1856
1857static void tell_cpu_to_push(struct rq *rq)
1858{
1859 int cpu;
1860
1861 if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
1862 raw_spin_lock(&rq->rt.push_lock);
1863 /* Make sure it's still executing */
1864 if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
1865 /*
1866 * Tell the IPI to restart the loop as things have
1867 * changed since it started.
1868 */
1869 rq->rt.push_flags |= RT_PUSH_IPI_RESTART;
1870 raw_spin_unlock(&rq->rt.push_lock);
1871 return;
1872 }
1873 raw_spin_unlock(&rq->rt.push_lock);
1874 }
1875
1876 /* When here, there's no IPI going around */
1877
1878 rq->rt.push_cpu = rq->cpu;
1879 cpu = find_next_push_cpu(rq);
1880 if (cpu >= nr_cpu_ids)
1881 return;
1882
1883 rq->rt.push_flags = RT_PUSH_IPI_EXECUTING;
1884
1885 irq_work_queue_on(&rq->rt.push_work, cpu);
1886}
1887
1888/* Called from hardirq context */
1889static void try_to_push_tasks(void *arg)
1890{
1891 struct rt_rq *rt_rq = arg;
1892 struct rq *rq, *src_rq;
1893 int this_cpu;
1894 int cpu;
1895
1896 this_cpu = rt_rq->push_cpu;
1897
1898 /* Paranoid check */
1899 BUG_ON(this_cpu != smp_processor_id());
1900
1901 rq = cpu_rq(this_cpu);
1902 src_rq = rq_of_rt_rq(rt_rq);
1903
1904again:
1905 if (has_pushable_tasks(rq)) {
1906 raw_spin_lock(&rq->lock);
1907 push_rt_task(rq);
1908 raw_spin_unlock(&rq->lock);
1909 }
1910
1911 /* Pass the IPI to the next rt overloaded queue */
1912 raw_spin_lock(&rt_rq->push_lock);
1913 /*
1914 * If the source queue changed since the IPI went out,
1915 * we need to restart the search from that CPU again.
1916 */
1917 if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) {
1918 rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART;
1919 rt_rq->push_cpu = src_rq->cpu;
1920 }
1921
1922 cpu = find_next_push_cpu(src_rq);
1923
1924 if (cpu >= nr_cpu_ids)
1925 rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING;
1926 raw_spin_unlock(&rt_rq->push_lock);
1927
1928 if (cpu >= nr_cpu_ids)
1929 return;
1930
1931 /*
1932 * It is possible that a restart caused this CPU to be
1933 * chosen again. Don't bother with an IPI, just see if we
1934 * have more to push.
1935 */
1936 if (unlikely(cpu == rq->cpu))
1937 goto again;
1938
1939 /* Try the next RT overloaded CPU */
1940 irq_work_queue_on(&rt_rq->push_work, cpu);
1941}
1942
1943static void push_irq_work_func(struct irq_work *work)
1944{
1945 struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work);
1946
1947 try_to_push_tasks(rt_rq);
1948}
1949#endif /* HAVE_RT_PUSH_IPI */
1950
1781static int pull_rt_task(struct rq *this_rq) 1951static int pull_rt_task(struct rq *this_rq)
1782{ 1952{
1783 int this_cpu = this_rq->cpu, ret = 0, cpu; 1953 int this_cpu = this_rq->cpu, ret = 0, cpu;
@@ -1793,6 +1963,13 @@ static int pull_rt_task(struct rq *this_rq)
1793 */ 1963 */
1794 smp_rmb(); 1964 smp_rmb();
1795 1965
1966#ifdef HAVE_RT_PUSH_IPI
1967 if (sched_feat(RT_PUSH_IPI)) {
1968 tell_cpu_to_push(this_rq);
1969 return 0;
1970 }
1971#endif
1972
1796 for_each_cpu(cpu, this_rq->rd->rto_mask) { 1973 for_each_cpu(cpu, this_rq->rd->rto_mask) {
1797 if (this_cpu == cpu) 1974 if (this_cpu == cpu)
1798 continue; 1975 continue;
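The RT_PUSH_IPI code added above replaces the lock-heavy pull loop: instead of this CPU taking every overloaded runqueue's lock, a single IPI is chased around the CPUs set in rd->rto_mask and each CPU pushes its own tasks. The heart of it is the circular walk in rto_next_cpu(), which starts after rq->cpu, wraps around once, and never returns rq->cpu itself. Below is a minimal userspace model of that walk; NR_CPUS, the bitmask encoding and the helper names are assumptions for illustration, not kernel code.

#include <stdio.h>

#define NR_CPUS 8

/* Return the first set bit >= start, or NR_CPUS if none. */
static int mask_next(unsigned int mask, int start)
{
    for (int cpu = start; cpu < NR_CPUS; cpu++)
        if (mask & (1u << cpu))
            return cpu;
    return NR_CPUS;
}

/*
 * Walk the overloaded-CPU mask starting after 'prev', wrapping once, and
 * stop before coming back to 'this_cpu' (mirrors rto_next_cpu() above).
 * Returns NR_CPUS when the loop is complete.
 */
static int rto_next(unsigned int rto_mask, int this_cpu, int prev)
{
    int cpu = mask_next(rto_mask, prev + 1);

    if (prev < this_cpu)              /* already wrapped around earlier */
        return cpu >= this_cpu ? NR_CPUS : cpu;

    if (cpu >= NR_CPUS) {             /* hit the end, wrap to the start */
        cpu = mask_next(rto_mask, 0);
        if (cpu >= this_cpu)
            return NR_CPUS;
    }
    return cpu;
}

int main(void)
{
    unsigned int rto_mask = 0xb2;     /* CPUs 1, 4, 5, 7 overloaded */
    int this_cpu = 4, cpu = this_cpu; /* push_cpu starts at rq->cpu */

    /* Visits 5, 7, then wraps to 1 and stops: never revisits CPU 4. */
    while ((cpu = rto_next(rto_mask, this_cpu, cpu)) < NR_CPUS)
        printf("IPI next overloaded CPU: %d\n", cpu);
    return 0;
}

In the kernel the same traversal state lives in rq->rt.push_cpu under push_lock, and the RT_PUSH_IPI_RESTART flag re-seeds the walk when the source queue's priorities change while the IPI is in flight.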
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index dc0f435a2779..e0e129993958 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -6,6 +6,7 @@
6#include <linux/mutex.h> 6#include <linux/mutex.h>
7#include <linux/spinlock.h> 7#include <linux/spinlock.h>
8#include <linux/stop_machine.h> 8#include <linux/stop_machine.h>
9#include <linux/irq_work.h>
9#include <linux/tick.h> 10#include <linux/tick.h>
10#include <linux/slab.h> 11#include <linux/slab.h>
11 12
@@ -362,8 +363,14 @@ struct cfs_rq {
362 * Under CFS, load is tracked on a per-entity basis and aggregated up. 363 * Under CFS, load is tracked on a per-entity basis and aggregated up.
363 * This allows for the description of both thread and group usage (in 364 * This allows for the description of both thread and group usage (in
364 * the FAIR_GROUP_SCHED case). 365 * the FAIR_GROUP_SCHED case).
366 * runnable_load_avg is the sum of the load_avg_contrib of the
367 * sched_entities on the rq.
 368 * blocked_load_avg is similar to runnable_load_avg, except that it
 369 * sums the load_avg_contrib of the blocked sched_entities on the rq.
370 * utilization_load_avg is the sum of the average running time of the
371 * sched_entities on the rq.
365 */ 372 */
366 unsigned long runnable_load_avg, blocked_load_avg; 373 unsigned long runnable_load_avg, blocked_load_avg, utilization_load_avg;
367 atomic64_t decay_counter; 374 atomic64_t decay_counter;
368 u64 last_decay; 375 u64 last_decay;
369 atomic_long_t removed_load; 376 atomic_long_t removed_load;
@@ -418,6 +425,11 @@ static inline int rt_bandwidth_enabled(void)
418 return sysctl_sched_rt_runtime >= 0; 425 return sysctl_sched_rt_runtime >= 0;
419} 426}
420 427
428/* RT IPI pull logic requires IRQ_WORK */
429#ifdef CONFIG_IRQ_WORK
430# define HAVE_RT_PUSH_IPI
431#endif
432
421/* Real-Time classes' related field in a runqueue: */ 433/* Real-Time classes' related field in a runqueue: */
422struct rt_rq { 434struct rt_rq {
423 struct rt_prio_array active; 435 struct rt_prio_array active;
@@ -435,7 +447,13 @@ struct rt_rq {
435 unsigned long rt_nr_total; 447 unsigned long rt_nr_total;
436 int overloaded; 448 int overloaded;
437 struct plist_head pushable_tasks; 449 struct plist_head pushable_tasks;
450#ifdef HAVE_RT_PUSH_IPI
451 int push_flags;
452 int push_cpu;
453 struct irq_work push_work;
454 raw_spinlock_t push_lock;
438#endif 455#endif
456#endif /* CONFIG_SMP */
439 int rt_queued; 457 int rt_queued;
440 458
441 int rt_throttled; 459 int rt_throttled;
@@ -597,6 +615,7 @@ struct rq {
597 struct sched_domain *sd; 615 struct sched_domain *sd;
598 616
599 unsigned long cpu_capacity; 617 unsigned long cpu_capacity;
618 unsigned long cpu_capacity_orig;
600 619
601 unsigned char idle_balance; 620 unsigned char idle_balance;
602 /* For active balancing */ 621 /* For active balancing */
@@ -807,7 +826,7 @@ struct sched_group_capacity {
807 * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity 826 * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity
808 * for a single CPU. 827 * for a single CPU.
809 */ 828 */
810 unsigned int capacity, capacity_orig; 829 unsigned int capacity;
811 unsigned long next_update; 830 unsigned long next_update;
812 int imbalance; /* XXX unrelated to capacity but shared group state */ 831 int imbalance; /* XXX unrelated to capacity but shared group state */
813 /* 832 /*
@@ -1368,9 +1387,18 @@ static inline int hrtick_enabled(struct rq *rq)
1368 1387
1369#ifdef CONFIG_SMP 1388#ifdef CONFIG_SMP
1370extern void sched_avg_update(struct rq *rq); 1389extern void sched_avg_update(struct rq *rq);
1390
1391#ifndef arch_scale_freq_capacity
1392static __always_inline
1393unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
1394{
1395 return SCHED_CAPACITY_SCALE;
1396}
1397#endif
1398
1371static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) 1399static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1372{ 1400{
1373 rq->rt_avg += rt_delta; 1401 rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq));
1374 sched_avg_update(rq); 1402 sched_avg_update(rq);
1375} 1403}
1376#else 1404#else
@@ -1643,8 +1671,8 @@ extern void print_rt_stats(struct seq_file *m, int cpu);
1643extern void print_dl_stats(struct seq_file *m, int cpu); 1671extern void print_dl_stats(struct seq_file *m, int cpu);
1644 1672
1645extern void init_cfs_rq(struct cfs_rq *cfs_rq); 1673extern void init_cfs_rq(struct cfs_rq *cfs_rq);
1646extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); 1674extern void init_rt_rq(struct rt_rq *rt_rq);
1647extern void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq); 1675extern void init_dl_rq(struct dl_rq *dl_rq);
1648 1676
1649extern void cfs_bandwidth_usage_inc(void); 1677extern void cfs_bandwidth_usage_inc(void);
1650extern void cfs_bandwidth_usage_dec(void); 1678extern void cfs_bandwidth_usage_dec(void);
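The arch_scale_freq_capacity() fallback and the reworked sched_rt_avg_update() above make RT utilization accounting frequency-invariant: the time an RT task ran is weighted by the CPU's current capacity, with SCHED_CAPACITY_SCALE meaning full speed. A rough standalone model of that weighting follows; the 50%-capacity hook and the 1ms delta are made-up values for illustration.

#include <stdio.h>
#include <stdint.h>

#define SCHED_CAPACITY_SCALE 1024UL   /* "full speed" reference */

/* Default arch hook: assume the CPU always runs at full capacity. */
static unsigned long scale_freq_capacity_default(void)
{
    return SCHED_CAPACITY_SCALE;
}

/* An arch reporting the CPU currently clocked at ~50% of maximum. */
static unsigned long scale_freq_capacity_half(void)
{
    return SCHED_CAPACITY_SCALE / 2;
}

/* Mirrors: rq->rt_avg += rt_delta * arch_scale_freq_capacity(...); */
static uint64_t account_rt(uint64_t rt_avg, uint64_t rt_delta,
                           unsigned long (*scale)(void))
{
    return rt_avg + rt_delta * scale();
}

int main(void)
{
    uint64_t delta_ns = 1000000;      /* 1ms of RT execution */

    printf("full speed contribution: %llu\n",
           (unsigned long long)account_rt(0, delta_ns,
                                          scale_freq_capacity_default));
    printf("half speed contribution: %llu\n",
           (unsigned long long)account_rt(0, delta_ns,
                                          scale_freq_capacity_half));
    return 0;
}

Consumers of rt_avg are then expected to normalize by SCHED_CAPACITY_SCALE, so time spent at half frequency contributes roughly half as much to the scale-invariant average.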
diff --git a/kernel/signal.c b/kernel/signal.c
index a390499943e4..d51c5ddd855c 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2992,11 +2992,9 @@ static int do_rt_sigqueueinfo(pid_t pid, int sig, siginfo_t *info)
2992 * Nor can they impersonate a kill()/tgkill(), which adds source info. 2992 * Nor can they impersonate a kill()/tgkill(), which adds source info.
2993 */ 2993 */
2994 if ((info->si_code >= 0 || info->si_code == SI_TKILL) && 2994 if ((info->si_code >= 0 || info->si_code == SI_TKILL) &&
2995 (task_pid_vnr(current) != pid)) { 2995 (task_pid_vnr(current) != pid))
2996 /* We used to allow any < 0 si_code */
2997 WARN_ON_ONCE(info->si_code < 0);
2998 return -EPERM; 2996 return -EPERM;
2999 } 2997
3000 info->si_signo = sig; 2998 info->si_signo = sig;
3001 2999
3002 /* POSIX.1b doesn't mention process groups. */ 3000 /* POSIX.1b doesn't mention process groups. */
@@ -3041,12 +3039,10 @@ static int do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
3041 /* Not even root can pretend to send signals from the kernel. 3039 /* Not even root can pretend to send signals from the kernel.
3042 * Nor can they impersonate a kill()/tgkill(), which adds source info. 3040 * Nor can they impersonate a kill()/tgkill(), which adds source info.
3043 */ 3041 */
3044 if (((info->si_code >= 0 || info->si_code == SI_TKILL)) && 3042 if ((info->si_code >= 0 || info->si_code == SI_TKILL) &&
3045 (task_pid_vnr(current) != pid)) { 3043 (task_pid_vnr(current) != pid))
3046 /* We used to allow any < 0 si_code */
3047 WARN_ON_ONCE(info->si_code < 0);
3048 return -EPERM; 3044 return -EPERM;
3049 } 3045
3050 info->si_signo = sig; 3046 info->si_signo = sig;
3051 3047
3052 return do_send_specific(tgid, pid, sig, info); 3048 return do_send_specific(tgid, pid, sig, info);
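Both sigqueueinfo hunks above drop an obsolete WARN_ON_ONCE() but keep the actual rule: a task may not send another task a siginfo whose si_code looks kernel-generated (>= 0) or claims to be SI_TKILL. A standalone version of that predicate is sketched here; the SI_TKILL value is the usual Linux ABI constant, assumed rather than taken from this diff.

#include <stdio.h>
#include <stdbool.h>

#define SI_TKILL (-6)   /* assumption: Linux ABI value used by tgkill() */

/* Returns true when the forged-siginfo check would yield -EPERM. */
static bool forged_siginfo(int si_code, int caller_pid, int target_pid)
{
    return (si_code >= 0 || si_code == SI_TKILL) &&
           caller_pid != target_pid;
}

int main(void)
{
    printf("%d\n", forged_siginfo(0, 100, 200));        /* 1: rejected */
    printf("%d\n", forged_siginfo(SI_TKILL, 100, 200)); /* 1: rejected */
    printf("%d\n", forged_siginfo(-1, 100, 200));       /* 0: allowed (SI_QUEUE etc.) */
    return 0;
}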
diff --git a/kernel/smp.c b/kernel/smp.c
index f38a1e692259..07854477c164 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -19,7 +19,7 @@
19 19
20enum { 20enum {
21 CSD_FLAG_LOCK = 0x01, 21 CSD_FLAG_LOCK = 0x01,
22 CSD_FLAG_WAIT = 0x02, 22 CSD_FLAG_SYNCHRONOUS = 0x02,
23}; 23};
24 24
25struct call_function_data { 25struct call_function_data {
@@ -107,7 +107,7 @@ void __init call_function_init(void)
107 */ 107 */
108static void csd_lock_wait(struct call_single_data *csd) 108static void csd_lock_wait(struct call_single_data *csd)
109{ 109{
110 while (csd->flags & CSD_FLAG_LOCK) 110 while (smp_load_acquire(&csd->flags) & CSD_FLAG_LOCK)
111 cpu_relax(); 111 cpu_relax();
112} 112}
113 113
@@ -121,19 +121,17 @@ static void csd_lock(struct call_single_data *csd)
121 * to ->flags with any subsequent assignments to other 121 * to ->flags with any subsequent assignments to other
122 * fields of the specified call_single_data structure: 122 * fields of the specified call_single_data structure:
123 */ 123 */
124 smp_mb(); 124 smp_wmb();
125} 125}
126 126
127static void csd_unlock(struct call_single_data *csd) 127static void csd_unlock(struct call_single_data *csd)
128{ 128{
129 WARN_ON((csd->flags & CSD_FLAG_WAIT) && !(csd->flags & CSD_FLAG_LOCK)); 129 WARN_ON(!(csd->flags & CSD_FLAG_LOCK));
130 130
131 /* 131 /*
132 * ensure we're all done before releasing data: 132 * ensure we're all done before releasing data:
133 */ 133 */
134 smp_mb(); 134 smp_store_release(&csd->flags, 0);
135
136 csd->flags &= ~CSD_FLAG_LOCK;
137} 135}
138 136
139static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data); 137static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data);
@@ -144,13 +142,16 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data);
144 * ->func, ->info, and ->flags set. 142 * ->func, ->info, and ->flags set.
145 */ 143 */
146static int generic_exec_single(int cpu, struct call_single_data *csd, 144static int generic_exec_single(int cpu, struct call_single_data *csd,
147 smp_call_func_t func, void *info, int wait) 145 smp_call_func_t func, void *info)
148{ 146{
149 struct call_single_data csd_stack = { .flags = 0 };
150 unsigned long flags;
151
152
153 if (cpu == smp_processor_id()) { 147 if (cpu == smp_processor_id()) {
148 unsigned long flags;
149
150 /*
151 * We can unlock early even for the synchronous on-stack case,
152 * since we're doing this from the same CPU..
153 */
154 csd_unlock(csd);
154 local_irq_save(flags); 155 local_irq_save(flags);
155 func(info); 156 func(info);
156 local_irq_restore(flags); 157 local_irq_restore(flags);
@@ -158,24 +159,14 @@ static int generic_exec_single(int cpu, struct call_single_data *csd,
158 } 159 }
159 160
160 161
161 if ((unsigned)cpu >= nr_cpu_ids || !cpu_online(cpu)) 162 if ((unsigned)cpu >= nr_cpu_ids || !cpu_online(cpu)) {
163 csd_unlock(csd);
162 return -ENXIO; 164 return -ENXIO;
163
164
165 if (!csd) {
166 csd = &csd_stack;
167 if (!wait)
168 csd = this_cpu_ptr(&csd_data);
169 } 165 }
170 166
171 csd_lock(csd);
172
173 csd->func = func; 167 csd->func = func;
174 csd->info = info; 168 csd->info = info;
175 169
176 if (wait)
177 csd->flags |= CSD_FLAG_WAIT;
178
179 /* 170 /*
180 * The list addition should be visible before sending the IPI 171 * The list addition should be visible before sending the IPI
181 * handler locks the list to pull the entry off it because of 172 * handler locks the list to pull the entry off it because of
@@ -190,9 +181,6 @@ static int generic_exec_single(int cpu, struct call_single_data *csd,
190 if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu))) 181 if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)))
191 arch_send_call_function_single_ipi(cpu); 182 arch_send_call_function_single_ipi(cpu);
192 183
193 if (wait)
194 csd_lock_wait(csd);
195
196 return 0; 184 return 0;
197} 185}
198 186
@@ -250,8 +238,17 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
250 } 238 }
251 239
252 llist_for_each_entry_safe(csd, csd_next, entry, llist) { 240 llist_for_each_entry_safe(csd, csd_next, entry, llist) {
253 csd->func(csd->info); 241 smp_call_func_t func = csd->func;
254 csd_unlock(csd); 242 void *info = csd->info;
243
244 /* Do we wait until *after* callback? */
245 if (csd->flags & CSD_FLAG_SYNCHRONOUS) {
246 func(info);
247 csd_unlock(csd);
248 } else {
249 csd_unlock(csd);
250 func(info);
251 }
255 } 252 }
256 253
257 /* 254 /*
@@ -274,6 +271,8 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
274int smp_call_function_single(int cpu, smp_call_func_t func, void *info, 271int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
275 int wait) 272 int wait)
276{ 273{
274 struct call_single_data *csd;
275 struct call_single_data csd_stack = { .flags = CSD_FLAG_LOCK | CSD_FLAG_SYNCHRONOUS };
277 int this_cpu; 276 int this_cpu;
278 int err; 277 int err;
279 278
@@ -292,7 +291,16 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
292 WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() 291 WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
293 && !oops_in_progress); 292 && !oops_in_progress);
294 293
295 err = generic_exec_single(cpu, NULL, func, info, wait); 294 csd = &csd_stack;
295 if (!wait) {
296 csd = this_cpu_ptr(&csd_data);
297 csd_lock(csd);
298 }
299
300 err = generic_exec_single(cpu, csd, func, info);
301
302 if (wait)
303 csd_lock_wait(csd);
296 304
297 put_cpu(); 305 put_cpu();
298 306
@@ -321,7 +329,15 @@ int smp_call_function_single_async(int cpu, struct call_single_data *csd)
321 int err = 0; 329 int err = 0;
322 330
323 preempt_disable(); 331 preempt_disable();
324 err = generic_exec_single(cpu, csd, csd->func, csd->info, 0); 332
333 /* We could deadlock if we have to wait here with interrupts disabled! */
334 if (WARN_ON_ONCE(csd->flags & CSD_FLAG_LOCK))
335 csd_lock_wait(csd);
336
337 csd->flags = CSD_FLAG_LOCK;
338 smp_wmb();
339
340 err = generic_exec_single(cpu, csd, csd->func, csd->info);
325 preempt_enable(); 341 preempt_enable();
326 342
327 return err; 343 return err;
@@ -433,6 +449,8 @@ void smp_call_function_many(const struct cpumask *mask,
433 struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu); 449 struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu);
434 450
435 csd_lock(csd); 451 csd_lock(csd);
452 if (wait)
453 csd->flags |= CSD_FLAG_SYNCHRONOUS;
436 csd->func = func; 454 csd->func = func;
437 csd->info = info; 455 csd->info = info;
438 llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)); 456 llist_add(&csd->llist, &per_cpu(call_single_queue, cpu));
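The smp.c changes above replace CSD_FLAG_WAIT with CSD_FLAG_SYNCHRONOUS and turn the csd lock handoff into an acquire/release pair, so a caller spinning in csd_lock_wait() also observes everything the remote callback wrote. The sketch below is a simplified userspace model using C11 atomics; the cut-down struct and the demo callback are assumptions, not the kernel's call_single_data.

#include <stdatomic.h>
#include <stdio.h>

#define CSD_FLAG_LOCK        0x01
#define CSD_FLAG_SYNCHRONOUS 0x02

struct csd {
    atomic_uint flags;
    void (*func)(void *);
    void *info;
};

static void csd_lock_wait(struct csd *csd)
{
    /* Pairs with the release store in csd_unlock(). */
    while (atomic_load_explicit(&csd->flags, memory_order_acquire) &
           CSD_FLAG_LOCK)
        ;               /* cpu_relax() in the kernel */
}

static void csd_unlock(struct csd *csd)
{
    /* Publish all prior writes before dropping the lock bit. */
    atomic_store_explicit(&csd->flags, 0, memory_order_release);
}

/* What the receiving CPU does per queued csd (see the new flush loop). */
static void flush_one(struct csd *csd)
{
    void (*func)(void *) = csd->func;   /* copy before unlock: an async */
    void *info = csd->info;             /* csd may be reused right away */

    if (atomic_load(&csd->flags) & CSD_FLAG_SYNCHRONOUS) {
        func(info);                     /* finish before waking the waiter */
        csd_unlock(csd);
    } else {
        csd_unlock(csd);                /* async: owner may requeue now */
        func(info);
    }
}

static void hello(void *info) { printf("ran with %s\n", (char *)info); }

int main(void)
{
    static struct csd c = {
        .flags = CSD_FLAG_LOCK | CSD_FLAG_SYNCHRONOUS,
        .func  = hello,
        .info  = "sync data",
    };

    flush_one(&c);      /* would normally run on the target CPU */
    csd_lock_wait(&c);  /* returns immediately: flags were released to 0 */
    return 0;
}

The ordering in flush_one() mirrors the new flush loop: synchronous calls must run the callback before releasing the waiter, while asynchronous csds are unlocked first because their owner is free to reuse them immediately.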
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 40190f28db35..c697f73d82d6 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -4,6 +4,7 @@
4#include <linux/cpu.h> 4#include <linux/cpu.h>
5#include <linux/err.h> 5#include <linux/err.h>
6#include <linux/smp.h> 6#include <linux/smp.h>
7#include <linux/delay.h>
7#include <linux/init.h> 8#include <linux/init.h>
8#include <linux/list.h> 9#include <linux/list.h>
9#include <linux/slab.h> 10#include <linux/slab.h>
@@ -314,3 +315,158 @@ void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread)
314 put_online_cpus(); 315 put_online_cpus();
315} 316}
316EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread); 317EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread);
318
319static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD);
320
321/*
322 * Called to poll specified CPU's state, for example, when waiting for
323 * a CPU to come online.
324 */
325int cpu_report_state(int cpu)
326{
327 return atomic_read(&per_cpu(cpu_hotplug_state, cpu));
328}
329
330/*
331 * If CPU has died properly, set its state to CPU_UP_PREPARE and
332 * return success. Otherwise, return -EBUSY if the CPU died after
333 * cpu_wait_death() timed out. And yet otherwise again, return -EAGAIN
334 * if cpu_wait_death() timed out and the CPU still hasn't gotten around
335 * to dying. In the latter two cases, the CPU might not be set up
336 * properly, but it is up to the arch-specific code to decide.
337 * Finally, -EIO indicates an unanticipated problem.
338 *
339 * Note that it is permissible to omit this call entirely, as is
340 * done in architectures that do no CPU-hotplug error checking.
341 */
342int cpu_check_up_prepare(int cpu)
343{
344 if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) {
345 atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_UP_PREPARE);
346 return 0;
347 }
348
349 switch (atomic_read(&per_cpu(cpu_hotplug_state, cpu))) {
350
351 case CPU_POST_DEAD:
352
353 /* The CPU died properly, so just start it up again. */
354 atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_UP_PREPARE);
355 return 0;
356
357 case CPU_DEAD_FROZEN:
358
359 /*
360 * Timeout during CPU death, so let caller know.
361 * The outgoing CPU completed its processing, but after
362 * cpu_wait_death() timed out and reported the error. The
363 * caller is free to proceed, in which case the state
364 * will be reset properly by cpu_set_state_online().
365 * Proceeding despite this -EBUSY return makes sense
366 * for systems where the outgoing CPUs take themselves
367 * offline, with no post-death manipulation required from
368 * a surviving CPU.
369 */
370 return -EBUSY;
371
372 case CPU_BROKEN:
373
374 /*
375 * The most likely reason we got here is that there was
376 * a timeout during CPU death, and the outgoing CPU never
377 * did complete its processing. This could happen on
378 * a virtualized system if the outgoing VCPU gets preempted
379 * for more than five seconds, and the user attempts to
380 * immediately online that same CPU. Trying again later
381 * might return -EBUSY above, hence -EAGAIN.
382 */
383 return -EAGAIN;
384
385 default:
386
387 /* Should not happen. Famous last words. */
388 return -EIO;
389 }
390}
391
392/*
393 * Mark the specified CPU online.
394 *
395 * Note that it is permissible to omit this call entirely, as is
396 * done in architectures that do no CPU-hotplug error checking.
397 */
398void cpu_set_state_online(int cpu)
399{
400 (void)atomic_xchg(&per_cpu(cpu_hotplug_state, cpu), CPU_ONLINE);
401}
402
403#ifdef CONFIG_HOTPLUG_CPU
404
405/*
406 * Wait for the specified CPU to exit the idle loop and die.
407 */
408bool cpu_wait_death(unsigned int cpu, int seconds)
409{
410 int jf_left = seconds * HZ;
411 int oldstate;
412 bool ret = true;
413 int sleep_jf = 1;
414
415 might_sleep();
416
417 /* The outgoing CPU will normally get done quite quickly. */
418 if (atomic_read(&per_cpu(cpu_hotplug_state, cpu)) == CPU_DEAD)
419 goto update_state;
420 udelay(5);
421
422 /* But if the outgoing CPU dawdles, wait increasingly long times. */
423 while (atomic_read(&per_cpu(cpu_hotplug_state, cpu)) != CPU_DEAD) {
424 schedule_timeout_uninterruptible(sleep_jf);
425 jf_left -= sleep_jf;
426 if (jf_left <= 0)
427 break;
428 sleep_jf = DIV_ROUND_UP(sleep_jf * 11, 10);
429 }
430update_state:
431 oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu));
432 if (oldstate == CPU_DEAD) {
433 /* Outgoing CPU died normally, update state. */
434 smp_mb(); /* atomic_read() before update. */
435 atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_POST_DEAD);
436 } else {
437 /* Outgoing CPU still hasn't died, set state accordingly. */
438 if (atomic_cmpxchg(&per_cpu(cpu_hotplug_state, cpu),
439 oldstate, CPU_BROKEN) != oldstate)
440 goto update_state;
441 ret = false;
442 }
443 return ret;
444}
445
446/*
447 * Called by the outgoing CPU to report its successful death. Return
448 * false if this report follows the surviving CPU's timing out.
449 *
450 * A separate "CPU_DEAD_FROZEN" is used when the surviving CPU
451 * timed out. This approach allows architectures to omit calls to
452 * cpu_check_up_prepare() and cpu_set_state_online() without defeating
453 * the next cpu_wait_death()'s polling loop.
454 */
455bool cpu_report_death(void)
456{
457 int oldstate;
458 int newstate;
459 int cpu = smp_processor_id();
460
461 do {
462 oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu));
463 if (oldstate != CPU_BROKEN)
464 newstate = CPU_DEAD;
465 else
466 newstate = CPU_DEAD_FROZEN;
467 } while (atomic_cmpxchg(&per_cpu(cpu_hotplug_state, cpu),
468 oldstate, newstate) != oldstate);
469 return newstate == CPU_DEAD;
470}
471
472#endif /* #ifdef CONFIG_HOTPLUG_CPU */
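The new smpboot helpers above implement a small per-CPU state machine so a surviving CPU can detect an outgoing CPU that never finished dying (CPU_BROKEN / CPU_DEAD_FROZEN) instead of hanging forever. Below is a single-threaded walk through the timed-out scenario using C11 atomics in place of the kernel's atomic_t; the state values and the scenario are illustrative assumptions.

#include <stdatomic.h>
#include <stdio.h>
#include <stdbool.h>

enum { CPU_UP_PREPARE, CPU_ONLINE, CPU_DEAD, CPU_DEAD_FROZEN,
       CPU_BROKEN, CPU_POST_DEAD };

static atomic_int cpu_state = CPU_ONLINE;   /* set by cpu_set_state_online() */

/* Surviving CPU's view when the dying CPU never reported in time. */
static bool wait_death_timed_out(void)
{
    int oldstate = atomic_load(&cpu_state);

    if (oldstate == CPU_DEAD) {                     /* normal path */
        atomic_store(&cpu_state, CPU_POST_DEAD);
        return false;
    }
    /* Record that the outgoing CPU is misbehaving (cpu_wait_death()). */
    atomic_compare_exchange_strong(&cpu_state, &oldstate, CPU_BROKEN);
    return true;
}

/* Outgoing CPU: report death, possibly after the survivor gave up. */
static bool report_death(void)
{
    int oldstate, newstate;

    do {
        oldstate = atomic_load(&cpu_state);
        newstate = (oldstate == CPU_BROKEN) ? CPU_DEAD_FROZEN : CPU_DEAD;
    } while (!atomic_compare_exchange_weak(&cpu_state, &oldstate, newstate));

    return newstate == CPU_DEAD;    /* false: we reported too late */
}

int main(void)
{
    printf("survivor timed out:      %d\n", wait_death_timed_out()); /* 1 */
    printf("death reported in time:  %d\n", report_death());         /* 0 */
    /* cpu_check_up_prepare() would now see CPU_DEAD_FROZEN -> -EBUSY. */
    printf("final state frozen:      %d\n",
           atomic_load(&cpu_state) == CPU_DEAD_FROZEN);               /* 1 */
    return 0;
}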
diff --git a/kernel/sys.c b/kernel/sys.c
index a03d9cd23ed7..a4e372b798a5 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -325,6 +325,7 @@ out_unlock:
325 * SMP: There are not races, the GIDs are checked only by filesystem 325 * SMP: There are not races, the GIDs are checked only by filesystem
326 * operations (as far as semantic preservation is concerned). 326 * operations (as far as semantic preservation is concerned).
327 */ 327 */
328#ifdef CONFIG_MULTIUSER
328SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) 329SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
329{ 330{
330 struct user_namespace *ns = current_user_ns(); 331 struct user_namespace *ns = current_user_ns();
@@ -815,6 +816,7 @@ change_okay:
815 commit_creds(new); 816 commit_creds(new);
816 return old_fsgid; 817 return old_fsgid;
817} 818}
819#endif /* CONFIG_MULTIUSER */
818 820
819/** 821/**
820 * sys_getpid - return the thread group id of the current process 822 * sys_getpid - return the thread group id of the current process
@@ -1647,14 +1649,13 @@ SYSCALL_DEFINE1(umask, int, mask)
1647 return mask; 1649 return mask;
1648} 1650}
1649 1651
1650static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd) 1652static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1651{ 1653{
1652 struct fd exe; 1654 struct fd exe;
1655 struct file *old_exe, *exe_file;
1653 struct inode *inode; 1656 struct inode *inode;
1654 int err; 1657 int err;
1655 1658
1656 VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
1657
1658 exe = fdget(fd); 1659 exe = fdget(fd);
1659 if (!exe.file) 1660 if (!exe.file)
1660 return -EBADF; 1661 return -EBADF;
@@ -1678,15 +1679,22 @@ static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd)
1678 /* 1679 /*
1679 * Forbid mm->exe_file change if old file still mapped. 1680 * Forbid mm->exe_file change if old file still mapped.
1680 */ 1681 */
1682 exe_file = get_mm_exe_file(mm);
1681 err = -EBUSY; 1683 err = -EBUSY;
1682 if (mm->exe_file) { 1684 if (exe_file) {
1683 struct vm_area_struct *vma; 1685 struct vm_area_struct *vma;
1684 1686
1685 for (vma = mm->mmap; vma; vma = vma->vm_next) 1687 down_read(&mm->mmap_sem);
1686 if (vma->vm_file && 1688 for (vma = mm->mmap; vma; vma = vma->vm_next) {
1687 path_equal(&vma->vm_file->f_path, 1689 if (!vma->vm_file)
1688 &mm->exe_file->f_path)) 1690 continue;
1689 goto exit; 1691 if (path_equal(&vma->vm_file->f_path,
1692 &exe_file->f_path))
1693 goto exit_err;
1694 }
1695
1696 up_read(&mm->mmap_sem);
1697 fput(exe_file);
1690 } 1698 }
1691 1699
1692 /* 1700 /*
@@ -1700,10 +1708,18 @@ static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd)
1700 goto exit; 1708 goto exit;
1701 1709
1702 err = 0; 1710 err = 0;
1703 set_mm_exe_file(mm, exe.file); /* this grabs a reference to exe.file */ 1711 /* set the new file, lockless */
1712 get_file(exe.file);
1713 old_exe = xchg(&mm->exe_file, exe.file);
1714 if (old_exe)
1715 fput(old_exe);
1704exit: 1716exit:
1705 fdput(exe); 1717 fdput(exe);
1706 return err; 1718 return err;
1719exit_err:
1720 up_read(&mm->mmap_sem);
1721 fput(exe_file);
1722 goto exit;
1707} 1723}
1708 1724
1709#ifdef CONFIG_CHECKPOINT_RESTORE 1725#ifdef CONFIG_CHECKPOINT_RESTORE
@@ -1838,10 +1854,9 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data
1838 user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL; 1854 user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL;
1839 } 1855 }
1840 1856
1841 down_write(&mm->mmap_sem);
1842 if (prctl_map.exe_fd != (u32)-1) 1857 if (prctl_map.exe_fd != (u32)-1)
1843 error = prctl_set_mm_exe_file_locked(mm, prctl_map.exe_fd); 1858 error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd);
1844 downgrade_write(&mm->mmap_sem); 1859 down_read(&mm->mmap_sem);
1845 if (error) 1860 if (error)
1846 goto out; 1861 goto out;
1847 1862
@@ -1907,12 +1922,8 @@ static int prctl_set_mm(int opt, unsigned long addr,
1907 if (!capable(CAP_SYS_RESOURCE)) 1922 if (!capable(CAP_SYS_RESOURCE))
1908 return -EPERM; 1923 return -EPERM;
1909 1924
1910 if (opt == PR_SET_MM_EXE_FILE) { 1925 if (opt == PR_SET_MM_EXE_FILE)
1911 down_write(&mm->mmap_sem); 1926 return prctl_set_mm_exe_file(mm, (unsigned int)addr);
1912 error = prctl_set_mm_exe_file_locked(mm, (unsigned int)addr);
1913 up_write(&mm->mmap_sem);
1914 return error;
1915 }
1916 1927
1917 if (addr >= TASK_SIZE || addr < mmap_min_addr) 1928 if (addr >= TASK_SIZE || addr < mmap_min_addr)
1918 return -EINVAL; 1929 return -EINVAL;
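prctl_set_mm_exe_file() above no longer needs mmap_sem held for write to replace mm->exe_file: it takes a reference on the new file, swaps the pointer with xchg(), and drops whatever reference was there before. The sketch below shows that reference-counted pointer-swap pattern in userspace terms; struct file, the refcount field and the helper names are stand-ins, not kernel API.

#include <stdatomic.h>
#include <stdio.h>

struct file {
    atomic_int refcount;
    const char *name;
};

static struct file *get_file(struct file *f)
{
    atomic_fetch_add(&f->refcount, 1);
    return f;
}

static void fput(struct file *f)
{
    if (atomic_fetch_sub(&f->refcount, 1) == 1)
        printf("releasing %s\n", f->name);
}

static _Atomic(struct file *) exe_file;   /* models mm->exe_file */

/* Lockless swap: publish 'newf' and drop the old holder's reference. */
static void set_exe_file(struct file *newf)
{
    struct file *old;

    get_file(newf);                           /* reference held by exe_file */
    old = atomic_exchange(&exe_file, newf);   /* the xchg() in the diff */
    if (old)
        fput(old);
}

int main(void)
{
    static struct file a = { .refcount = 1, .name = "/usr/bin/old" };
    static struct file b = { .refcount = 1, .name = "/usr/bin/new" };

    set_exe_file(&a);
    set_exe_file(&b);   /* exe_file's reference on 'a' is dropped here */
    fput(&a);           /* creator's reference: 'a' prints "releasing" */
    fput(&b);           /* 'b' stays alive: exe_file still holds a ref */
    return 0;
}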
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 5adcb0ae3a58..7995ef5868d8 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -159,6 +159,20 @@ cond_syscall(sys_uselib);
159cond_syscall(sys_fadvise64); 159cond_syscall(sys_fadvise64);
160cond_syscall(sys_fadvise64_64); 160cond_syscall(sys_fadvise64_64);
161cond_syscall(sys_madvise); 161cond_syscall(sys_madvise);
162cond_syscall(sys_setuid);
163cond_syscall(sys_setregid);
164cond_syscall(sys_setgid);
165cond_syscall(sys_setreuid);
166cond_syscall(sys_setresuid);
167cond_syscall(sys_getresuid);
168cond_syscall(sys_setresgid);
169cond_syscall(sys_getresgid);
170cond_syscall(sys_setgroups);
171cond_syscall(sys_getgroups);
172cond_syscall(sys_setfsuid);
173cond_syscall(sys_setfsgid);
174cond_syscall(sys_capget);
175cond_syscall(sys_capset);
162 176
163/* arch-specific weak syscall entries */ 177/* arch-specific weak syscall entries */
164cond_syscall(sys_pciconfig_read); 178cond_syscall(sys_pciconfig_read);
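The cond_syscall() entries added above give CONFIG_MULTIUSER=n builds weak stubs that return -ENOSYS whenever the real setuid/setgid family is compiled out. The snippet below is a rough approximation of that mechanism using the common GCC/ELF weak-alias idiom; it is not the kernel macro's exact definition, and the void signature is a simplification.

#include <stdio.h>
#include <errno.h>

long sys_ni_syscall(void)
{
    return -ENOSYS;
}

/* "cond_syscall(sys_setregid)": a weak alias a real definition overrides. */
long sys_setregid(void) __attribute__((weak, alias("sys_ni_syscall")));

int main(void)
{
    /* With no strong definition linked in, the call lands in the stub. */
    printf("sys_setregid() -> %ld\n", sys_setregid());   /* -ENOSYS */
    return 0;
}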
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 88ea2d6e0031..2082b1a88fb9 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -19,6 +19,7 @@
19 */ 19 */
20 20
21#include <linux/module.h> 21#include <linux/module.h>
22#include <linux/aio.h>
22#include <linux/mm.h> 23#include <linux/mm.h>
23#include <linux/swap.h> 24#include <linux/swap.h>
24#include <linux/slab.h> 25#include <linux/slab.h>
@@ -92,11 +93,9 @@
92#include <linux/nmi.h> 93#include <linux/nmi.h>
93#endif 94#endif
94 95
95
96#if defined(CONFIG_SYSCTL) 96#if defined(CONFIG_SYSCTL)
97 97
98/* External variables not in a header file. */ 98/* External variables not in a header file. */
99extern int max_threads;
100extern int suid_dumpable; 99extern int suid_dumpable;
101#ifdef CONFIG_COREDUMP 100#ifdef CONFIG_COREDUMP
102extern int core_uses_pid; 101extern int core_uses_pid;
@@ -709,10 +708,10 @@ static struct ctl_table kern_table[] = {
709#endif 708#endif
710 { 709 {
711 .procname = "threads-max", 710 .procname = "threads-max",
712 .data = &max_threads, 711 .data = NULL,
713 .maxlen = sizeof(int), 712 .maxlen = sizeof(int),
714 .mode = 0644, 713 .mode = 0644,
715 .proc_handler = proc_dointvec, 714 .proc_handler = sysctl_max_threads,
716 }, 715 },
717 { 716 {
718 .procname = "random", 717 .procname = "random",
@@ -846,7 +845,7 @@ static struct ctl_table kern_table[] = {
846 .data = &watchdog_user_enabled, 845 .data = &watchdog_user_enabled,
847 .maxlen = sizeof (int), 846 .maxlen = sizeof (int),
848 .mode = 0644, 847 .mode = 0644,
849 .proc_handler = proc_dowatchdog, 848 .proc_handler = proc_watchdog,
850 .extra1 = &zero, 849 .extra1 = &zero,
851 .extra2 = &one, 850 .extra2 = &one,
852 }, 851 },
@@ -855,11 +854,33 @@ static struct ctl_table kern_table[] = {
855 .data = &watchdog_thresh, 854 .data = &watchdog_thresh,
856 .maxlen = sizeof(int), 855 .maxlen = sizeof(int),
857 .mode = 0644, 856 .mode = 0644,
858 .proc_handler = proc_dowatchdog, 857 .proc_handler = proc_watchdog_thresh,
859 .extra1 = &zero, 858 .extra1 = &zero,
860 .extra2 = &sixty, 859 .extra2 = &sixty,
861 }, 860 },
862 { 861 {
862 .procname = "nmi_watchdog",
863 .data = &nmi_watchdog_enabled,
864 .maxlen = sizeof (int),
865 .mode = 0644,
866 .proc_handler = proc_nmi_watchdog,
867 .extra1 = &zero,
868#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR)
869 .extra2 = &one,
870#else
871 .extra2 = &zero,
872#endif
873 },
874 {
875 .procname = "soft_watchdog",
876 .data = &soft_watchdog_enabled,
877 .maxlen = sizeof (int),
878 .mode = 0644,
879 .proc_handler = proc_soft_watchdog,
880 .extra1 = &zero,
881 .extra2 = &one,
882 },
883 {
863 .procname = "softlockup_panic", 884 .procname = "softlockup_panic",
864 .data = &softlockup_panic, 885 .data = &softlockup_panic,
865 .maxlen = sizeof(int), 886 .maxlen = sizeof(int),
@@ -879,15 +900,6 @@ static struct ctl_table kern_table[] = {
879 .extra2 = &one, 900 .extra2 = &one,
880 }, 901 },
881#endif /* CONFIG_SMP */ 902#endif /* CONFIG_SMP */
882 {
883 .procname = "nmi_watchdog",
884 .data = &watchdog_user_enabled,
885 .maxlen = sizeof (int),
886 .mode = 0644,
887 .proc_handler = proc_dowatchdog,
888 .extra1 = &zero,
889 .extra2 = &one,
890 },
891#endif 903#endif
892#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) 904#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
893 { 905 {
@@ -1228,6 +1240,14 @@ static struct ctl_table vm_table[] = {
1228 .extra1 = &zero, 1240 .extra1 = &zero,
1229 }, 1241 },
1230 { 1242 {
1243 .procname = "dirtytime_expire_seconds",
1244 .data = &dirtytime_expire_interval,
1245 .maxlen = sizeof(dirty_expire_interval),
1246 .mode = 0644,
1247 .proc_handler = dirtytime_interval_handler,
1248 .extra1 = &zero,
1249 },
1250 {
1231 .procname = "nr_pdflush_threads", 1251 .procname = "nr_pdflush_threads",
1232 .mode = 0444 /* read-only */, 1252 .mode = 0444 /* read-only */,
1233 .proc_handler = pdflush_proc_obsolete, 1253 .proc_handler = pdflush_proc_obsolete,
@@ -1313,6 +1333,15 @@ static struct ctl_table vm_table[] = {
1313 .extra1 = &min_extfrag_threshold, 1333 .extra1 = &min_extfrag_threshold,
1314 .extra2 = &max_extfrag_threshold, 1334 .extra2 = &max_extfrag_threshold,
1315 }, 1335 },
1336 {
1337 .procname = "compact_unevictable_allowed",
1338 .data = &sysctl_compact_unevictable_allowed,
1339 .maxlen = sizeof(int),
1340 .mode = 0644,
1341 .proc_handler = proc_dointvec,
1342 .extra1 = &zero,
1343 .extra2 = &one,
1344 },
1316 1345
1317#endif /* CONFIG_COMPACTION */ 1346#endif /* CONFIG_COMPACTION */
1318 { 1347 {
@@ -1952,7 +1981,15 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
1952 int write, void *data) 1981 int write, void *data)
1953{ 1982{
1954 if (write) { 1983 if (write) {
1955 *valp = *negp ? -*lvalp : *lvalp; 1984 if (*negp) {
1985 if (*lvalp > (unsigned long) INT_MAX + 1)
1986 return -EINVAL;
1987 *valp = -*lvalp;
1988 } else {
1989 if (*lvalp > (unsigned long) INT_MAX)
1990 return -EINVAL;
1991 *valp = *lvalp;
1992 }
1956 } else { 1993 } else {
1957 int val = *valp; 1994 int val = *valp;
1958 if (val < 0) { 1995 if (val < 0) {
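The do_proc_dointvec_conv() hunk above stops silently truncating sysctl writes that do not fit in an int and returns -EINVAL instead. The same bounds check, extracted into a standalone helper (function and variable names are illustrative):

#include <limits.h>
#include <stdio.h>
#include <stdbool.h>
#include <errno.h>

/* Convert a parsed (sign, magnitude) pair into *valp, as the handler does. */
static int store_int(bool negative, unsigned long magnitude, int *valp)
{
    if (negative) {
        if (magnitude > (unsigned long)INT_MAX + 1)
            return -EINVAL;                     /* below INT_MIN */
        *valp = (int)-(long long)magnitude;
    } else {
        if (magnitude > (unsigned long)INT_MAX)
            return -EINVAL;                     /* above INT_MAX */
        *valp = (int)magnitude;
    }
    return 0;
}

int main(void)
{
    int v;

    printf("%d\n", store_int(false, 123, &v));                        /* 0 */
    printf("%d\n", store_int(true, (unsigned long)INT_MAX + 1, &v));  /* 0: v == INT_MIN */
    printf("%d\n", store_int(false, (unsigned long)INT_MAX + 1, &v)); /* -EINVAL */
    return 0;
}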
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index d626dc98e8df..579ce1b929af 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -33,12 +33,6 @@ config ARCH_USES_GETTIMEOFFSET
33config GENERIC_CLOCKEVENTS 33config GENERIC_CLOCKEVENTS
34 bool 34 bool
35 35
36# Migration helper. Builds, but does not invoke
37config GENERIC_CLOCKEVENTS_BUILD
38 bool
39 default y
40 depends on GENERIC_CLOCKEVENTS
41
42# Architecture can handle broadcast in a driver-agnostic way 36# Architecture can handle broadcast in a driver-agnostic way
43config ARCH_HAS_TICK_BROADCAST 37config ARCH_HAS_TICK_BROADCAST
44 bool 38 bool
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index c09c07817d7a..01f0312419b3 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -2,15 +2,13 @@ obj-y += time.o timer.o hrtimer.o itimer.o posix-timers.o posix-cpu-timers.o
2obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o 2obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o
3obj-y += timeconv.o timecounter.o posix-clock.o alarmtimer.o 3obj-y += timeconv.o timecounter.o posix-clock.o alarmtimer.o
4 4
5obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o 5obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o tick-common.o
6obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
7ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y) 6ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y)
8 obj-y += tick-broadcast.o 7 obj-y += tick-broadcast.o
9 obj-$(CONFIG_TICK_ONESHOT) += tick-broadcast-hrtimer.o 8 obj-$(CONFIG_TICK_ONESHOT) += tick-broadcast-hrtimer.o
10endif 9endif
11obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o 10obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o
12obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o 11obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o
13obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o
14obj-$(CONFIG_TIMER_STATS) += timer_stats.o 12obj-$(CONFIG_TIMER_STATS) += timer_stats.o
15obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o 13obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o
16obj-$(CONFIG_TEST_UDELAY) += test_udelay.o 14obj-$(CONFIG_TEST_UDELAY) += test_udelay.o
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 888ecc114ddc..11dc22a6983b 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -94,25 +94,76 @@ u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt)
94} 94}
95EXPORT_SYMBOL_GPL(clockevent_delta2ns); 95EXPORT_SYMBOL_GPL(clockevent_delta2ns);
96 96
97static int __clockevents_set_state(struct clock_event_device *dev,
98 enum clock_event_state state)
99{
100 /* Transition with legacy set_mode() callback */
101 if (dev->set_mode) {
102 /* Legacy callback doesn't support new modes */
103 if (state > CLOCK_EVT_STATE_ONESHOT)
104 return -ENOSYS;
105 /*
106 * 'clock_event_state' and 'clock_event_mode' have 1-to-1
107 * mapping until *_ONESHOT, and so a simple cast will work.
108 */
109 dev->set_mode((enum clock_event_mode)state, dev);
110 dev->mode = (enum clock_event_mode)state;
111 return 0;
112 }
113
114 if (dev->features & CLOCK_EVT_FEAT_DUMMY)
115 return 0;
116
117 /* Transition with new state-specific callbacks */
118 switch (state) {
119 case CLOCK_EVT_STATE_DETACHED:
120 /*
121 * This is an internal state, which is guaranteed to go from
122 * SHUTDOWN to DETACHED. No driver interaction required.
123 */
124 return 0;
125
126 case CLOCK_EVT_STATE_SHUTDOWN:
127 return dev->set_state_shutdown(dev);
128
129 case CLOCK_EVT_STATE_PERIODIC:
130 /* Core internal bug */
131 if (!(dev->features & CLOCK_EVT_FEAT_PERIODIC))
132 return -ENOSYS;
133 return dev->set_state_periodic(dev);
134
135 case CLOCK_EVT_STATE_ONESHOT:
136 /* Core internal bug */
137 if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT))
138 return -ENOSYS;
139 return dev->set_state_oneshot(dev);
140
141 default:
142 return -ENOSYS;
143 }
144}
145
97/** 146/**
98 * clockevents_set_mode - set the operating mode of a clock event device 147 * clockevents_set_state - set the operating state of a clock event device
99 * @dev: device to modify 148 * @dev: device to modify
100 * @mode: new mode 149 * @state: new state
101 * 150 *
102 * Must be called with interrupts disabled ! 151 * Must be called with interrupts disabled !
103 */ 152 */
104void clockevents_set_mode(struct clock_event_device *dev, 153void clockevents_set_state(struct clock_event_device *dev,
105 enum clock_event_mode mode) 154 enum clock_event_state state)
106{ 155{
107 if (dev->mode != mode) { 156 if (dev->state != state) {
108 dev->set_mode(mode, dev); 157 if (__clockevents_set_state(dev, state))
109 dev->mode = mode; 158 return;
159
160 dev->state = state;
110 161
111 /* 162 /*
112 * A nsec2cyc multiplicator of 0 is invalid and we'd crash 163 * A nsec2cyc multiplicator of 0 is invalid and we'd crash
113 * on it, so fix it up and emit a warning: 164 * on it, so fix it up and emit a warning:
114 */ 165 */
115 if (mode == CLOCK_EVT_MODE_ONESHOT) { 166 if (state == CLOCK_EVT_STATE_ONESHOT) {
116 if (unlikely(!dev->mult)) { 167 if (unlikely(!dev->mult)) {
117 dev->mult = 1; 168 dev->mult = 1;
118 WARN_ON(1); 169 WARN_ON(1);
@@ -127,10 +178,28 @@ void clockevents_set_mode(struct clock_event_device *dev,
127 */ 178 */
128void clockevents_shutdown(struct clock_event_device *dev) 179void clockevents_shutdown(struct clock_event_device *dev)
129{ 180{
130 clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); 181 clockevents_set_state(dev, CLOCK_EVT_STATE_SHUTDOWN);
131 dev->next_event.tv64 = KTIME_MAX; 182 dev->next_event.tv64 = KTIME_MAX;
132} 183}
133 184
185/**
186 * clockevents_tick_resume - Resume the tick device before using it again
187 * @dev: device to resume
188 */
189int clockevents_tick_resume(struct clock_event_device *dev)
190{
191 int ret = 0;
192
193 if (dev->set_mode) {
194 dev->set_mode(CLOCK_EVT_MODE_RESUME, dev);
195 dev->mode = CLOCK_EVT_MODE_RESUME;
196 } else if (dev->tick_resume) {
197 ret = dev->tick_resume(dev);
198 }
199
200 return ret;
201}
202
134#ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST 203#ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST
135 204
136/* Limit min_delta to a jiffie */ 205/* Limit min_delta to a jiffie */
@@ -183,7 +252,7 @@ static int clockevents_program_min_delta(struct clock_event_device *dev)
183 delta = dev->min_delta_ns; 252 delta = dev->min_delta_ns;
184 dev->next_event = ktime_add_ns(ktime_get(), delta); 253 dev->next_event = ktime_add_ns(ktime_get(), delta);
185 254
186 if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) 255 if (dev->state == CLOCK_EVT_STATE_SHUTDOWN)
187 return 0; 256 return 0;
188 257
189 dev->retries++; 258 dev->retries++;
@@ -220,7 +289,7 @@ static int clockevents_program_min_delta(struct clock_event_device *dev)
220 delta = dev->min_delta_ns; 289 delta = dev->min_delta_ns;
221 dev->next_event = ktime_add_ns(ktime_get(), delta); 290 dev->next_event = ktime_add_ns(ktime_get(), delta);
222 291
223 if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) 292 if (dev->state == CLOCK_EVT_STATE_SHUTDOWN)
224 return 0; 293 return 0;
225 294
226 dev->retries++; 295 dev->retries++;
@@ -252,7 +321,7 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
252 321
253 dev->next_event = expires; 322 dev->next_event = expires;
254 323
255 if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) 324 if (dev->state == CLOCK_EVT_STATE_SHUTDOWN)
256 return 0; 325 return 0;
257 326
258 /* Shortcut for clockevent devices that can deal with ktime. */ 327 /* Shortcut for clockevent devices that can deal with ktime. */
@@ -297,7 +366,7 @@ static int clockevents_replace(struct clock_event_device *ced)
297 struct clock_event_device *dev, *newdev = NULL; 366 struct clock_event_device *dev, *newdev = NULL;
298 367
299 list_for_each_entry(dev, &clockevent_devices, list) { 368 list_for_each_entry(dev, &clockevent_devices, list) {
300 if (dev == ced || dev->mode != CLOCK_EVT_MODE_UNUSED) 369 if (dev == ced || dev->state != CLOCK_EVT_STATE_DETACHED)
301 continue; 370 continue;
302 371
303 if (!tick_check_replacement(newdev, dev)) 372 if (!tick_check_replacement(newdev, dev))
@@ -323,7 +392,7 @@ static int clockevents_replace(struct clock_event_device *ced)
323static int __clockevents_try_unbind(struct clock_event_device *ced, int cpu) 392static int __clockevents_try_unbind(struct clock_event_device *ced, int cpu)
324{ 393{
325 /* Fast track. Device is unused */ 394 /* Fast track. Device is unused */
326 if (ced->mode == CLOCK_EVT_MODE_UNUSED) { 395 if (ced->state == CLOCK_EVT_STATE_DETACHED) {
327 list_del_init(&ced->list); 396 list_del_init(&ced->list);
328 return 0; 397 return 0;
329 } 398 }
@@ -373,6 +442,37 @@ int clockevents_unbind_device(struct clock_event_device *ced, int cpu)
373} 442}
374EXPORT_SYMBOL_GPL(clockevents_unbind_device); 443EXPORT_SYMBOL_GPL(clockevents_unbind_device);
375 444
445/* Sanity check of state transition callbacks */
446static int clockevents_sanity_check(struct clock_event_device *dev)
447{
448 /* Legacy set_mode() callback */
449 if (dev->set_mode) {
450 /* We shouldn't be supporting new modes now */
451 WARN_ON(dev->set_state_periodic || dev->set_state_oneshot ||
452 dev->set_state_shutdown || dev->tick_resume);
453
454 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
455 return 0;
456 }
457
458 if (dev->features & CLOCK_EVT_FEAT_DUMMY)
459 return 0;
460
461 /* New state-specific callbacks */
462 if (!dev->set_state_shutdown)
463 return -EINVAL;
464
465 if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) &&
466 !dev->set_state_periodic)
467 return -EINVAL;
468
469 if ((dev->features & CLOCK_EVT_FEAT_ONESHOT) &&
470 !dev->set_state_oneshot)
471 return -EINVAL;
472
473 return 0;
474}
475
376/** 476/**
377 * clockevents_register_device - register a clock event device 477 * clockevents_register_device - register a clock event device
378 * @dev: device to register 478 * @dev: device to register
@@ -381,7 +481,11 @@ void clockevents_register_device(struct clock_event_device *dev)
381{ 481{
382 unsigned long flags; 482 unsigned long flags;
383 483
384 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); 484 BUG_ON(clockevents_sanity_check(dev));
485
486 /* Initialize state to DETACHED */
487 dev->state = CLOCK_EVT_STATE_DETACHED;
488
385 if (!dev->cpumask) { 489 if (!dev->cpumask) {
386 WARN_ON(num_possible_cpus() > 1); 490 WARN_ON(num_possible_cpus() > 1);
387 dev->cpumask = cpumask_of(smp_processor_id()); 491 dev->cpumask = cpumask_of(smp_processor_id());
@@ -445,11 +549,11 @@ int __clockevents_update_freq(struct clock_event_device *dev, u32 freq)
445{ 549{
446 clockevents_config(dev, freq); 550 clockevents_config(dev, freq);
447 551
448 if (dev->mode == CLOCK_EVT_MODE_ONESHOT) 552 if (dev->state == CLOCK_EVT_STATE_ONESHOT)
449 return clockevents_program_event(dev, dev->next_event, false); 553 return clockevents_program_event(dev, dev->next_event, false);
450 554
451 if (dev->mode == CLOCK_EVT_MODE_PERIODIC) 555 if (dev->state == CLOCK_EVT_STATE_PERIODIC)
452 dev->set_mode(CLOCK_EVT_MODE_PERIODIC, dev); 556 return __clockevents_set_state(dev, CLOCK_EVT_STATE_PERIODIC);
453 557
454 return 0; 558 return 0;
455} 559}
@@ -491,30 +595,27 @@ void clockevents_handle_noop(struct clock_event_device *dev)
491 * @old: device to release (can be NULL) 595 * @old: device to release (can be NULL)
492 * @new: device to request (can be NULL) 596 * @new: device to request (can be NULL)
493 * 597 *
494 * Called from the notifier chain. clockevents_lock is held already 598 * Called from various tick functions with clockevents_lock held and
599 * interrupts disabled.
495 */ 600 */
496void clockevents_exchange_device(struct clock_event_device *old, 601void clockevents_exchange_device(struct clock_event_device *old,
497 struct clock_event_device *new) 602 struct clock_event_device *new)
498{ 603{
499 unsigned long flags;
500
501 local_irq_save(flags);
502 /* 604 /*
503 * Caller releases a clock event device. We queue it into the 605 * Caller releases a clock event device. We queue it into the
504 * released list and do a notify add later. 606 * released list and do a notify add later.
505 */ 607 */
506 if (old) { 608 if (old) {
507 module_put(old->owner); 609 module_put(old->owner);
508 clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED); 610 clockevents_set_state(old, CLOCK_EVT_STATE_DETACHED);
509 list_del(&old->list); 611 list_del(&old->list);
510 list_add(&old->list, &clockevents_released); 612 list_add(&old->list, &clockevents_released);
511 } 613 }
512 614
513 if (new) { 615 if (new) {
514 BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED); 616 BUG_ON(new->state != CLOCK_EVT_STATE_DETACHED);
515 clockevents_shutdown(new); 617 clockevents_shutdown(new);
516 } 618 }
517 local_irq_restore(flags);
518} 619}
519 620
520/** 621/**
@@ -541,74 +642,40 @@ void clockevents_resume(void)
541 dev->resume(dev); 642 dev->resume(dev);
542} 643}
543 644
544#ifdef CONFIG_GENERIC_CLOCKEVENTS 645#ifdef CONFIG_HOTPLUG_CPU
545/** 646/**
546 * clockevents_notify - notification about relevant events 647 * tick_cleanup_dead_cpu - Cleanup the tick and clockevents of a dead cpu
547 * Returns 0 on success, any other value on error
548 */ 648 */
549int clockevents_notify(unsigned long reason, void *arg) 649void tick_cleanup_dead_cpu(int cpu)
550{ 650{
551 struct clock_event_device *dev, *tmp; 651 struct clock_event_device *dev, *tmp;
552 unsigned long flags; 652 unsigned long flags;
553 int cpu, ret = 0;
554 653
555 raw_spin_lock_irqsave(&clockevents_lock, flags); 654 raw_spin_lock_irqsave(&clockevents_lock, flags);
556 655
557 switch (reason) { 656 tick_shutdown_broadcast_oneshot(cpu);
558 case CLOCK_EVT_NOTIFY_BROADCAST_ON: 657 tick_shutdown_broadcast(cpu);
559 case CLOCK_EVT_NOTIFY_BROADCAST_OFF: 658 tick_shutdown(cpu);
560 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: 659 /*
561 tick_broadcast_on_off(reason, arg); 660 * Unregister the clock event devices which were
562 break; 661 * released from the users in the notify chain.
563 662 */
564 case CLOCK_EVT_NOTIFY_BROADCAST_ENTER: 663 list_for_each_entry_safe(dev, tmp, &clockevents_released, list)
565 case CLOCK_EVT_NOTIFY_BROADCAST_EXIT: 664 list_del(&dev->list);
566 ret = tick_broadcast_oneshot_control(reason); 665 /*
567 break; 666 * Now check whether the CPU has left unused per cpu devices
568 667 */
569 case CLOCK_EVT_NOTIFY_CPU_DYING: 668 list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) {
570 tick_handover_do_timer(arg); 669 if (cpumask_test_cpu(cpu, dev->cpumask) &&
571 break; 670 cpumask_weight(dev->cpumask) == 1 &&
572 671 !tick_is_broadcast_device(dev)) {
573 case CLOCK_EVT_NOTIFY_SUSPEND: 672 BUG_ON(dev->state != CLOCK_EVT_STATE_DETACHED);
574 tick_suspend();
575 tick_suspend_broadcast();
576 break;
577
578 case CLOCK_EVT_NOTIFY_RESUME:
579 tick_resume();
580 break;
581
582 case CLOCK_EVT_NOTIFY_CPU_DEAD:
583 tick_shutdown_broadcast_oneshot(arg);
584 tick_shutdown_broadcast(arg);
585 tick_shutdown(arg);
586 /*
587 * Unregister the clock event devices which were
588 * released from the users in the notify chain.
589 */
590 list_for_each_entry_safe(dev, tmp, &clockevents_released, list)
591 list_del(&dev->list); 673 list_del(&dev->list);
592 /*
593 * Now check whether the CPU has left unused per cpu devices
594 */
595 cpu = *((int *)arg);
596 list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) {
597 if (cpumask_test_cpu(cpu, dev->cpumask) &&
598 cpumask_weight(dev->cpumask) == 1 &&
599 !tick_is_broadcast_device(dev)) {
600 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
601 list_del(&dev->list);
602 }
603 } 674 }
604 break;
605 default:
606 break;
607 } 675 }
608 raw_spin_unlock_irqrestore(&clockevents_lock, flags); 676 raw_spin_unlock_irqrestore(&clockevents_lock, flags);
609 return ret;
610} 677}
611EXPORT_SYMBOL_GPL(clockevents_notify); 678#endif
612 679
613#ifdef CONFIG_SYSFS 680#ifdef CONFIG_SYSFS
614struct bus_type clockevents_subsys = { 681struct bus_type clockevents_subsys = {
@@ -727,5 +794,3 @@ static int __init clockevents_init_sysfs(void)
727} 794}
728device_initcall(clockevents_init_sysfs); 795device_initcall(clockevents_init_sysfs);
729#endif /* SYSFS */ 796#endif /* SYSFS */
730
731#endif /* GENERIC_CLOCK_EVENTS */
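The clockevents rework above moves drivers from a single set_mode() callback to one callback per state, with __clockevents_set_state() dispatching on the requested state and falling back to the legacy callback when present. A cut-down model of that dispatch follows; the struct and state enum are stand-ins for clock_event_device, not the kernel definitions.

#include <stdio.h>
#include <errno.h>

enum state { STATE_DETACHED, STATE_SHUTDOWN, STATE_PERIODIC, STATE_ONESHOT };

struct clock_event {
    enum state state;
    /* legacy interface */
    void (*set_mode)(enum state, struct clock_event *);
    /* new per-state interface */
    int (*set_state_shutdown)(struct clock_event *);
    int (*set_state_periodic)(struct clock_event *);
    int (*set_state_oneshot)(struct clock_event *);
};

static int set_state(struct clock_event *dev, enum state state)
{
    if (dev->set_mode) {                /* legacy path: one callback */
        dev->set_mode(state, dev);
        dev->state = state;
        return 0;
    }

    switch (state) {                    /* new path: per-state callbacks */
    case STATE_DETACHED:
        return 0;                       /* internal, no driver action */
    case STATE_SHUTDOWN:
        return dev->set_state_shutdown(dev);
    case STATE_PERIODIC:
        return dev->set_state_periodic ? dev->set_state_periodic(dev)
                                       : -ENOSYS;
    case STATE_ONESHOT:
        return dev->set_state_oneshot ? dev->set_state_oneshot(dev)
                                      : -ENOSYS;
    }
    return -ENOSYS;
}

static int shutdown_cb(struct clock_event *dev) { (void)dev; puts("shutdown"); return 0; }
static int oneshot_cb(struct clock_event *dev)  { (void)dev; puts("oneshot");  return 0; }

int main(void)
{
    struct clock_event dev = {
        .state = STATE_DETACHED,
        .set_state_shutdown = shutdown_cb,
        .set_state_oneshot  = oneshot_cb,   /* no periodic support */
    };

    if (!set_state(&dev, STATE_ONESHOT))
        dev.state = STATE_ONESHOT;
    printf("periodic: %d\n", set_state(&dev, STATE_PERIODIC));  /* -ENOSYS */
    return 0;
}

clockevents_sanity_check() in the diff enforces the same contract at registration time: per-state drivers must supply set_state_shutdown(), plus the periodic/oneshot callbacks matching their advertised features.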
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 4892352f0e49..15facb1b9c60 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -142,13 +142,6 @@ static void __clocksource_unstable(struct clocksource *cs)
142 schedule_work(&watchdog_work); 142 schedule_work(&watchdog_work);
143} 143}
144 144
145static void clocksource_unstable(struct clocksource *cs, int64_t delta)
146{
147 printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n",
148 cs->name, delta);
149 __clocksource_unstable(cs);
150}
151
152/** 145/**
153 * clocksource_mark_unstable - mark clocksource unstable via watchdog 146 * clocksource_mark_unstable - mark clocksource unstable via watchdog
154 * @cs: clocksource to be marked unstable 147 * @cs: clocksource to be marked unstable
@@ -174,7 +167,7 @@ void clocksource_mark_unstable(struct clocksource *cs)
174static void clocksource_watchdog(unsigned long data) 167static void clocksource_watchdog(unsigned long data)
175{ 168{
176 struct clocksource *cs; 169 struct clocksource *cs;
177 cycle_t csnow, wdnow, delta; 170 cycle_t csnow, wdnow, cslast, wdlast, delta;
178 int64_t wd_nsec, cs_nsec; 171 int64_t wd_nsec, cs_nsec;
179 int next_cpu, reset_pending; 172 int next_cpu, reset_pending;
180 173
@@ -213,6 +206,8 @@ static void clocksource_watchdog(unsigned long data)
213 206
214 delta = clocksource_delta(csnow, cs->cs_last, cs->mask); 207 delta = clocksource_delta(csnow, cs->cs_last, cs->mask);
215 cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift); 208 cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift);
209 wdlast = cs->wd_last; /* save these in case we print them */
210 cslast = cs->cs_last;
216 cs->cs_last = csnow; 211 cs->cs_last = csnow;
217 cs->wd_last = wdnow; 212 cs->wd_last = wdnow;
218 213
@@ -221,7 +216,12 @@ static void clocksource_watchdog(unsigned long data)
221 216
222 /* Check the deviation from the watchdog clocksource. */ 217 /* Check the deviation from the watchdog clocksource. */
223 if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) { 218 if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) {
224 clocksource_unstable(cs, cs_nsec - wd_nsec); 219 pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable, because the skew is too large:\n", cs->name);
220 pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n",
221 watchdog->name, wdnow, wdlast, watchdog->mask);
222 pr_warn(" '%s' cs_now: %llx cs_last: %llx mask: %llx\n",
223 cs->name, csnow, cslast, cs->mask);
224 __clocksource_unstable(cs);
225 continue; 225 continue;
226 } 226 }
227 227
@@ -469,26 +469,25 @@ static u32 clocksource_max_adjustment(struct clocksource *cs)
469 * @shift: cycle to nanosecond divisor (power of two) 469 * @shift: cycle to nanosecond divisor (power of two)
470 * @maxadj: maximum adjustment value to mult (~11%) 470 * @maxadj: maximum adjustment value to mult (~11%)
471 * @mask: bitmask for two's complement subtraction of non 64 bit counters 471 * @mask: bitmask for two's complement subtraction of non 64 bit counters
472 * @max_cyc: maximum cycle value before potential overflow (does not include
473 * any safety margin)
474 *
475 * NOTE: This function includes a safety margin of 50%, in other words, we
476 * return half the number of nanoseconds the hardware counter can technically
477 * cover. This is done so that we can potentially detect problems caused by
478 * delayed timers or bad hardware, which might result in time intervals that
479 * are larger then what the math used can handle without overflows.
472 */ 480 */
473u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask) 481u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask, u64 *max_cyc)
474{ 482{
475 u64 max_nsecs, max_cycles; 483 u64 max_nsecs, max_cycles;
476 484
477 /* 485 /*
478 * Calculate the maximum number of cycles that we can pass to the 486 * Calculate the maximum number of cycles that we can pass to the
479 * cyc2ns function without overflowing a 64-bit signed result. The 487 * cyc2ns() function without overflowing a 64-bit result.
480 * maximum number of cycles is equal to ULLONG_MAX/(mult+maxadj)
481 * which is equivalent to the below.
482 * max_cycles < (2^63)/(mult + maxadj)
483 * max_cycles < 2^(log2((2^63)/(mult + maxadj)))
484 * max_cycles < 2^(log2(2^63) - log2(mult + maxadj))
485 * max_cycles < 2^(63 - log2(mult + maxadj))
486 * max_cycles < 1 << (63 - log2(mult + maxadj))
487 * Please note that we add 1 to the result of the log2 to account for
488 * any rounding errors, ensure the above inequality is satisfied and
489 * no overflow will occur.
490 */ 488 */
491 max_cycles = 1ULL << (63 - (ilog2(mult + maxadj) + 1)); 489 max_cycles = ULLONG_MAX;
490 do_div(max_cycles, mult+maxadj);
492 491
493 /* 492 /*
494 * The actual maximum number of cycles we can defer the clocksource is 493 * The actual maximum number of cycles we can defer the clocksource is
@@ -499,27 +498,26 @@ u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask)
499 max_cycles = min(max_cycles, mask); 498 max_cycles = min(max_cycles, mask);
500 max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift); 499 max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift);
501 500
501 /* return the max_cycles value as well if requested */
502 if (max_cyc)
503 *max_cyc = max_cycles;
504
505 /* Return 50% of the actual maximum, so we can detect bad values */
506 max_nsecs >>= 1;
507
502 return max_nsecs; 508 return max_nsecs;
503} 509}
504 510
505/** 511/**
506 * clocksource_max_deferment - Returns max time the clocksource can be deferred 512 * clocksource_update_max_deferment - Updates the clocksource max_idle_ns & max_cycles
507 * @cs: Pointer to clocksource 513 * @cs: Pointer to clocksource to be updated
508 * 514 *
509 */ 515 */
510static u64 clocksource_max_deferment(struct clocksource *cs) 516static inline void clocksource_update_max_deferment(struct clocksource *cs)
511{ 517{
512 u64 max_nsecs; 518 cs->max_idle_ns = clocks_calc_max_nsecs(cs->mult, cs->shift,
513 519 cs->maxadj, cs->mask,
514 max_nsecs = clocks_calc_max_nsecs(cs->mult, cs->shift, cs->maxadj, 520 &cs->max_cycles);
515 cs->mask);
516 /*
517 * To ensure that the clocksource does not wrap whilst we are idle,
518 * limit the time the clocksource can be deferred by 12.5%. Please
519 * note a margin of 12.5% is used because this can be computed with
520 * a shift, versus say 10% which would require division.
521 */
522 return max_nsecs - (max_nsecs >> 3);
523} 521}
524 522
525#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET 523#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET
@@ -648,7 +646,7 @@ static void clocksource_enqueue(struct clocksource *cs)
648} 646}
649 647
650/** 648/**
651 * __clocksource_updatefreq_scale - Used update clocksource with new freq 649 * __clocksource_update_freq_scale - Used update clocksource with new freq
652 * @cs: clocksource to be registered 650 * @cs: clocksource to be registered
653 * @scale: Scale factor multiplied against freq to get clocksource hz 651 * @scale: Scale factor multiplied against freq to get clocksource hz
654 * @freq: clocksource frequency (cycles per second) divided by scale 652 * @freq: clocksource frequency (cycles per second) divided by scale
@@ -656,48 +654,64 @@ static void clocksource_enqueue(struct clocksource *cs)
656 * This should only be called from the clocksource->enable() method. 654 * This should only be called from the clocksource->enable() method.
657 * 655 *
658 * This *SHOULD NOT* be called directly! Please use the 656 * This *SHOULD NOT* be called directly! Please use the
659 * clocksource_updatefreq_hz() or clocksource_updatefreq_khz helper functions. 657 * __clocksource_update_freq_hz() or __clocksource_update_freq_khz() helper
658 * functions.
660 */ 659 */
661void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) 660void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq)
662{ 661{
663 u64 sec; 662 u64 sec;
663
664 /* 664 /*
665 * Calc the maximum number of seconds which we can run before 665 * Default clocksources are *special* and self-define their mult/shift.
666 * wrapping around. For clocksources which have a mask > 32bit 666 * But, you're not special, so you should specify a freq value.
667 * we need to limit the max sleep time to have a good
668 * conversion precision. 10 minutes is still a reasonable
669 * amount. That results in a shift value of 24 for a
670 * clocksource with mask >= 40bit and f >= 4GHz. That maps to
671 * ~ 0.06ppm granularity for NTP. We apply the same 12.5%
672 * margin as we do in clocksource_max_deferment()
673 */ 667 */
674 sec = (cs->mask - (cs->mask >> 3)); 668 if (freq) {
675 do_div(sec, freq); 669 /*
676 do_div(sec, scale); 670 * Calc the maximum number of seconds which we can run before
677 if (!sec) 671 * wrapping around. For clocksources which have a mask > 32-bit
678 sec = 1; 672 * we need to limit the max sleep time to have a good
679 else if (sec > 600 && cs->mask > UINT_MAX) 673 * conversion precision. 10 minutes is still a reasonable
680 sec = 600; 674 * amount. That results in a shift value of 24 for a
681 675 * clocksource with mask >= 40-bit and f >= 4GHz. That maps to
682 clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, 676 * ~ 0.06ppm granularity for NTP.
683 NSEC_PER_SEC / scale, sec * scale); 677 */
684 678 sec = cs->mask;
679 do_div(sec, freq);
680 do_div(sec, scale);
681 if (!sec)
682 sec = 1;
683 else if (sec > 600 && cs->mask > UINT_MAX)
684 sec = 600;
685
686 clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
687 NSEC_PER_SEC / scale, sec * scale);
688 }
685 /* 689 /*
686 * for clocksources that have large mults, to avoid overflow. 690 * Ensure clocksources that have large 'mult' values don't overflow
 687 * Since mult may be adjusted by ntp, add an extra safety margin 691 * when adjusted.
688 *
689 */ 692 */
690 cs->maxadj = clocksource_max_adjustment(cs); 693 cs->maxadj = clocksource_max_adjustment(cs);
691 while ((cs->mult + cs->maxadj < cs->mult) 694 while (freq && ((cs->mult + cs->maxadj < cs->mult)
692 || (cs->mult - cs->maxadj > cs->mult)) { 695 || (cs->mult - cs->maxadj > cs->mult))) {
693 cs->mult >>= 1; 696 cs->mult >>= 1;
694 cs->shift--; 697 cs->shift--;
695 cs->maxadj = clocksource_max_adjustment(cs); 698 cs->maxadj = clocksource_max_adjustment(cs);
696 } 699 }
697 700
698 cs->max_idle_ns = clocksource_max_deferment(cs); 701 /*
702 * Only warn for *special* clocksources that self-define
703 * their mult/shift values and don't specify a freq.
704 */
705 WARN_ONCE(cs->mult + cs->maxadj < cs->mult,
706 "timekeeping: Clocksource %s might overflow on 11%% adjustment\n",
707 cs->name);
708
709 clocksource_update_max_deferment(cs);
710
711 pr_info("clocksource %s: mask: 0x%llx max_cycles: 0x%llx, max_idle_ns: %lld ns\n",
712 cs->name, cs->mask, cs->max_cycles, cs->max_idle_ns);
699} 713}
700EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); 714EXPORT_SYMBOL_GPL(__clocksource_update_freq_scale);
701 715
702/** 716/**
703 * __clocksource_register_scale - Used to install new clocksources 717 * __clocksource_register_scale - Used to install new clocksources
@@ -714,7 +728,7 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
714{ 728{
715 729
716 /* Initialize mult/shift and max_idle_ns */ 730 /* Initialize mult/shift and max_idle_ns */
717 __clocksource_updatefreq_scale(cs, scale, freq); 731 __clocksource_update_freq_scale(cs, scale, freq);
718 732
719 /* Add clocksource to the clocksource list */ 733 /* Add clocksource to the clocksource list */
720 mutex_lock(&clocksource_mutex); 734 mutex_lock(&clocksource_mutex);
@@ -726,33 +740,6 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
726} 740}
727EXPORT_SYMBOL_GPL(__clocksource_register_scale); 741EXPORT_SYMBOL_GPL(__clocksource_register_scale);
728 742
729
730/**
731 * clocksource_register - Used to install new clocksources
732 * @cs: clocksource to be registered
733 *
734 * Returns -EBUSY if registration fails, zero otherwise.
735 */
736int clocksource_register(struct clocksource *cs)
737{
738 /* calculate max adjustment for given mult/shift */
739 cs->maxadj = clocksource_max_adjustment(cs);
740 WARN_ONCE(cs->mult + cs->maxadj < cs->mult,
741 "Clocksource %s might overflow on 11%% adjustment\n",
742 cs->name);
743
744 /* calculate max idle time permitted for this clocksource */
745 cs->max_idle_ns = clocksource_max_deferment(cs);
746
747 mutex_lock(&clocksource_mutex);
748 clocksource_enqueue(cs);
749 clocksource_enqueue_watchdog(cs);
750 clocksource_select();
751 mutex_unlock(&clocksource_mutex);
752 return 0;
753}
754EXPORT_SYMBOL(clocksource_register);
755
756static void __clocksource_change_rating(struct clocksource *cs, int rating) 743static void __clocksource_change_rating(struct clocksource *cs, int rating)
757{ 744{
758 list_del(&cs->list); 745 list_del(&cs->list);
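
The clocksource.c hunks above fold the 12.5% wrap margin into clocks_calc_max_nsecs() and keep halving 'mult' (while decrementing 'shift') until the NTP adjustment headroom (cs->maxadj, roughly 11% of mult) can no longer overflow. Below is a rough, standalone userspace sketch of the fixed-point arithmetic involved; the 19.2 MHz counter and every constant are invented for the example and are not taken from this patch.

#include <stdint.h>
#include <stdio.h>

/* ns = cyc * (mult / 2^shift), evaluated in integer math */
static uint64_t cyc_to_ns(uint64_t cyc, uint32_t mult, uint32_t shift)
{
	return (cyc * mult) >> shift;
}

int main(void)
{
	uint32_t mult = 873813333, shift = 24;	/* ~52.08 ns per cycle at 19.2 MHz */
	uint32_t maxadj = (uint32_t)(((uint64_t)mult * 11) / 100); /* ~11% headroom */
	uint64_t max_cycles = UINT32_MAX;	/* pretend the counter is 32 bits wide */
	uint64_t max_nsecs = cyc_to_ns(max_cycles, mult, shift);

	printf("1000 cycles      = %llu ns\n",
	       (unsigned long long)cyc_to_ns(1000, mult, shift));
	printf("wrap limit       = %llu ns\n", (unsigned long long)max_nsecs);
	/* 12.5% safety margin, computed with a shift instead of a division */
	printf("deferment limit  = %llu ns\n",
	       (unsigned long long)(max_nsecs - (max_nsecs >> 3)));

	/* The patch halves mult until this sum cannot wrap a 32-bit value. */
	if (mult + maxadj < mult)
		printf("mult + maxadj would overflow 32 bits\n");
	else
		printf("mult + maxadj    = %u (fits in 32 bits)\n", mult + maxadj);
	return 0;
}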
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index bee0c1f78091..76d4bd962b19 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -54,7 +54,7 @@
54 54
55#include <trace/events/timer.h> 55#include <trace/events/timer.h>
56 56
57#include "timekeeping.h" 57#include "tick-internal.h"
58 58
59/* 59/*
60 * The timer bases: 60 * The timer bases:
@@ -1707,17 +1707,10 @@ static int hrtimer_cpu_notify(struct notifier_block *self,
1707 break; 1707 break;
1708 1708
1709#ifdef CONFIG_HOTPLUG_CPU 1709#ifdef CONFIG_HOTPLUG_CPU
1710 case CPU_DYING:
1711 case CPU_DYING_FROZEN:
1712 clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DYING, &scpu);
1713 break;
1714 case CPU_DEAD: 1710 case CPU_DEAD:
1715 case CPU_DEAD_FROZEN: 1711 case CPU_DEAD_FROZEN:
1716 {
1717 clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &scpu);
1718 migrate_hrtimers(scpu); 1712 migrate_hrtimers(scpu);
1719 break; 1713 break;
1720 }
1721#endif 1714#endif
1722 1715
1723 default: 1716 default:
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index a6a5bf53e86d..347fecf86a3f 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -25,7 +25,7 @@
25#include <linux/module.h> 25#include <linux/module.h>
26#include <linux/init.h> 26#include <linux/init.h>
27 27
28#include "tick-internal.h" 28#include "timekeeping.h"
29 29
30/* The Jiffies based clocksource is the lowest common 30/* The Jiffies based clocksource is the lowest common
31 * denominator clock source which should function on 31 * denominator clock source which should function on
@@ -71,6 +71,7 @@ static struct clocksource clocksource_jiffies = {
71 .mask = 0xffffffff, /*32bits*/ 71 .mask = 0xffffffff, /*32bits*/
72 .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ 72 .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
73 .shift = JIFFIES_SHIFT, 73 .shift = JIFFIES_SHIFT,
74 .max_cycles = 10,
74}; 75};
75 76
76__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock); 77__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock);
@@ -94,7 +95,7 @@ EXPORT_SYMBOL(jiffies);
94 95
95static int __init init_jiffies_clocksource(void) 96static int __init init_jiffies_clocksource(void)
96{ 97{
97 return clocksource_register(&clocksource_jiffies); 98 return __clocksource_register(&clocksource_jiffies);
98} 99}
99 100
100core_initcall(init_jiffies_clocksource); 101core_initcall(init_jiffies_clocksource);
@@ -130,6 +131,6 @@ int register_refined_jiffies(long cycles_per_second)
130 131
131 refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT; 132 refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT;
132 133
133 clocksource_register(&refined_jiffies); 134 __clocksource_register(&refined_jiffies);
134 return 0; 135 return 0;
135} 136}
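
jiffies.c now registers through __clocksource_register() instead of the removed clocksource_register(). The header side of that helper is not part of this diff; presumably it is a thin wrapper that routes a clocksource with self-defined mult/shift through __clocksource_register_scale() with freq == 0, which is exactly the freq-less path added in clocksource.c above. A hedged, runnable sketch of that assumption, with a mock in place of the real kernel function:

#include <stdio.h>

typedef unsigned int u32;
struct clocksource { u32 mult, shift; };

/* stand-in for the real kernel function, only to make the sketch runnable */
static int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
{
	if (freq == 0)
		printf("scale=%u freq=0: keeping driver-provided mult=%u shift=%u\n",
		       scale, cs->mult, cs->shift);
	return 0;
}

/* assumed shape of the helper jiffies.c now calls (not shown in this diff) */
static inline int __clocksource_register(struct clocksource *cs)
{
	return __clocksource_register_scale(cs, 1, 0);
}

int main(void)
{
	struct clocksource jiffies_cs = { .mult = 10000000, .shift = 8 };

	return __clocksource_register(&jiffies_cs);
}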
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 0f60b08a4f07..7a681003001c 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -17,7 +17,6 @@
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/rtc.h> 18#include <linux/rtc.h>
19 19
20#include "tick-internal.h"
21#include "ntp_internal.h" 20#include "ntp_internal.h"
22 21
23/* 22/*
@@ -459,6 +458,16 @@ out:
459 return leap; 458 return leap;
460} 459}
461 460
461#ifdef CONFIG_GENERIC_CMOS_UPDATE
462int __weak update_persistent_clock64(struct timespec64 now64)
463{
464 struct timespec now;
465
466 now = timespec64_to_timespec(now64);
467 return update_persistent_clock(now);
468}
469#endif
470
462#if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) 471#if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC)
463static void sync_cmos_clock(struct work_struct *work); 472static void sync_cmos_clock(struct work_struct *work);
464 473
@@ -494,8 +503,9 @@ static void sync_cmos_clock(struct work_struct *work)
494 if (persistent_clock_is_local) 503 if (persistent_clock_is_local)
495 adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); 504 adjust.tv_sec -= (sys_tz.tz_minuteswest * 60);
496#ifdef CONFIG_GENERIC_CMOS_UPDATE 505#ifdef CONFIG_GENERIC_CMOS_UPDATE
497 fail = update_persistent_clock(timespec64_to_timespec(adjust)); 506 fail = update_persistent_clock64(adjust);
498#endif 507#endif
508
499#ifdef CONFIG_RTC_SYSTOHC 509#ifdef CONFIG_RTC_SYSTOHC
500 if (fail == -ENODEV) 510 if (fail == -ENODEV)
501 fail = rtc_set_ntp_time(adjust); 511 fail = rtc_set_ntp_time(adjust);
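
The new update_persistent_clock64() above is a __weak default: an architecture that can program its persistent clock with 64-bit seconds supplies a strong definition under the same name, and the timespec64-to-timespec fallback disappears at link time. A minimal userspace illustration of that weak-override pattern follows; the function name is hypothetical and is not the kernel API.

#include <stdio.h>

/* Default (weak) implementation -- stands in for update_persistent_clock64() */
__attribute__((weak)) int set_hw_clock(long long sec)
{
	printf("generic fallback: setting RTC to %lld\n", sec);
	return 0;
}

/*
 * Another object file may provide a strong definition:
 *
 *   int set_hw_clock(long long sec)
 *   {
 *           ... program the hardware directly with 64-bit seconds ...
 *   }
 *
 * in which case the weak default above is silently replaced at link time.
 */
int main(void)
{
	return set_hw_clock(1700000000LL);
}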
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 01d2d15aa662..a26036d37a38 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -1,5 +1,6 @@
1/* 1/*
2 * sched_clock.c: support for extending counters to full 64-bit ns counter 2 * sched_clock.c: Generic sched_clock() support, to extend low level
3 * hardware time counters to full 64-bit ns values.
3 * 4 *
4 * This program is free software; you can redistribute it and/or modify 5 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as 6 * it under the terms of the GNU General Public License version 2 as
@@ -18,15 +19,53 @@
18#include <linux/seqlock.h> 19#include <linux/seqlock.h>
19#include <linux/bitops.h> 20#include <linux/bitops.h>
20 21
21struct clock_data { 22/**
22 ktime_t wrap_kt; 23 * struct clock_read_data - data required to read from sched_clock()
24 *
25 * @epoch_ns: sched_clock() value at last update
26 * @epoch_cyc: Clock cycle value at last update.
27 * @sched_clock_mask: Bitmask for two's complement subtraction of non 64bit
28 * clocks.
29 * @read_sched_clock: Current clock source (or dummy source when suspended).
 30 * @mult: Multiplier for scaled math conversion.
31 * @shift: Shift value for scaled math conversion.
32 *
33 * Care must be taken when updating this structure; it is read by
34 * some very hot code paths. It occupies <=40 bytes and, when combined
35 * with the seqcount used to synchronize access, comfortably fits into
36 * a 64 byte cache line.
37 */
38struct clock_read_data {
23 u64 epoch_ns; 39 u64 epoch_ns;
24 u64 epoch_cyc; 40 u64 epoch_cyc;
25 seqcount_t seq; 41 u64 sched_clock_mask;
26 unsigned long rate; 42 u64 (*read_sched_clock)(void);
27 u32 mult; 43 u32 mult;
28 u32 shift; 44 u32 shift;
29 bool suspended; 45};
46
47/**
48 * struct clock_data - all data needed for sched_clock() (including
49 * registration of a new clock source)
50 *
51 * @seq: Sequence counter for protecting updates. The lowest
52 * bit is the index for @read_data.
53 * @read_data: Data required to read from sched_clock.
54 * @wrap_kt: Duration for which clock can run before wrapping.
55 * @rate: Tick rate of the registered clock.
56 * @actual_read_sched_clock: Registered hardware level clock read function.
57 *
58 * The ordering of this structure has been chosen to optimize cache
59 * performance. In particular 'seq' and 'read_data[0]' (combined) should fit
60 * into a single 64-byte cache line.
61 */
62struct clock_data {
63 seqcount_t seq;
64 struct clock_read_data read_data[2];
65 ktime_t wrap_kt;
66 unsigned long rate;
67
68 u64 (*actual_read_sched_clock)(void);
30}; 69};
31 70
32static struct hrtimer sched_clock_timer; 71static struct hrtimer sched_clock_timer;
@@ -34,12 +73,6 @@ static int irqtime = -1;
34 73
35core_param(irqtime, irqtime, int, 0400); 74core_param(irqtime, irqtime, int, 0400);
36 75
37static struct clock_data cd = {
38 .mult = NSEC_PER_SEC / HZ,
39};
40
41static u64 __read_mostly sched_clock_mask;
42
43static u64 notrace jiffy_sched_clock_read(void) 76static u64 notrace jiffy_sched_clock_read(void)
44{ 77{
45 /* 78 /*
@@ -49,7 +82,11 @@ static u64 notrace jiffy_sched_clock_read(void)
49 return (u64)(jiffies - INITIAL_JIFFIES); 82 return (u64)(jiffies - INITIAL_JIFFIES);
50} 83}
51 84
52static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read; 85static struct clock_data cd ____cacheline_aligned = {
86 .read_data[0] = { .mult = NSEC_PER_SEC / HZ,
87 .read_sched_clock = jiffy_sched_clock_read, },
88 .actual_read_sched_clock = jiffy_sched_clock_read,
89};
53 90
54static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) 91static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
55{ 92{
@@ -58,111 +95,136 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
58 95
59unsigned long long notrace sched_clock(void) 96unsigned long long notrace sched_clock(void)
60{ 97{
61 u64 epoch_ns; 98 u64 cyc, res;
62 u64 epoch_cyc;
63 u64 cyc;
64 unsigned long seq; 99 unsigned long seq;
65 100 struct clock_read_data *rd;
66 if (cd.suspended)
67 return cd.epoch_ns;
68 101
69 do { 102 do {
70 seq = raw_read_seqcount_begin(&cd.seq); 103 seq = raw_read_seqcount(&cd.seq);
71 epoch_cyc = cd.epoch_cyc; 104 rd = cd.read_data + (seq & 1);
72 epoch_ns = cd.epoch_ns; 105
106 cyc = (rd->read_sched_clock() - rd->epoch_cyc) &
107 rd->sched_clock_mask;
108 res = rd->epoch_ns + cyc_to_ns(cyc, rd->mult, rd->shift);
73 } while (read_seqcount_retry(&cd.seq, seq)); 109 } while (read_seqcount_retry(&cd.seq, seq));
74 110
75 cyc = read_sched_clock(); 111 return res;
76 cyc = (cyc - epoch_cyc) & sched_clock_mask; 112}
77 return epoch_ns + cyc_to_ns(cyc, cd.mult, cd.shift); 113
114/*
115 * Updating the data required to read the clock.
116 *
117 * sched_clock() will never observe mis-matched data even if called from
118 * an NMI. We do this by maintaining an odd/even copy of the data and
119 * steering sched_clock() to one or the other using a sequence counter.
120 * In order to preserve the data cache profile of sched_clock() as much
121 * as possible the system reverts back to the even copy when the update
122 * completes; the odd copy is used *only* during an update.
123 */
124static void update_clock_read_data(struct clock_read_data *rd)
125{
126 /* update the backup (odd) copy with the new data */
127 cd.read_data[1] = *rd;
128
129 /* steer readers towards the odd copy */
130 raw_write_seqcount_latch(&cd.seq);
131
 132 /* now it's safe for us to update the normal (even) copy */
133 cd.read_data[0] = *rd;
134
135 /* switch readers back to the even copy */
136 raw_write_seqcount_latch(&cd.seq);
78} 137}
79 138
80/* 139/*
81 * Atomically update the sched_clock epoch. 140 * Atomically update the sched_clock() epoch.
82 */ 141 */
83static void notrace update_sched_clock(void) 142static void update_sched_clock(void)
84{ 143{
85 unsigned long flags;
86 u64 cyc; 144 u64 cyc;
87 u64 ns; 145 u64 ns;
146 struct clock_read_data rd;
147
148 rd = cd.read_data[0];
149
150 cyc = cd.actual_read_sched_clock();
151 ns = rd.epoch_ns + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, rd.mult, rd.shift);
152
153 rd.epoch_ns = ns;
154 rd.epoch_cyc = cyc;
88 155
89 cyc = read_sched_clock(); 156 update_clock_read_data(&rd);
90 ns = cd.epoch_ns +
91 cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask,
92 cd.mult, cd.shift);
93
94 raw_local_irq_save(flags);
95 raw_write_seqcount_begin(&cd.seq);
96 cd.epoch_ns = ns;
97 cd.epoch_cyc = cyc;
98 raw_write_seqcount_end(&cd.seq);
99 raw_local_irq_restore(flags);
100} 157}
101 158
102static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt) 159static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt)
103{ 160{
104 update_sched_clock(); 161 update_sched_clock();
105 hrtimer_forward_now(hrt, cd.wrap_kt); 162 hrtimer_forward_now(hrt, cd.wrap_kt);
163
106 return HRTIMER_RESTART; 164 return HRTIMER_RESTART;
107} 165}
108 166
109void __init sched_clock_register(u64 (*read)(void), int bits, 167void __init
110 unsigned long rate) 168sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
111{ 169{
112 u64 res, wrap, new_mask, new_epoch, cyc, ns; 170 u64 res, wrap, new_mask, new_epoch, cyc, ns;
113 u32 new_mult, new_shift; 171 u32 new_mult, new_shift;
114 ktime_t new_wrap_kt;
115 unsigned long r; 172 unsigned long r;
116 char r_unit; 173 char r_unit;
174 struct clock_read_data rd;
117 175
118 if (cd.rate > rate) 176 if (cd.rate > rate)
119 return; 177 return;
120 178
121 WARN_ON(!irqs_disabled()); 179 WARN_ON(!irqs_disabled());
122 180
123 /* calculate the mult/shift to convert counter ticks to ns. */ 181 /* Calculate the mult/shift to convert counter ticks to ns. */
124 clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600); 182 clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600);
125 183
126 new_mask = CLOCKSOURCE_MASK(bits); 184 new_mask = CLOCKSOURCE_MASK(bits);
185 cd.rate = rate;
186
187 /* Calculate how many nanosecs until we risk wrapping */
188 wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask, NULL);
189 cd.wrap_kt = ns_to_ktime(wrap);
127 190
128 /* calculate how many ns until we wrap */ 191 rd = cd.read_data[0];
129 wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask);
130 new_wrap_kt = ns_to_ktime(wrap - (wrap >> 3));
131 192
 132 /* update epoch for new counter and update epoch_ns from old counter */ 193 /* Update epoch for new counter and update 'epoch_ns' from old counter */
133 new_epoch = read(); 194 new_epoch = read();
134 cyc = read_sched_clock(); 195 cyc = cd.actual_read_sched_clock();
135 ns = cd.epoch_ns + cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, 196 ns = rd.epoch_ns + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, rd.mult, rd.shift);
136 cd.mult, cd.shift); 197 cd.actual_read_sched_clock = read;
137 198
138 raw_write_seqcount_begin(&cd.seq); 199 rd.read_sched_clock = read;
139 read_sched_clock = read; 200 rd.sched_clock_mask = new_mask;
140 sched_clock_mask = new_mask; 201 rd.mult = new_mult;
141 cd.rate = rate; 202 rd.shift = new_shift;
142 cd.wrap_kt = new_wrap_kt; 203 rd.epoch_cyc = new_epoch;
143 cd.mult = new_mult; 204 rd.epoch_ns = ns;
144 cd.shift = new_shift; 205
145 cd.epoch_cyc = new_epoch; 206 update_clock_read_data(&rd);
146 cd.epoch_ns = ns;
147 raw_write_seqcount_end(&cd.seq);
148 207
149 r = rate; 208 r = rate;
150 if (r >= 4000000) { 209 if (r >= 4000000) {
151 r /= 1000000; 210 r /= 1000000;
152 r_unit = 'M'; 211 r_unit = 'M';
153 } else if (r >= 1000) { 212 } else {
154 r /= 1000; 213 if (r >= 1000) {
155 r_unit = 'k'; 214 r /= 1000;
156 } else 215 r_unit = 'k';
157 r_unit = ' '; 216 } else {
158 217 r_unit = ' ';
159 /* calculate the ns resolution of this counter */ 218 }
219 }
220
221 /* Calculate the ns resolution of this counter */
160 res = cyc_to_ns(1ULL, new_mult, new_shift); 222 res = cyc_to_ns(1ULL, new_mult, new_shift);
161 223
162 pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n", 224 pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n",
163 bits, r, r_unit, res, wrap); 225 bits, r, r_unit, res, wrap);
164 226
165 /* Enable IRQ time accounting if we have a fast enough sched_clock */ 227 /* Enable IRQ time accounting if we have a fast enough sched_clock() */
166 if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) 228 if (irqtime > 0 || (irqtime == -1 && rate >= 1000000))
167 enable_sched_clock_irqtime(); 229 enable_sched_clock_irqtime();
168 230
@@ -172,10 +234,10 @@ void __init sched_clock_register(u64 (*read)(void), int bits,
172void __init sched_clock_postinit(void) 234void __init sched_clock_postinit(void)
173{ 235{
174 /* 236 /*
175 * If no sched_clock function has been provided at that point, 237 * If no sched_clock() function has been provided at that point,
 176 * make it the final one. 238 * make it the final one.
177 */ 239 */
178 if (read_sched_clock == jiffy_sched_clock_read) 240 if (cd.actual_read_sched_clock == jiffy_sched_clock_read)
179 sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ); 241 sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ);
180 242
181 update_sched_clock(); 243 update_sched_clock();
@@ -189,29 +251,53 @@ void __init sched_clock_postinit(void)
189 hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); 251 hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
190} 252}
191 253
254/*
255 * Clock read function for use when the clock is suspended.
256 *
257 * This function makes it appear to sched_clock() as if the clock
258 * stopped counting at its last update.
259 *
260 * This function must only be called from the critical
261 * section in sched_clock(). It relies on the read_seqcount_retry()
262 * at the end of the critical section to be sure we observe the
263 * correct copy of 'epoch_cyc'.
264 */
265static u64 notrace suspended_sched_clock_read(void)
266{
267 unsigned long seq = raw_read_seqcount(&cd.seq);
268
269 return cd.read_data[seq & 1].epoch_cyc;
270}
271
192static int sched_clock_suspend(void) 272static int sched_clock_suspend(void)
193{ 273{
274 struct clock_read_data *rd = &cd.read_data[0];
275
194 update_sched_clock(); 276 update_sched_clock();
195 hrtimer_cancel(&sched_clock_timer); 277 hrtimer_cancel(&sched_clock_timer);
196 cd.suspended = true; 278 rd->read_sched_clock = suspended_sched_clock_read;
279
197 return 0; 280 return 0;
198} 281}
199 282
200static void sched_clock_resume(void) 283static void sched_clock_resume(void)
201{ 284{
202 cd.epoch_cyc = read_sched_clock(); 285 struct clock_read_data *rd = &cd.read_data[0];
286
287 rd->epoch_cyc = cd.actual_read_sched_clock();
203 hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); 288 hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
204 cd.suspended = false; 289 rd->read_sched_clock = cd.actual_read_sched_clock;
205} 290}
206 291
207static struct syscore_ops sched_clock_ops = { 292static struct syscore_ops sched_clock_ops = {
208 .suspend = sched_clock_suspend, 293 .suspend = sched_clock_suspend,
209 .resume = sched_clock_resume, 294 .resume = sched_clock_resume,
210}; 295};
211 296
212static int __init sched_clock_syscore_init(void) 297static int __init sched_clock_syscore_init(void)
213{ 298{
214 register_syscore_ops(&sched_clock_ops); 299 register_syscore_ops(&sched_clock_ops);
300
215 return 0; 301 return 0;
216} 302}
217device_initcall(sched_clock_syscore_init); 303device_initcall(sched_clock_syscore_init);
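
The update_clock_read_data() comment above describes a seqcount latch: two copies of the read data plus a counter whose low bit steers readers to whichever copy is not currently being written, so sched_clock() stays consistent even when called from an NMI and never has to block. The single-threaded userspace sketch below only illustrates the protocol; the real code relies on the kernel's raw seqcount primitives and their barriers, which the C11 atomics here merely approximate.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct read_data { uint64_t epoch_ns; uint64_t epoch_cyc; };

static _Atomic unsigned int seq;	/* even in steady state, odd during update */
static struct read_data copies[2];

static void publish(const struct read_data *new)
{
	copies[1] = *new;	/* update the backup (odd) copy first */
	atomic_fetch_add_explicit(&seq, 1, memory_order_release);	/* steer readers to it */
	copies[0] = *new;	/* now update the normal (even) copy */
	atomic_fetch_add_explicit(&seq, 1, memory_order_release);	/* switch readers back */
}

static struct read_data snapshot(void)
{
	struct read_data rd;
	unsigned int s;

	do {
		s = atomic_load_explicit(&seq, memory_order_acquire);
		rd = copies[s & 1];	/* odd seq -> backup copy */
	} while (s != atomic_load_explicit(&seq, memory_order_acquire));
	return rd;
}

int main(void)
{
	struct read_data rd = { .epoch_ns = 1000, .epoch_cyc = 42 };

	publish(&rd);
	rd = snapshot();
	printf("epoch_ns=%llu epoch_cyc=%llu\n",
	       (unsigned long long)rd.epoch_ns, (unsigned long long)rd.epoch_cyc);
	return 0;
}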
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
index eb682d5c697c..6aac4beedbbe 100644
--- a/kernel/time/tick-broadcast-hrtimer.c
+++ b/kernel/time/tick-broadcast-hrtimer.c
@@ -49,6 +49,7 @@ static void bc_set_mode(enum clock_event_mode mode,
49 */ 49 */
50static int bc_set_next(ktime_t expires, struct clock_event_device *bc) 50static int bc_set_next(ktime_t expires, struct clock_event_device *bc)
51{ 51{
52 int bc_moved;
52 /* 53 /*
 53 * We try to cancel the timer first. If the callback is in 54 * We try to cancel the timer first. If the callback is in
54 * flight on some other cpu then we let it handle it. If we 55 * flight on some other cpu then we let it handle it. If we
@@ -60,9 +61,15 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc)
60 * restart the timer because we are in the callback, but we 61 * restart the timer because we are in the callback, but we
61 * can set the expiry time and let the callback return 62 * can set the expiry time and let the callback return
62 * HRTIMER_RESTART. 63 * HRTIMER_RESTART.
64 *
65 * Since we are in the idle loop at this point and because
66 * hrtimer_{start/cancel} functions call into tracing,
67 * calls to these functions must be bound within RCU_NONIDLE.
63 */ 68 */
64 if (hrtimer_try_to_cancel(&bctimer) >= 0) { 69 RCU_NONIDLE(bc_moved = (hrtimer_try_to_cancel(&bctimer) >= 0) ?
65 hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED); 70 !hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED) :
71 0);
72 if (bc_moved) {
66 /* Bind the "device" to the cpu */ 73 /* Bind the "device" to the cpu */
67 bc->bound_on = smp_processor_id(); 74 bc->bound_on = smp_processor_id();
68 } else if (bc->bound_on == smp_processor_id()) { 75 } else if (bc->bound_on == smp_processor_id()) {
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 066f0ec05e48..7e8ca4f448a8 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -33,12 +33,14 @@ static cpumask_var_t tick_broadcast_mask;
33static cpumask_var_t tick_broadcast_on; 33static cpumask_var_t tick_broadcast_on;
34static cpumask_var_t tmpmask; 34static cpumask_var_t tmpmask;
35static DEFINE_RAW_SPINLOCK(tick_broadcast_lock); 35static DEFINE_RAW_SPINLOCK(tick_broadcast_lock);
36static int tick_broadcast_force; 36static int tick_broadcast_forced;
37 37
38#ifdef CONFIG_TICK_ONESHOT 38#ifdef CONFIG_TICK_ONESHOT
39static void tick_broadcast_clear_oneshot(int cpu); 39static void tick_broadcast_clear_oneshot(int cpu);
40static void tick_resume_broadcast_oneshot(struct clock_event_device *bc);
40#else 41#else
41static inline void tick_broadcast_clear_oneshot(int cpu) { } 42static inline void tick_broadcast_clear_oneshot(int cpu) { }
43static inline void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { }
42#endif 44#endif
43 45
44/* 46/*
@@ -303,7 +305,7 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
303 /* 305 /*
304 * The device is in periodic mode. No reprogramming necessary: 306 * The device is in periodic mode. No reprogramming necessary:
305 */ 307 */
306 if (dev->mode == CLOCK_EVT_MODE_PERIODIC) 308 if (dev->state == CLOCK_EVT_STATE_PERIODIC)
307 goto unlock; 309 goto unlock;
308 310
309 /* 311 /*
@@ -324,49 +326,54 @@ unlock:
324 raw_spin_unlock(&tick_broadcast_lock); 326 raw_spin_unlock(&tick_broadcast_lock);
325} 327}
326 328
327/* 329/**
328 * Powerstate information: The system enters/leaves a state, where 330 * tick_broadcast_control - Enable/disable or force broadcast mode
329 * affected devices might stop 331 * @mode: The selected broadcast mode
332 *
333 * Called when the system enters a state where affected tick devices
334 * might stop. Note: TICK_BROADCAST_FORCE cannot be undone.
335 *
336 * Called with interrupts disabled, so clockevents_lock is not
337 * required here because the local clock event device cannot go away
338 * under us.
330 */ 339 */
331static void tick_do_broadcast_on_off(unsigned long *reason) 340void tick_broadcast_control(enum tick_broadcast_mode mode)
332{ 341{
333 struct clock_event_device *bc, *dev; 342 struct clock_event_device *bc, *dev;
334 struct tick_device *td; 343 struct tick_device *td;
335 unsigned long flags;
336 int cpu, bc_stopped; 344 int cpu, bc_stopped;
337 345
338 raw_spin_lock_irqsave(&tick_broadcast_lock, flags); 346 td = this_cpu_ptr(&tick_cpu_device);
339
340 cpu = smp_processor_id();
341 td = &per_cpu(tick_cpu_device, cpu);
342 dev = td->evtdev; 347 dev = td->evtdev;
343 bc = tick_broadcast_device.evtdev;
344 348
345 /* 349 /*
346 * Is the device not affected by the powerstate ? 350 * Is the device not affected by the powerstate ?
347 */ 351 */
348 if (!dev || !(dev->features & CLOCK_EVT_FEAT_C3STOP)) 352 if (!dev || !(dev->features & CLOCK_EVT_FEAT_C3STOP))
349 goto out; 353 return;
350 354
351 if (!tick_device_is_functional(dev)) 355 if (!tick_device_is_functional(dev))
352 goto out; 356 return;
353 357
358 raw_spin_lock(&tick_broadcast_lock);
359 cpu = smp_processor_id();
360 bc = tick_broadcast_device.evtdev;
354 bc_stopped = cpumask_empty(tick_broadcast_mask); 361 bc_stopped = cpumask_empty(tick_broadcast_mask);
355 362
356 switch (*reason) { 363 switch (mode) {
357 case CLOCK_EVT_NOTIFY_BROADCAST_ON: 364 case TICK_BROADCAST_FORCE:
358 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: 365 tick_broadcast_forced = 1;
366 case TICK_BROADCAST_ON:
359 cpumask_set_cpu(cpu, tick_broadcast_on); 367 cpumask_set_cpu(cpu, tick_broadcast_on);
360 if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) { 368 if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) {
361 if (tick_broadcast_device.mode == 369 if (tick_broadcast_device.mode ==
362 TICKDEV_MODE_PERIODIC) 370 TICKDEV_MODE_PERIODIC)
363 clockevents_shutdown(dev); 371 clockevents_shutdown(dev);
364 } 372 }
365 if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE)
366 tick_broadcast_force = 1;
367 break; 373 break;
368 case CLOCK_EVT_NOTIFY_BROADCAST_OFF: 374
369 if (tick_broadcast_force) 375 case TICK_BROADCAST_OFF:
376 if (tick_broadcast_forced)
370 break; 377 break;
371 cpumask_clear_cpu(cpu, tick_broadcast_on); 378 cpumask_clear_cpu(cpu, tick_broadcast_on);
372 if (!tick_device_is_functional(dev)) 379 if (!tick_device_is_functional(dev))
@@ -388,22 +395,9 @@ static void tick_do_broadcast_on_off(unsigned long *reason)
388 else 395 else
389 tick_broadcast_setup_oneshot(bc); 396 tick_broadcast_setup_oneshot(bc);
390 } 397 }
391out: 398 raw_spin_unlock(&tick_broadcast_lock);
392 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
393}
394
395/*
396 * Powerstate information: The system enters/leaves a state, where
397 * affected devices might stop.
398 */
399void tick_broadcast_on_off(unsigned long reason, int *oncpu)
400{
401 if (!cpumask_test_cpu(*oncpu, cpu_online_mask))
402 printk(KERN_ERR "tick-broadcast: ignoring broadcast for "
403 "offline CPU #%d\n", *oncpu);
404 else
405 tick_do_broadcast_on_off(&reason);
406} 399}
400EXPORT_SYMBOL_GPL(tick_broadcast_control);
407 401
408/* 402/*
409 * Set the periodic handler depending on broadcast on/off 403 * Set the periodic handler depending on broadcast on/off
@@ -416,14 +410,14 @@ void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast)
416 dev->event_handler = tick_handle_periodic_broadcast; 410 dev->event_handler = tick_handle_periodic_broadcast;
417} 411}
418 412
413#ifdef CONFIG_HOTPLUG_CPU
419/* 414/*
420 * Remove a CPU from broadcasting 415 * Remove a CPU from broadcasting
421 */ 416 */
422void tick_shutdown_broadcast(unsigned int *cpup) 417void tick_shutdown_broadcast(unsigned int cpu)
423{ 418{
424 struct clock_event_device *bc; 419 struct clock_event_device *bc;
425 unsigned long flags; 420 unsigned long flags;
426 unsigned int cpu = *cpup;
427 421
428 raw_spin_lock_irqsave(&tick_broadcast_lock, flags); 422 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
429 423
@@ -438,6 +432,7 @@ void tick_shutdown_broadcast(unsigned int *cpup)
438 432
439 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); 433 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
440} 434}
435#endif
441 436
442void tick_suspend_broadcast(void) 437void tick_suspend_broadcast(void)
443{ 438{
@@ -453,38 +448,48 @@ void tick_suspend_broadcast(void)
453 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); 448 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
454} 449}
455 450
456int tick_resume_broadcast(void) 451/*
452 * This is called from tick_resume_local() on a resuming CPU. That's
453 * called from the core resume function, tick_unfreeze() and the magic XEN
454 * resume hackery.
455 *
456 * In none of these cases the broadcast device mode can change and the
457 * bit of the resuming CPU in the broadcast mask is safe as well.
458 */
459bool tick_resume_check_broadcast(void)
460{
461 if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT)
462 return false;
463 else
464 return cpumask_test_cpu(smp_processor_id(), tick_broadcast_mask);
465}
466
467void tick_resume_broadcast(void)
457{ 468{
458 struct clock_event_device *bc; 469 struct clock_event_device *bc;
459 unsigned long flags; 470 unsigned long flags;
460 int broadcast = 0;
461 471
462 raw_spin_lock_irqsave(&tick_broadcast_lock, flags); 472 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
463 473
464 bc = tick_broadcast_device.evtdev; 474 bc = tick_broadcast_device.evtdev;
465 475
466 if (bc) { 476 if (bc) {
467 clockevents_set_mode(bc, CLOCK_EVT_MODE_RESUME); 477 clockevents_tick_resume(bc);
468 478
469 switch (tick_broadcast_device.mode) { 479 switch (tick_broadcast_device.mode) {
470 case TICKDEV_MODE_PERIODIC: 480 case TICKDEV_MODE_PERIODIC:
471 if (!cpumask_empty(tick_broadcast_mask)) 481 if (!cpumask_empty(tick_broadcast_mask))
472 tick_broadcast_start_periodic(bc); 482 tick_broadcast_start_periodic(bc);
473 broadcast = cpumask_test_cpu(smp_processor_id(),
474 tick_broadcast_mask);
475 break; 483 break;
476 case TICKDEV_MODE_ONESHOT: 484 case TICKDEV_MODE_ONESHOT:
477 if (!cpumask_empty(tick_broadcast_mask)) 485 if (!cpumask_empty(tick_broadcast_mask))
478 broadcast = tick_resume_broadcast_oneshot(bc); 486 tick_resume_broadcast_oneshot(bc);
479 break; 487 break;
480 } 488 }
481 } 489 }
482 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); 490 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
483
484 return broadcast;
485} 491}
486 492
487
488#ifdef CONFIG_TICK_ONESHOT 493#ifdef CONFIG_TICK_ONESHOT
489 494
490static cpumask_var_t tick_broadcast_oneshot_mask; 495static cpumask_var_t tick_broadcast_oneshot_mask;
@@ -532,8 +537,8 @@ static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu,
532{ 537{
533 int ret; 538 int ret;
534 539
535 if (bc->mode != CLOCK_EVT_MODE_ONESHOT) 540 if (bc->state != CLOCK_EVT_STATE_ONESHOT)
536 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); 541 clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT);
537 542
538 ret = clockevents_program_event(bc, expires, force); 543 ret = clockevents_program_event(bc, expires, force);
539 if (!ret) 544 if (!ret)
@@ -541,10 +546,9 @@ static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu,
541 return ret; 546 return ret;
542} 547}
543 548
544int tick_resume_broadcast_oneshot(struct clock_event_device *bc) 549static void tick_resume_broadcast_oneshot(struct clock_event_device *bc)
545{ 550{
546 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); 551 clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT);
547 return 0;
548} 552}
549 553
550/* 554/*
@@ -562,8 +566,8 @@ void tick_check_oneshot_broadcast_this_cpu(void)
562 * switched over, leave the device alone. 566 * switched over, leave the device alone.
563 */ 567 */
564 if (td->mode == TICKDEV_MODE_ONESHOT) { 568 if (td->mode == TICKDEV_MODE_ONESHOT) {
565 clockevents_set_mode(td->evtdev, 569 clockevents_set_state(td->evtdev,
566 CLOCK_EVT_MODE_ONESHOT); 570 CLOCK_EVT_STATE_ONESHOT);
567 } 571 }
568 } 572 }
569} 573}
@@ -666,31 +670,26 @@ static void broadcast_shutdown_local(struct clock_event_device *bc,
666 if (dev->next_event.tv64 < bc->next_event.tv64) 670 if (dev->next_event.tv64 < bc->next_event.tv64)
667 return; 671 return;
668 } 672 }
669 clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); 673 clockevents_set_state(dev, CLOCK_EVT_STATE_SHUTDOWN);
670} 674}
671 675
672static void broadcast_move_bc(int deadcpu) 676/**
673{ 677 * tick_broadcast_oneshot_control - Enter/exit broadcast oneshot mode
674 struct clock_event_device *bc = tick_broadcast_device.evtdev; 678 * @state: The target state (enter/exit)
675 679 *
676 if (!bc || !broadcast_needs_cpu(bc, deadcpu)) 680 * The system enters/leaves a state, where affected devices might stop
677 return;
678 /* This moves the broadcast assignment to this cpu */
679 clockevents_program_event(bc, bc->next_event, 1);
680}
681
682/*
683 * Powerstate information: The system enters/leaves a state, where
684 * affected devices might stop
685 * Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups. 681 * Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups.
682 *
683 * Called with interrupts disabled, so clockevents_lock is not
684 * required here because the local clock event device cannot go away
685 * under us.
686 */ 686 */
687int tick_broadcast_oneshot_control(unsigned long reason) 687int tick_broadcast_oneshot_control(enum tick_broadcast_state state)
688{ 688{
689 struct clock_event_device *bc, *dev; 689 struct clock_event_device *bc, *dev;
690 struct tick_device *td; 690 struct tick_device *td;
691 unsigned long flags;
692 ktime_t now;
693 int cpu, ret = 0; 691 int cpu, ret = 0;
692 ktime_t now;
694 693
695 /* 694 /*
696 * Periodic mode does not care about the enter/exit of power 695 * Periodic mode does not care about the enter/exit of power
@@ -703,17 +702,17 @@ int tick_broadcast_oneshot_control(unsigned long reason)
 703 * We are called with preemption disabled from the depth of the 702 * We are called with preemption disabled from the depth of the
704 * idle code, so we can't be moved away. 703 * idle code, so we can't be moved away.
705 */ 704 */
706 cpu = smp_processor_id(); 705 td = this_cpu_ptr(&tick_cpu_device);
707 td = &per_cpu(tick_cpu_device, cpu);
708 dev = td->evtdev; 706 dev = td->evtdev;
709 707
710 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) 708 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
711 return 0; 709 return 0;
712 710
711 raw_spin_lock(&tick_broadcast_lock);
713 bc = tick_broadcast_device.evtdev; 712 bc = tick_broadcast_device.evtdev;
713 cpu = smp_processor_id();
714 714
715 raw_spin_lock_irqsave(&tick_broadcast_lock, flags); 715 if (state == TICK_BROADCAST_ENTER) {
716 if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
717 if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { 716 if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) {
718 WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask)); 717 WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask));
719 broadcast_shutdown_local(bc, dev); 718 broadcast_shutdown_local(bc, dev);
@@ -741,7 +740,7 @@ int tick_broadcast_oneshot_control(unsigned long reason)
741 cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); 740 cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);
742 } else { 741 } else {
743 if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) { 742 if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) {
744 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); 743 clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT);
745 /* 744 /*
746 * The cpu which was handling the broadcast 745 * The cpu which was handling the broadcast
747 * timer marked this cpu in the broadcast 746 * timer marked this cpu in the broadcast
@@ -805,9 +804,10 @@ int tick_broadcast_oneshot_control(unsigned long reason)
805 } 804 }
806 } 805 }
807out: 806out:
808 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); 807 raw_spin_unlock(&tick_broadcast_lock);
809 return ret; 808 return ret;
810} 809}
810EXPORT_SYMBOL_GPL(tick_broadcast_oneshot_control);
811 811
812/* 812/*
813 * Reset the one shot broadcast for a cpu 813 * Reset the one shot broadcast for a cpu
@@ -842,7 +842,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
842 842
843 /* Set it up only once ! */ 843 /* Set it up only once ! */
844 if (bc->event_handler != tick_handle_oneshot_broadcast) { 844 if (bc->event_handler != tick_handle_oneshot_broadcast) {
845 int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC; 845 int was_periodic = bc->state == CLOCK_EVT_STATE_PERIODIC;
846 846
847 bc->event_handler = tick_handle_oneshot_broadcast; 847 bc->event_handler = tick_handle_oneshot_broadcast;
848 848
@@ -858,7 +858,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
858 tick_broadcast_oneshot_mask, tmpmask); 858 tick_broadcast_oneshot_mask, tmpmask);
859 859
860 if (was_periodic && !cpumask_empty(tmpmask)) { 860 if (was_periodic && !cpumask_empty(tmpmask)) {
861 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); 861 clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT);
862 tick_broadcast_init_next_event(tmpmask, 862 tick_broadcast_init_next_event(tmpmask,
863 tick_next_period); 863 tick_next_period);
864 tick_broadcast_set_event(bc, cpu, tick_next_period, 1); 864 tick_broadcast_set_event(bc, cpu, tick_next_period, 1);
@@ -894,14 +894,28 @@ void tick_broadcast_switch_to_oneshot(void)
894 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); 894 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
895} 895}
896 896
897#ifdef CONFIG_HOTPLUG_CPU
898void hotplug_cpu__broadcast_tick_pull(int deadcpu)
899{
900 struct clock_event_device *bc;
901 unsigned long flags;
902
903 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
904 bc = tick_broadcast_device.evtdev;
905
906 if (bc && broadcast_needs_cpu(bc, deadcpu)) {
907 /* This moves the broadcast assignment to this CPU: */
908 clockevents_program_event(bc, bc->next_event, 1);
909 }
910 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
911}
897 912
898/* 913/*
899 * Remove a dead CPU from broadcasting 914 * Remove a dead CPU from broadcasting
900 */ 915 */
901void tick_shutdown_broadcast_oneshot(unsigned int *cpup) 916void tick_shutdown_broadcast_oneshot(unsigned int cpu)
902{ 917{
903 unsigned long flags; 918 unsigned long flags;
904 unsigned int cpu = *cpup;
905 919
906 raw_spin_lock_irqsave(&tick_broadcast_lock, flags); 920 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
907 921
@@ -913,10 +927,9 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
913 cpumask_clear_cpu(cpu, tick_broadcast_pending_mask); 927 cpumask_clear_cpu(cpu, tick_broadcast_pending_mask);
914 cpumask_clear_cpu(cpu, tick_broadcast_force_mask); 928 cpumask_clear_cpu(cpu, tick_broadcast_force_mask);
915 929
916 broadcast_move_bc(cpu);
917
918 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); 930 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
919} 931}
932#endif
920 933
921/* 934/*
922 * Check, whether the broadcast device is in one shot mode 935 * Check, whether the broadcast device is in one shot mode
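
tick_broadcast_control() above replaces the old reason-code interface with an explicit enum, and its switch encodes one subtle rule: TICK_BROADCAST_FORCE latches a flag so that a later TICK_BROADCAST_OFF request is ignored. A small standalone sketch of just that mode logic (local names, not the kernel API):

#include <stdbool.h>
#include <stdio.h>

enum bcast_mode { BCAST_OFF, BCAST_ON, BCAST_FORCE };

static bool forced;
static bool broadcasting;

static void bcast_control(enum bcast_mode mode)
{
	switch (mode) {
	case BCAST_FORCE:
		forced = true;
		/* fall through: FORCE also turns broadcasting on */
	case BCAST_ON:
		broadcasting = true;
		break;
	case BCAST_OFF:
		if (forced)	/* FORCE cannot be undone */
			break;
		broadcasting = false;
		break;
	}
}

int main(void)
{
	bcast_control(BCAST_ON);
	bcast_control(BCAST_OFF);
	printf("after ON/OFF:    broadcasting=%d\n", broadcasting);	/* 0 */

	bcast_control(BCAST_FORCE);
	bcast_control(BCAST_OFF);
	printf("after FORCE/OFF: broadcasting=%d\n", broadcasting);	/* still 1 */
	return 0;
}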
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index f7c515595b42..3ae6afa1eb98 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -102,7 +102,7 @@ void tick_handle_periodic(struct clock_event_device *dev)
102 102
103 tick_periodic(cpu); 103 tick_periodic(cpu);
104 104
105 if (dev->mode != CLOCK_EVT_MODE_ONESHOT) 105 if (dev->state != CLOCK_EVT_STATE_ONESHOT)
106 return; 106 return;
107 for (;;) { 107 for (;;) {
108 /* 108 /*
@@ -140,7 +140,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
140 140
141 if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) && 141 if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) &&
142 !tick_broadcast_oneshot_active()) { 142 !tick_broadcast_oneshot_active()) {
143 clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC); 143 clockevents_set_state(dev, CLOCK_EVT_STATE_PERIODIC);
144 } else { 144 } else {
145 unsigned long seq; 145 unsigned long seq;
146 ktime_t next; 146 ktime_t next;
@@ -150,7 +150,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
150 next = tick_next_period; 150 next = tick_next_period;
151 } while (read_seqretry(&jiffies_lock, seq)); 151 } while (read_seqretry(&jiffies_lock, seq));
152 152
153 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); 153 clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT);
154 154
155 for (;;) { 155 for (;;) {
156 if (!clockevents_program_event(dev, next, false)) 156 if (!clockevents_program_event(dev, next, false))
@@ -332,14 +332,16 @@ out_bc:
332 tick_install_broadcast_device(newdev); 332 tick_install_broadcast_device(newdev);
333} 333}
334 334
335#ifdef CONFIG_HOTPLUG_CPU
335/* 336/*
336 * Transfer the do_timer job away from a dying cpu. 337 * Transfer the do_timer job away from a dying cpu.
337 * 338 *
 338 * Called with interrupts disabled. 339 * Called with interrupts disabled. No locking required. If
340 * tick_do_timer_cpu is owned by this cpu, nothing can change it.
339 */ 341 */
340void tick_handover_do_timer(int *cpup) 342void tick_handover_do_timer(void)
341{ 343{
342 if (*cpup == tick_do_timer_cpu) { 344 if (tick_do_timer_cpu == smp_processor_id()) {
343 int cpu = cpumask_first(cpu_online_mask); 345 int cpu = cpumask_first(cpu_online_mask);
344 346
345 tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu : 347 tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu :
@@ -354,9 +356,9 @@ void tick_handover_do_timer(int *cpup)
354 * access the hardware device itself. 356 * access the hardware device itself.
355 * We just set the mode and remove it from the lists. 357 * We just set the mode and remove it from the lists.
356 */ 358 */
357void tick_shutdown(unsigned int *cpup) 359void tick_shutdown(unsigned int cpu)
358{ 360{
359 struct tick_device *td = &per_cpu(tick_cpu_device, *cpup); 361 struct tick_device *td = &per_cpu(tick_cpu_device, cpu);
360 struct clock_event_device *dev = td->evtdev; 362 struct clock_event_device *dev = td->evtdev;
361 363
362 td->mode = TICKDEV_MODE_PERIODIC; 364 td->mode = TICKDEV_MODE_PERIODIC;
@@ -365,27 +367,42 @@ void tick_shutdown(unsigned int *cpup)
365 * Prevent that the clock events layer tries to call 367 * Prevent that the clock events layer tries to call
366 * the set mode function! 368 * the set mode function!
367 */ 369 */
370 dev->state = CLOCK_EVT_STATE_DETACHED;
368 dev->mode = CLOCK_EVT_MODE_UNUSED; 371 dev->mode = CLOCK_EVT_MODE_UNUSED;
369 clockevents_exchange_device(dev, NULL); 372 clockevents_exchange_device(dev, NULL);
370 dev->event_handler = clockevents_handle_noop; 373 dev->event_handler = clockevents_handle_noop;
371 td->evtdev = NULL; 374 td->evtdev = NULL;
372 } 375 }
373} 376}
377#endif
374 378
375void tick_suspend(void) 379/**
380 * tick_suspend_local - Suspend the local tick device
381 *
382 * Called from the local cpu for freeze with interrupts disabled.
383 *
384 * No locks required. Nothing can change the per cpu device.
385 */
386void tick_suspend_local(void)
376{ 387{
377 struct tick_device *td = this_cpu_ptr(&tick_cpu_device); 388 struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
378 389
379 clockevents_shutdown(td->evtdev); 390 clockevents_shutdown(td->evtdev);
380} 391}
381 392
382void tick_resume(void) 393/**
394 * tick_resume_local - Resume the local tick device
395 *
396 * Called from the local CPU for unfreeze or XEN resume magic.
397 *
398 * No locks required. Nothing can change the per cpu device.
399 */
400void tick_resume_local(void)
383{ 401{
384 struct tick_device *td = this_cpu_ptr(&tick_cpu_device); 402 struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
385 int broadcast = tick_resume_broadcast(); 403 bool broadcast = tick_resume_check_broadcast();
386
387 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME);
388 404
405 clockevents_tick_resume(td->evtdev);
389 if (!broadcast) { 406 if (!broadcast) {
390 if (td->mode == TICKDEV_MODE_PERIODIC) 407 if (td->mode == TICKDEV_MODE_PERIODIC)
391 tick_setup_periodic(td->evtdev, 0); 408 tick_setup_periodic(td->evtdev, 0);
@@ -394,6 +411,35 @@ void tick_resume(void)
394 } 411 }
395} 412}
396 413
414/**
415 * tick_suspend - Suspend the tick and the broadcast device
416 *
417 * Called from syscore_suspend() via timekeeping_suspend with only one
418 * CPU online and interrupts disabled or from tick_unfreeze() under
419 * tick_freeze_lock.
420 *
421 * No locks required. Nothing can change the per cpu device.
422 */
423void tick_suspend(void)
424{
425 tick_suspend_local();
426 tick_suspend_broadcast();
427}
428
429/**
430 * tick_resume - Resume the tick and the broadcast device
431 *
432 * Called from syscore_resume() via timekeeping_resume with only one
433 * CPU online and interrupts disabled.
434 *
435 * No locks required. Nothing can change the per cpu device.
436 */
437void tick_resume(void)
438{
439 tick_resume_broadcast();
440 tick_resume_local();
441}
442
397static DEFINE_RAW_SPINLOCK(tick_freeze_lock); 443static DEFINE_RAW_SPINLOCK(tick_freeze_lock);
398static unsigned int tick_freeze_depth; 444static unsigned int tick_freeze_depth;
399 445
@@ -411,12 +457,10 @@ void tick_freeze(void)
411 raw_spin_lock(&tick_freeze_lock); 457 raw_spin_lock(&tick_freeze_lock);
412 458
413 tick_freeze_depth++; 459 tick_freeze_depth++;
414 if (tick_freeze_depth == num_online_cpus()) { 460 if (tick_freeze_depth == num_online_cpus())
415 timekeeping_suspend(); 461 timekeeping_suspend();
416 } else { 462 else
417 tick_suspend(); 463 tick_suspend_local();
418 tick_suspend_broadcast();
419 }
420 464
421 raw_spin_unlock(&tick_freeze_lock); 465 raw_spin_unlock(&tick_freeze_lock);
422} 466}
@@ -437,7 +481,7 @@ void tick_unfreeze(void)
437 if (tick_freeze_depth == num_online_cpus()) 481 if (tick_freeze_depth == num_online_cpus())
438 timekeeping_resume(); 482 timekeeping_resume();
439 else 483 else
440 tick_resume(); 484 tick_resume_local();
441 485
442 tick_freeze_depth--; 486 tick_freeze_depth--;
443 487
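
tick_freeze()/tick_unfreeze() above use a depth counter so that only the last CPU to freeze suspends timekeeping while every other CPU merely suspends its local tick, and only the first CPU to unfreeze resumes timekeeping. A standalone sketch of that bookkeeping, with a fixed CPU count standing in for num_online_cpus() and the lock elided:

#include <stdio.h>

#define NR_CPUS 4

static unsigned int freeze_depth;

static void freeze_one_cpu(int cpu)
{
	/* tick_freeze_lock is held here in the real code */
	freeze_depth++;
	if (freeze_depth == NR_CPUS)
		printf("cpu%d: last to freeze -> suspend timekeeping\n", cpu);
	else
		printf("cpu%d: suspend local tick only\n", cpu);
}

static void unfreeze_one_cpu(int cpu)
{
	if (freeze_depth == NR_CPUS)
		printf("cpu%d: first to unfreeze -> resume timekeeping\n", cpu);
	else
		printf("cpu%d: resume local tick only\n", cpu);
	freeze_depth--;
}

int main(void)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		freeze_one_cpu(cpu);
	for (int cpu = NR_CPUS - 1; cpu >= 0; cpu--)
		unfreeze_one_cpu(cpu);
	return 0;
}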
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 366aeb4f2c66..b64fdd8054c5 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -5,15 +5,12 @@
5#include <linux/tick.h> 5#include <linux/tick.h>
6 6
7#include "timekeeping.h" 7#include "timekeeping.h"
8#include "tick-sched.h"
8 9
9extern seqlock_t jiffies_lock; 10#ifdef CONFIG_GENERIC_CLOCKEVENTS
10 11
11#define CS_NAME_LEN 32 12# define TICK_DO_TIMER_NONE -1
12 13# define TICK_DO_TIMER_BOOT -2
13#ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD
14
15#define TICK_DO_TIMER_NONE -1
16#define TICK_DO_TIMER_BOOT -2
17 14
18DECLARE_PER_CPU(struct tick_device, tick_cpu_device); 15DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
19extern ktime_t tick_next_period; 16extern ktime_t tick_next_period;
@@ -23,21 +20,72 @@ extern int tick_do_timer_cpu __read_mostly;
23extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); 20extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
24extern void tick_handle_periodic(struct clock_event_device *dev); 21extern void tick_handle_periodic(struct clock_event_device *dev);
25extern void tick_check_new_device(struct clock_event_device *dev); 22extern void tick_check_new_device(struct clock_event_device *dev);
26extern void tick_handover_do_timer(int *cpup); 23extern void tick_shutdown(unsigned int cpu);
27extern void tick_shutdown(unsigned int *cpup);
28extern void tick_suspend(void); 24extern void tick_suspend(void);
29extern void tick_resume(void); 25extern void tick_resume(void);
30extern bool tick_check_replacement(struct clock_event_device *curdev, 26extern bool tick_check_replacement(struct clock_event_device *curdev,
31 struct clock_event_device *newdev); 27 struct clock_event_device *newdev);
32extern void tick_install_replacement(struct clock_event_device *dev); 28extern void tick_install_replacement(struct clock_event_device *dev);
29extern int tick_is_oneshot_available(void);
30extern struct tick_device *tick_get_device(int cpu);
33 31
34extern void clockevents_shutdown(struct clock_event_device *dev); 32extern int clockevents_tick_resume(struct clock_event_device *dev);
33/* Check, if the device is functional or a dummy for broadcast */
34static inline int tick_device_is_functional(struct clock_event_device *dev)
35{
36 return !(dev->features & CLOCK_EVT_FEAT_DUMMY);
37}
35 38
39extern void clockevents_shutdown(struct clock_event_device *dev);
40extern void clockevents_exchange_device(struct clock_event_device *old,
41 struct clock_event_device *new);
42extern void clockevents_set_state(struct clock_event_device *dev,
43 enum clock_event_state state);
44extern int clockevents_program_event(struct clock_event_device *dev,
45 ktime_t expires, bool force);
46extern void clockevents_handle_noop(struct clock_event_device *dev);
47extern int __clockevents_update_freq(struct clock_event_device *dev, u32 freq);
36extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); 48extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt);
37 49
38/* 50/* Broadcasting support */
39 * NO_HZ / high resolution timer shared code 51# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
40 */ 52extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu);
53extern void tick_install_broadcast_device(struct clock_event_device *dev);
54extern int tick_is_broadcast_device(struct clock_event_device *dev);
55extern void tick_shutdown_broadcast(unsigned int cpu);
56extern void tick_suspend_broadcast(void);
57extern void tick_resume_broadcast(void);
58extern bool tick_resume_check_broadcast(void);
59extern void tick_broadcast_init(void);
60extern void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast);
61extern int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq);
62extern struct tick_device *tick_get_broadcast_device(void);
63extern struct cpumask *tick_get_broadcast_mask(void);
64# else /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST: */
65static inline void tick_install_broadcast_device(struct clock_event_device *dev) { }
66static inline int tick_is_broadcast_device(struct clock_event_device *dev) { return 0; }
67static inline int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) { return 0; }
68static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { }
69static inline void tick_shutdown_broadcast(unsigned int cpu) { }
70static inline void tick_suspend_broadcast(void) { }
71static inline void tick_resume_broadcast(void) { }
72static inline bool tick_resume_check_broadcast(void) { return false; }
73static inline void tick_broadcast_init(void) { }
74static inline int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq) { return -ENODEV; }
75
76/* Set the periodic handler in non broadcast mode */
77static inline void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast)
78{
79 dev->event_handler = tick_handle_periodic;
80}
81# endif /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST */
82
83#else /* !GENERIC_CLOCKEVENTS: */
84static inline void tick_suspend(void) { }
85static inline void tick_resume(void) { }
86#endif /* !GENERIC_CLOCKEVENTS */
87
88/* Oneshot related functions */
41#ifdef CONFIG_TICK_ONESHOT 89#ifdef CONFIG_TICK_ONESHOT
42extern void tick_setup_oneshot(struct clock_event_device *newdev, 90extern void tick_setup_oneshot(struct clock_event_device *newdev,
43 void (*handler)(struct clock_event_device *), 91 void (*handler)(struct clock_event_device *),
@@ -46,58 +94,42 @@ extern int tick_program_event(ktime_t expires, int force);
46extern void tick_oneshot_notify(void); 94extern void tick_oneshot_notify(void);
47extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); 95extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *));
48extern void tick_resume_oneshot(void); 96extern void tick_resume_oneshot(void);
49# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST 97static inline bool tick_oneshot_possible(void) { return true; }
98extern int tick_oneshot_mode_active(void);
99extern void tick_clock_notify(void);
100extern int tick_check_oneshot_change(int allow_nohz);
101extern int tick_init_highres(void);
102#else /* !CONFIG_TICK_ONESHOT: */
103static inline
104void tick_setup_oneshot(struct clock_event_device *newdev,
105 void (*handler)(struct clock_event_device *),
106 ktime_t nextevt) { BUG(); }
107static inline void tick_resume_oneshot(void) { BUG(); }
108static inline int tick_program_event(ktime_t expires, int force) { return 0; }
109static inline void tick_oneshot_notify(void) { }
110static inline bool tick_oneshot_possible(void) { return false; }
111static inline int tick_oneshot_mode_active(void) { return 0; }
112static inline void tick_clock_notify(void) { }
113static inline int tick_check_oneshot_change(int allow_nohz) { return 0; }
114#endif /* !CONFIG_TICK_ONESHOT */
115
116/* Functions related to oneshot broadcasting */
117#if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT)
50extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc); 118extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc);
51extern int tick_broadcast_oneshot_control(unsigned long reason);
52extern void tick_broadcast_switch_to_oneshot(void); 119extern void tick_broadcast_switch_to_oneshot(void);
53extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); 120extern void tick_shutdown_broadcast_oneshot(unsigned int cpu);
54extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc);
55extern int tick_broadcast_oneshot_active(void); 121extern int tick_broadcast_oneshot_active(void);
56extern void tick_check_oneshot_broadcast_this_cpu(void); 122extern void tick_check_oneshot_broadcast_this_cpu(void);
57bool tick_broadcast_oneshot_available(void); 123bool tick_broadcast_oneshot_available(void);
58# else /* BROADCAST */ 124extern struct cpumask *tick_get_broadcast_oneshot_mask(void);
59static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) 125#else /* !(BROADCAST && ONESHOT): */
60{ 126static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); }
61 BUG();
62}
63static inline int tick_broadcast_oneshot_control(unsigned long reason) { return 0; }
64static inline void tick_broadcast_switch_to_oneshot(void) { } 127static inline void tick_broadcast_switch_to_oneshot(void) { }
65static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } 128static inline void tick_shutdown_broadcast_oneshot(unsigned int cpu) { }
66static inline int tick_broadcast_oneshot_active(void) { return 0; } 129static inline int tick_broadcast_oneshot_active(void) { return 0; }
67static inline void tick_check_oneshot_broadcast_this_cpu(void) { } 130static inline void tick_check_oneshot_broadcast_this_cpu(void) { }
68static inline bool tick_broadcast_oneshot_available(void) { return true; } 131static inline bool tick_broadcast_oneshot_available(void) { return tick_oneshot_possible(); }
69# endif /* !BROADCAST */ 132#endif /* !(BROADCAST && ONESHOT) */
70
71#else /* !ONESHOT */
72static inline
73void tick_setup_oneshot(struct clock_event_device *newdev,
74 void (*handler)(struct clock_event_device *),
75 ktime_t nextevt)
76{
77 BUG();
78}
79static inline void tick_resume_oneshot(void)
80{
81 BUG();
82}
83static inline int tick_program_event(ktime_t expires, int force)
84{
85 return 0;
86}
87static inline void tick_oneshot_notify(void) { }
88static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
89{
90 BUG();
91}
92static inline int tick_broadcast_oneshot_control(unsigned long reason) { return 0; }
93static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
94static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
95{
96 return 0;
97}
98static inline int tick_broadcast_oneshot_active(void) { return 0; }
99static inline bool tick_broadcast_oneshot_available(void) { return false; }
100#endif /* !TICK_ONESHOT */
101 133
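The net effect of the #ifdef reshuffling above is easier to see as an outline of the resulting tick-internal.h layout than through the side-by-side columns; the sketch below only restates what the new right-hand side declares:

/*
 * Resulting layout (outline only):
 *
 *	#ifdef CONFIG_TICK_ONESHOT
 *		oneshot prototypes; tick_oneshot_possible() returns true
 *	#else					(!CONFIG_TICK_ONESHOT)
 *		BUG()/empty stubs; tick_oneshot_possible() returns false
 *	#endif
 *
 *	#if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT)
 *		oneshot-broadcast prototypes, tick_get_broadcast_oneshot_mask()
 *	#else					(!(BROADCAST && ONESHOT))
 *		stubs; tick_broadcast_oneshot_available() falls back to
 *		tick_oneshot_possible()
 *	#endif
 */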
102/* NO_HZ_FULL internal */ 134/* NO_HZ_FULL internal */
103#ifdef CONFIG_NO_HZ_FULL 135#ifdef CONFIG_NO_HZ_FULL
@@ -105,68 +137,3 @@ extern void tick_nohz_init(void);
105# else 137# else
106static inline void tick_nohz_init(void) { } 138static inline void tick_nohz_init(void) { }
107#endif 139#endif
108
109/*
110 * Broadcasting support
111 */
112#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
113extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu);
114extern void tick_install_broadcast_device(struct clock_event_device *dev);
115extern int tick_is_broadcast_device(struct clock_event_device *dev);
116extern void tick_broadcast_on_off(unsigned long reason, int *oncpu);
117extern void tick_shutdown_broadcast(unsigned int *cpup);
118extern void tick_suspend_broadcast(void);
119extern int tick_resume_broadcast(void);
120extern void tick_broadcast_init(void);
121extern void
122tick_set_periodic_handler(struct clock_event_device *dev, int broadcast);
123int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq);
124
125#else /* !BROADCAST */
126
127static inline void tick_install_broadcast_device(struct clock_event_device *dev)
128{
129}
130
131static inline int tick_is_broadcast_device(struct clock_event_device *dev)
132{
133 return 0;
134}
135static inline int tick_device_uses_broadcast(struct clock_event_device *dev,
136 int cpu)
137{
138 return 0;
139}
140static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { }
141static inline void tick_broadcast_on_off(unsigned long reason, int *oncpu) { }
142static inline void tick_shutdown_broadcast(unsigned int *cpup) { }
143static inline void tick_suspend_broadcast(void) { }
144static inline int tick_resume_broadcast(void) { return 0; }
145static inline void tick_broadcast_init(void) { }
146static inline int tick_broadcast_update_freq(struct clock_event_device *dev,
147 u32 freq) { return -ENODEV; }
148
149/*
150 * Set the periodic handler in non broadcast mode
151 */
152static inline void tick_set_periodic_handler(struct clock_event_device *dev,
153 int broadcast)
154{
155 dev->event_handler = tick_handle_periodic;
156}
157#endif /* !BROADCAST */
158
159/*
160 * Check, if the device is functional or a dummy for broadcast
161 */
162static inline int tick_device_is_functional(struct clock_event_device *dev)
163{
164 return !(dev->features & CLOCK_EVT_FEAT_DUMMY);
165}
166
167int __clockevents_update_freq(struct clock_event_device *dev, u32 freq);
168
169#endif
170
171extern void do_timer(unsigned long ticks);
172extern void update_wall_time(void);
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 7ce740e78e1b..67a64b1670bf 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -38,7 +38,7 @@ void tick_resume_oneshot(void)
38{ 38{
39 struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); 39 struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
40 40
41 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); 41 clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT);
42 clockevents_program_event(dev, ktime_get(), true); 42 clockevents_program_event(dev, ktime_get(), true);
43} 43}
44 44
@@ -50,7 +50,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev,
50 ktime_t next_event) 50 ktime_t next_event)
51{ 51{
52 newdev->event_handler = handler; 52 newdev->event_handler = handler;
53 clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT); 53 clockevents_set_state(newdev, CLOCK_EVT_STATE_ONESHOT);
54 clockevents_program_event(newdev, next_event, true); 54 clockevents_program_event(newdev, next_event, true);
55} 55}
56 56
@@ -81,7 +81,7 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
81 81
82 td->mode = TICKDEV_MODE_ONESHOT; 82 td->mode = TICKDEV_MODE_ONESHOT;
83 dev->event_handler = handler; 83 dev->event_handler = handler;
84 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); 84 clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT);
85 tick_broadcast_switch_to_oneshot(); 85 tick_broadcast_switch_to_oneshot();
86 return 0; 86 return 0;
87} 87}
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index a4c4edac4528..914259128145 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -34,7 +34,7 @@
34/* 34/*
35 * Per cpu nohz control structure 35 * Per cpu nohz control structure
36 */ 36 */
37DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); 37static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
38 38
39/* 39/*
40 * The time, when the last jiffy update happened. Protected by jiffies_lock. 40 * The time, when the last jiffy update happened. Protected by jiffies_lock.
@@ -416,6 +416,11 @@ static int __init setup_tick_nohz(char *str)
416 416
417__setup("nohz=", setup_tick_nohz); 417__setup("nohz=", setup_tick_nohz);
418 418
419int tick_nohz_tick_stopped(void)
420{
421 return __this_cpu_read(tick_cpu_sched.tick_stopped);
422}
423
419/** 424/**
420 * tick_nohz_update_jiffies - update jiffies when idle was interrupted 425 * tick_nohz_update_jiffies - update jiffies when idle was interrupted
421 * 426 *
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
new file mode 100644
index 000000000000..28b5da3e1a17
--- /dev/null
+++ b/kernel/time/tick-sched.h
@@ -0,0 +1,74 @@
1#ifndef _TICK_SCHED_H
2#define _TICK_SCHED_H
3
4#include <linux/hrtimer.h>
5
6enum tick_device_mode {
7 TICKDEV_MODE_PERIODIC,
8 TICKDEV_MODE_ONESHOT,
9};
10
11struct tick_device {
12 struct clock_event_device *evtdev;
13 enum tick_device_mode mode;
14};
15
16enum tick_nohz_mode {
17 NOHZ_MODE_INACTIVE,
18 NOHZ_MODE_LOWRES,
19 NOHZ_MODE_HIGHRES,
20};
21
22/**
23 * struct tick_sched - sched tick emulation and no idle tick control/stats
24 * @sched_timer: hrtimer to schedule the periodic tick in high
25 * resolution mode
26 * @last_tick: Store the last tick expiry time when the tick
27 * timer is modified for nohz sleeps. This is necessary
28 * to resume the tick timer operation in the timeline
29 * when the CPU returns from nohz sleep.
30 * @tick_stopped: Indicator that the idle tick has been stopped
31 * @idle_jiffies: jiffies at the entry to idle for idle time accounting
32 * @idle_calls: Total number of idle calls
33 * @idle_sleeps: Number of idle calls, where the sched tick was stopped
34 * @idle_entrytime: Time when the idle call was entered
35 * @idle_waketime: Time when the idle was interrupted
36 * @idle_exittime: Time when the idle state was left
37 * @idle_sleeptime: Sum of the time slept in idle with sched tick stopped
38 * @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding
39 * @sleep_length: Duration of the current idle sleep
40 * @do_timer_last:	CPU was the last one doing do_timer before going idle
41 */
42struct tick_sched {
43 struct hrtimer sched_timer;
44 unsigned long check_clocks;
45 enum tick_nohz_mode nohz_mode;
46 ktime_t last_tick;
47 int inidle;
48 int tick_stopped;
49 unsigned long idle_jiffies;
50 unsigned long idle_calls;
51 unsigned long idle_sleeps;
52 int idle_active;
53 ktime_t idle_entrytime;
54 ktime_t idle_waketime;
55 ktime_t idle_exittime;
56 ktime_t idle_sleeptime;
57 ktime_t iowait_sleeptime;
58 ktime_t sleep_length;
59 unsigned long last_jiffies;
60 unsigned long next_jiffies;
61 ktime_t idle_expires;
62 int do_timer_last;
63};
64
65extern struct tick_sched *tick_get_tick_sched(int cpu);
66
67extern void tick_setup_sched_timer(void);
68#if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS
69extern void tick_cancel_sched_timer(int cpu);
70#else
71static inline void tick_cancel_sched_timer(int cpu) { }
72#endif
73
74#endif
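With tick_cpu_sched now static to tick-sched.c, code outside that file reaches the per-cpu state through tick_get_tick_sched() and the new tick_nohz_tick_stopped() helper instead of touching the structure directly. A minimal sketch of such a user, assuming only the declarations in this header (the helper itself is invented for illustration):

#include <linux/kernel.h>
#include "tick-sched.h"

/* Illustrative only: dump one CPU's idle-tick statistics. */
static void print_cpu_idle_stats(int cpu)
{
	struct tick_sched *ts = tick_get_tick_sched(cpu);

	pr_info("cpu%d: idle_calls=%lu idle_sleeps=%lu tick_stopped=%d\n",
		cpu, ts->idle_calls, ts->idle_sleeps, ts->tick_stopped);
}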
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 91db94136c10..946acb72179f 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -59,17 +59,15 @@ struct tk_fast {
59}; 59};
60 60
61static struct tk_fast tk_fast_mono ____cacheline_aligned; 61static struct tk_fast tk_fast_mono ____cacheline_aligned;
62static struct tk_fast tk_fast_raw ____cacheline_aligned;
62 63
63/* flag for if timekeeping is suspended */ 64/* flag for if timekeeping is suspended */
64int __read_mostly timekeeping_suspended; 65int __read_mostly timekeeping_suspended;
65 66
66/* Flag for if there is a persistent clock on this platform */
67bool __read_mostly persistent_clock_exist = false;
68
69static inline void tk_normalize_xtime(struct timekeeper *tk) 67static inline void tk_normalize_xtime(struct timekeeper *tk)
70{ 68{
71 while (tk->tkr.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr.shift)) { 69 while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) {
72 tk->tkr.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr.shift; 70 tk->tkr_mono.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_mono.shift;
73 tk->xtime_sec++; 71 tk->xtime_sec++;
74 } 72 }
75} 73}
@@ -79,20 +77,20 @@ static inline struct timespec64 tk_xtime(struct timekeeper *tk)
79 struct timespec64 ts; 77 struct timespec64 ts;
80 78
81 ts.tv_sec = tk->xtime_sec; 79 ts.tv_sec = tk->xtime_sec;
82 ts.tv_nsec = (long)(tk->tkr.xtime_nsec >> tk->tkr.shift); 80 ts.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
83 return ts; 81 return ts;
84} 82}
85 83
86static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts) 84static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts)
87{ 85{
88 tk->xtime_sec = ts->tv_sec; 86 tk->xtime_sec = ts->tv_sec;
89 tk->tkr.xtime_nsec = (u64)ts->tv_nsec << tk->tkr.shift; 87 tk->tkr_mono.xtime_nsec = (u64)ts->tv_nsec << tk->tkr_mono.shift;
90} 88}
91 89
92static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts) 90static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts)
93{ 91{
94 tk->xtime_sec += ts->tv_sec; 92 tk->xtime_sec += ts->tv_sec;
95 tk->tkr.xtime_nsec += (u64)ts->tv_nsec << tk->tkr.shift; 93 tk->tkr_mono.xtime_nsec += (u64)ts->tv_nsec << tk->tkr_mono.shift;
96 tk_normalize_xtime(tk); 94 tk_normalize_xtime(tk);
97} 95}
98 96
@@ -118,6 +116,117 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
118 tk->offs_boot = ktime_add(tk->offs_boot, delta); 116 tk->offs_boot = ktime_add(tk->offs_boot, delta);
119} 117}
120 118
119#ifdef CONFIG_DEBUG_TIMEKEEPING
120#define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */
121/*
122 * These simple flag variables are managed
123 * without locks, which is racy, but ok since
124 * we don't really care about being super
125 * precise about how many events were seen,
126 * just that a problem was observed.
127 */
128static int timekeeping_underflow_seen;
129static int timekeeping_overflow_seen;
130
131/* last_warning is only modified under the timekeeping lock */
132static long timekeeping_last_warning;
133
134static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset)
135{
136
137 cycle_t max_cycles = tk->tkr_mono.clock->max_cycles;
138 const char *name = tk->tkr_mono.clock->name;
139
140 if (offset > max_cycles) {
141 printk_deferred("WARNING: timekeeping: Cycle offset (%lld) is larger than allowed by the '%s' clock's max_cycles value (%lld): time overflow danger\n",
142 offset, name, max_cycles);
143 printk_deferred(" timekeeping: Your kernel is sick, but tries to cope by capping time updates\n");
144 } else {
145 if (offset > (max_cycles >> 1)) {
146 			printk_deferred("INFO: timekeeping: Cycle offset (%lld) is larger than the '%s' clock's 50%% safety margin (%lld)\n",
147 offset, name, max_cycles >> 1);
148 printk_deferred(" timekeeping: Your kernel is still fine, but is feeling a bit nervous\n");
149 }
150 }
151
152 if (timekeeping_underflow_seen) {
153 if (jiffies - timekeeping_last_warning > WARNING_FREQ) {
154 printk_deferred("WARNING: Underflow in clocksource '%s' observed, time update ignored.\n", name);
155 printk_deferred(" Please report this, consider using a different clocksource, if possible.\n");
156 printk_deferred(" Your kernel is probably still fine.\n");
157 timekeeping_last_warning = jiffies;
158 }
159 timekeeping_underflow_seen = 0;
160 }
161
162 if (timekeeping_overflow_seen) {
163 if (jiffies - timekeeping_last_warning > WARNING_FREQ) {
164 printk_deferred("WARNING: Overflow in clocksource '%s' observed, time update capped.\n", name);
165 printk_deferred(" Please report this, consider using a different clocksource, if possible.\n");
166 printk_deferred(" Your kernel is probably still fine.\n");
167 timekeeping_last_warning = jiffies;
168 }
169 timekeeping_overflow_seen = 0;
170 }
171}
172
173static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr)
174{
175 cycle_t now, last, mask, max, delta;
176 unsigned int seq;
177
178 /*
179 * Since we're called holding a seqlock, the data may shift
180 * under us while we're doing the calculation. This can cause
181 * false positives, since we'd note a problem but throw the
182 * results away. So nest another seqlock here to atomically
183 * grab the points we are checking with.
184 */
185 do {
186 seq = read_seqcount_begin(&tk_core.seq);
187 now = tkr->read(tkr->clock);
188 last = tkr->cycle_last;
189 mask = tkr->mask;
190 max = tkr->clock->max_cycles;
191 } while (read_seqcount_retry(&tk_core.seq, seq));
192
193 delta = clocksource_delta(now, last, mask);
194
195 /*
196 * Try to catch underflows by checking if we are seeing small
197 * mask-relative negative values.
198 */
199 if (unlikely((~delta & mask) < (mask >> 3))) {
200 timekeeping_underflow_seen = 1;
201 delta = 0;
202 }
203
204 /* Cap delta value to the max_cycles values to avoid mult overflows */
205 if (unlikely(delta > max)) {
206 timekeeping_overflow_seen = 1;
207 delta = tkr->clock->max_cycles;
208 }
209
210 return delta;
211}
212#else
213static inline void timekeeping_check_update(struct timekeeper *tk, cycle_t offset)
214{
215}
216static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr)
217{
218 cycle_t cycle_now, delta;
219
220 /* read clocksource */
221 cycle_now = tkr->read(tkr->clock);
222
223 /* calculate the delta since the last update_wall_time */
224 delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask);
225
226 return delta;
227}
228#endif
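The underflow test above reads: if delta sits within mask>>3 of the wrap point, treat it as a small negative step seen through the counter mask rather than a huge forward jump. A worked toy example with invented numbers and a 16-bit counter for brevity:

/*
 *	mask  = 0xffff
 *	last  = 0x000a, now = 0x0005	(counter appears to run backwards)
 *	delta = (now - last) & mask	= 0xfffb
 *	~delta & mask			= 0x0004  <  mask >> 3 (0x1fff)
 *		=> flagged as underflow, delta clamped to 0
 *
 *	The same step forward (last = 0x0005, now = 0x000a) gives
 *	delta = 0x0005, ~delta & mask = 0xfffa, which does not trigger.
 *	Deltas beyond clock->max_cycles take the other branch and are
 *	capped instead, so one very late update cannot overflow the
 *	later delta * mult multiplication.
 */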
229
121/** 230/**
122 * tk_setup_internals - Set up internals to use clocksource clock. 231 * tk_setup_internals - Set up internals to use clocksource clock.
123 * 232 *
@@ -135,11 +244,16 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
135 u64 tmp, ntpinterval; 244 u64 tmp, ntpinterval;
136 struct clocksource *old_clock; 245 struct clocksource *old_clock;
137 246
138 old_clock = tk->tkr.clock; 247 old_clock = tk->tkr_mono.clock;
139 tk->tkr.clock = clock; 248 tk->tkr_mono.clock = clock;
140 tk->tkr.read = clock->read; 249 tk->tkr_mono.read = clock->read;
141 tk->tkr.mask = clock->mask; 250 tk->tkr_mono.mask = clock->mask;
142 tk->tkr.cycle_last = tk->tkr.read(clock); 251 tk->tkr_mono.cycle_last = tk->tkr_mono.read(clock);
252
253 tk->tkr_raw.clock = clock;
254 tk->tkr_raw.read = clock->read;
255 tk->tkr_raw.mask = clock->mask;
256 tk->tkr_raw.cycle_last = tk->tkr_mono.cycle_last;
143 257
144 /* Do the ns -> cycle conversion first, using original mult */ 258 /* Do the ns -> cycle conversion first, using original mult */
145 tmp = NTP_INTERVAL_LENGTH; 259 tmp = NTP_INTERVAL_LENGTH;
@@ -163,11 +277,14 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
163 if (old_clock) { 277 if (old_clock) {
164 int shift_change = clock->shift - old_clock->shift; 278 int shift_change = clock->shift - old_clock->shift;
165 if (shift_change < 0) 279 if (shift_change < 0)
166 tk->tkr.xtime_nsec >>= -shift_change; 280 tk->tkr_mono.xtime_nsec >>= -shift_change;
167 else 281 else
168 tk->tkr.xtime_nsec <<= shift_change; 282 tk->tkr_mono.xtime_nsec <<= shift_change;
169 } 283 }
170 tk->tkr.shift = clock->shift; 284 tk->tkr_raw.xtime_nsec = 0;
285
286 tk->tkr_mono.shift = clock->shift;
287 tk->tkr_raw.shift = clock->shift;
171 288
172 tk->ntp_error = 0; 289 tk->ntp_error = 0;
173 tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; 290 tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift;
@@ -178,7 +295,8 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
178 * active clocksource. These value will be adjusted via NTP 295 * active clocksource. These value will be adjusted via NTP
179 * to counteract clock drifting. 296 * to counteract clock drifting.
180 */ 297 */
181 tk->tkr.mult = clock->mult; 298 tk->tkr_mono.mult = clock->mult;
299 tk->tkr_raw.mult = clock->mult;
182 tk->ntp_err_mult = 0; 300 tk->ntp_err_mult = 0;
183} 301}
184 302
@@ -193,14 +311,10 @@ static inline u32 arch_gettimeoffset(void) { return 0; }
193 311
194static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) 312static inline s64 timekeeping_get_ns(struct tk_read_base *tkr)
195{ 313{
196 cycle_t cycle_now, delta; 314 cycle_t delta;
197 s64 nsec; 315 s64 nsec;
198 316
199 /* read clocksource: */ 317 delta = timekeeping_get_delta(tkr);
200 cycle_now = tkr->read(tkr->clock);
201
202 /* calculate the delta since the last update_wall_time: */
203 delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask);
204 318
205 nsec = delta * tkr->mult + tkr->xtime_nsec; 319 nsec = delta * tkr->mult + tkr->xtime_nsec;
206 nsec >>= tkr->shift; 320 nsec >>= tkr->shift;
@@ -209,25 +323,6 @@ static inline s64 timekeeping_get_ns(struct tk_read_base *tkr)
209 return nsec + arch_gettimeoffset(); 323 return nsec + arch_gettimeoffset();
210} 324}
211 325
212static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
213{
214 struct clocksource *clock = tk->tkr.clock;
215 cycle_t cycle_now, delta;
216 s64 nsec;
217
218 /* read clocksource: */
219 cycle_now = tk->tkr.read(clock);
220
221 /* calculate the delta since the last update_wall_time: */
222 delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, tk->tkr.mask);
223
224 /* convert delta to nanoseconds. */
225 nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift);
226
227 /* If arch requires, add in get_arch_timeoffset() */
228 return nsec + arch_gettimeoffset();
229}
230
231/** 326/**
232 * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper. 327 * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper.
233 * @tkr: Timekeeping readout base from which we take the update 328 * @tkr: Timekeeping readout base from which we take the update
@@ -267,18 +362,18 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
267 * slightly wrong timestamp (a few nanoseconds). See 362 * slightly wrong timestamp (a few nanoseconds). See
268 * @ktime_get_mono_fast_ns. 363 * @ktime_get_mono_fast_ns.
269 */ 364 */
270static void update_fast_timekeeper(struct tk_read_base *tkr) 365static void update_fast_timekeeper(struct tk_read_base *tkr, struct tk_fast *tkf)
271{ 366{
272 struct tk_read_base *base = tk_fast_mono.base; 367 struct tk_read_base *base = tkf->base;
273 368
274 /* Force readers off to base[1] */ 369 /* Force readers off to base[1] */
275 raw_write_seqcount_latch(&tk_fast_mono.seq); 370 raw_write_seqcount_latch(&tkf->seq);
276 371
277 /* Update base[0] */ 372 /* Update base[0] */
278 memcpy(base, tkr, sizeof(*base)); 373 memcpy(base, tkr, sizeof(*base));
279 374
280 /* Force readers back to base[0] */ 375 /* Force readers back to base[0] */
281 raw_write_seqcount_latch(&tk_fast_mono.seq); 376 raw_write_seqcount_latch(&tkf->seq);
282 377
283 /* Update base[1] */ 378 /* Update base[1] */
284 memcpy(base + 1, base, sizeof(*base)); 379 memcpy(base + 1, base, sizeof(*base));
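The writer above and the reader added in the next hunk form a seqcount latch: an NMI that interrupts the update still finds one consistent copy in base[0] or base[1]. A condensed sketch of the pattern, mirroring (not copying) the kernel code and reusing the struct tk_fast seq/base[2] pair declared earlier in this file:

static void latch_update(struct tk_fast *tkf, struct tk_read_base *tkr)
{
	raw_write_seqcount_latch(&tkf->seq);	/* odd seq: readers use base[1] */
	memcpy(&tkf->base[0], tkr, sizeof(*tkr));
	raw_write_seqcount_latch(&tkf->seq);	/* even seq: readers use base[0] */
	memcpy(&tkf->base[1], tkr, sizeof(*tkr));
}

static u64 latch_read_ns(struct tk_fast *tkf)
{
	struct tk_read_base *base;
	unsigned int seq;
	u64 now;

	do {
		seq  = raw_read_seqcount(&tkf->seq);
		base = tkf->base + (seq & 0x01);
		now  = ktime_to_ns(base->base) + timekeeping_get_ns(base);
	} while (read_seqcount_retry(&tkf->seq, seq));

	return now;
}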
@@ -316,22 +411,33 @@ static void update_fast_timekeeper(struct tk_read_base *tkr)
316 * of the following timestamps. Callers need to be aware of that and 411 * of the following timestamps. Callers need to be aware of that and
317 * deal with it. 412 * deal with it.
318 */ 413 */
319u64 notrace ktime_get_mono_fast_ns(void) 414static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
320{ 415{
321 struct tk_read_base *tkr; 416 struct tk_read_base *tkr;
322 unsigned int seq; 417 unsigned int seq;
323 u64 now; 418 u64 now;
324 419
325 do { 420 do {
326 seq = raw_read_seqcount(&tk_fast_mono.seq); 421 seq = raw_read_seqcount(&tkf->seq);
327 tkr = tk_fast_mono.base + (seq & 0x01); 422 tkr = tkf->base + (seq & 0x01);
328 now = ktime_to_ns(tkr->base_mono) + timekeeping_get_ns(tkr); 423 now = ktime_to_ns(tkr->base) + timekeeping_get_ns(tkr);
424 } while (read_seqcount_retry(&tkf->seq, seq));
329 425
330 } while (read_seqcount_retry(&tk_fast_mono.seq, seq));
331 return now; 426 return now;
332} 427}
428
429u64 ktime_get_mono_fast_ns(void)
430{
431 return __ktime_get_fast_ns(&tk_fast_mono);
432}
333EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns); 433EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns);
334 434
435u64 ktime_get_raw_fast_ns(void)
436{
437 return __ktime_get_fast_ns(&tk_fast_raw);
438}
439EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns);
440
335/* Suspend-time cycles value for halted fast timekeeper. */ 441/* Suspend-time cycles value for halted fast timekeeper. */
336static cycle_t cycles_at_suspend; 442static cycle_t cycles_at_suspend;
337 443
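Both exports are meant for contexts where the seqlock-protected readers are off limits, such as NMI or tracing paths. A hypothetical caller, only to show the intended pairing of the two clocks (the struct and function are made up):

struct fast_stamp {
	u64 ts_mono;	/* NTP-corrected monotonic time */
	u64 ts_raw;	/* free-running raw clock */
};

static void take_fast_stamp(struct fast_stamp *s)
{
	s->ts_mono = ktime_get_mono_fast_ns();
	s->ts_raw  = ktime_get_raw_fast_ns();
}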
@@ -353,12 +459,17 @@ static cycle_t dummy_clock_read(struct clocksource *cs)
353static void halt_fast_timekeeper(struct timekeeper *tk) 459static void halt_fast_timekeeper(struct timekeeper *tk)
354{ 460{
355 static struct tk_read_base tkr_dummy; 461 static struct tk_read_base tkr_dummy;
356 struct tk_read_base *tkr = &tk->tkr; 462 struct tk_read_base *tkr = &tk->tkr_mono;
357 463
358 memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); 464 memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
359 cycles_at_suspend = tkr->read(tkr->clock); 465 cycles_at_suspend = tkr->read(tkr->clock);
360 tkr_dummy.read = dummy_clock_read; 466 tkr_dummy.read = dummy_clock_read;
361 update_fast_timekeeper(&tkr_dummy); 467 update_fast_timekeeper(&tkr_dummy, &tk_fast_mono);
468
469 tkr = &tk->tkr_raw;
470 memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
471 tkr_dummy.read = dummy_clock_read;
472 update_fast_timekeeper(&tkr_dummy, &tk_fast_raw);
362} 473}
363 474
364#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD 475#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD
@@ -369,8 +480,8 @@ static inline void update_vsyscall(struct timekeeper *tk)
369 480
370 xt = timespec64_to_timespec(tk_xtime(tk)); 481 xt = timespec64_to_timespec(tk_xtime(tk));
371 wm = timespec64_to_timespec(tk->wall_to_monotonic); 482 wm = timespec64_to_timespec(tk->wall_to_monotonic);
372 update_vsyscall_old(&xt, &wm, tk->tkr.clock, tk->tkr.mult, 483 update_vsyscall_old(&xt, &wm, tk->tkr_mono.clock, tk->tkr_mono.mult,
373 tk->tkr.cycle_last); 484 tk->tkr_mono.cycle_last);
374} 485}
375 486
376static inline void old_vsyscall_fixup(struct timekeeper *tk) 487static inline void old_vsyscall_fixup(struct timekeeper *tk)
@@ -387,11 +498,11 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk)
387 * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD 498 * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD
388 * users are removed, this can be killed. 499 * users are removed, this can be killed.
389 */ 500 */
390 remainder = tk->tkr.xtime_nsec & ((1ULL << tk->tkr.shift) - 1); 501 remainder = tk->tkr_mono.xtime_nsec & ((1ULL << tk->tkr_mono.shift) - 1);
391 tk->tkr.xtime_nsec -= remainder; 502 tk->tkr_mono.xtime_nsec -= remainder;
392 tk->tkr.xtime_nsec += 1ULL << tk->tkr.shift; 503 tk->tkr_mono.xtime_nsec += 1ULL << tk->tkr_mono.shift;
393 tk->ntp_error += remainder << tk->ntp_error_shift; 504 tk->ntp_error += remainder << tk->ntp_error_shift;
394 tk->ntp_error -= (1ULL << tk->tkr.shift) << tk->ntp_error_shift; 505 tk->ntp_error -= (1ULL << tk->tkr_mono.shift) << tk->ntp_error_shift;
395} 506}
396#else 507#else
397#define old_vsyscall_fixup(tk) 508#define old_vsyscall_fixup(tk)
@@ -456,17 +567,17 @@ static inline void tk_update_ktime_data(struct timekeeper *tk)
456 */ 567 */
457 seconds = (u64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec); 568 seconds = (u64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec);
458 nsec = (u32) tk->wall_to_monotonic.tv_nsec; 569 nsec = (u32) tk->wall_to_monotonic.tv_nsec;
459 tk->tkr.base_mono = ns_to_ktime(seconds * NSEC_PER_SEC + nsec); 570 tk->tkr_mono.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec);
460 571
461 /* Update the monotonic raw base */ 572 /* Update the monotonic raw base */
462 tk->base_raw = timespec64_to_ktime(tk->raw_time); 573 tk->tkr_raw.base = timespec64_to_ktime(tk->raw_time);
463 574
464 /* 575 /*
465 * The sum of the nanoseconds portions of xtime and 576 * The sum of the nanoseconds portions of xtime and
466 * wall_to_monotonic can be greater/equal one second. Take 577 * wall_to_monotonic can be greater/equal one second. Take
467 * this into account before updating tk->ktime_sec. 578 * this into account before updating tk->ktime_sec.
468 */ 579 */
469 nsec += (u32)(tk->tkr.xtime_nsec >> tk->tkr.shift); 580 nsec += (u32)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
470 if (nsec >= NSEC_PER_SEC) 581 if (nsec >= NSEC_PER_SEC)
471 seconds++; 582 seconds++;
472 tk->ktime_sec = seconds; 583 tk->ktime_sec = seconds;
@@ -489,7 +600,8 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
489 memcpy(&shadow_timekeeper, &tk_core.timekeeper, 600 memcpy(&shadow_timekeeper, &tk_core.timekeeper,
490 sizeof(tk_core.timekeeper)); 601 sizeof(tk_core.timekeeper));
491 602
492 update_fast_timekeeper(&tk->tkr); 603 update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono);
604 update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw);
493} 605}
494 606
495/** 607/**
@@ -501,22 +613,23 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
501 */ 613 */
502static void timekeeping_forward_now(struct timekeeper *tk) 614static void timekeeping_forward_now(struct timekeeper *tk)
503{ 615{
504 struct clocksource *clock = tk->tkr.clock; 616 struct clocksource *clock = tk->tkr_mono.clock;
505 cycle_t cycle_now, delta; 617 cycle_t cycle_now, delta;
506 s64 nsec; 618 s64 nsec;
507 619
508 cycle_now = tk->tkr.read(clock); 620 cycle_now = tk->tkr_mono.read(clock);
509 delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, tk->tkr.mask); 621 delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
510 tk->tkr.cycle_last = cycle_now; 622 tk->tkr_mono.cycle_last = cycle_now;
623 tk->tkr_raw.cycle_last = cycle_now;
511 624
512 tk->tkr.xtime_nsec += delta * tk->tkr.mult; 625 tk->tkr_mono.xtime_nsec += delta * tk->tkr_mono.mult;
513 626
514 /* If arch requires, add in get_arch_timeoffset() */ 627 /* If arch requires, add in get_arch_timeoffset() */
515 tk->tkr.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr.shift; 628 tk->tkr_mono.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_mono.shift;
516 629
517 tk_normalize_xtime(tk); 630 tk_normalize_xtime(tk);
518 631
519 nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift); 632 nsec = clocksource_cyc2ns(delta, tk->tkr_raw.mult, tk->tkr_raw.shift);
520 timespec64_add_ns(&tk->raw_time, nsec); 633 timespec64_add_ns(&tk->raw_time, nsec);
521} 634}
522 635
@@ -537,7 +650,7 @@ int __getnstimeofday64(struct timespec64 *ts)
537 seq = read_seqcount_begin(&tk_core.seq); 650 seq = read_seqcount_begin(&tk_core.seq);
538 651
539 ts->tv_sec = tk->xtime_sec; 652 ts->tv_sec = tk->xtime_sec;
540 nsecs = timekeeping_get_ns(&tk->tkr); 653 nsecs = timekeeping_get_ns(&tk->tkr_mono);
541 654
542 } while (read_seqcount_retry(&tk_core.seq, seq)); 655 } while (read_seqcount_retry(&tk_core.seq, seq));
543 656
@@ -577,8 +690,8 @@ ktime_t ktime_get(void)
577 690
578 do { 691 do {
579 seq = read_seqcount_begin(&tk_core.seq); 692 seq = read_seqcount_begin(&tk_core.seq);
580 base = tk->tkr.base_mono; 693 base = tk->tkr_mono.base;
581 nsecs = timekeeping_get_ns(&tk->tkr); 694 nsecs = timekeeping_get_ns(&tk->tkr_mono);
582 695
583 } while (read_seqcount_retry(&tk_core.seq, seq)); 696 } while (read_seqcount_retry(&tk_core.seq, seq));
584 697
@@ -603,8 +716,8 @@ ktime_t ktime_get_with_offset(enum tk_offsets offs)
603 716
604 do { 717 do {
605 seq = read_seqcount_begin(&tk_core.seq); 718 seq = read_seqcount_begin(&tk_core.seq);
606 base = ktime_add(tk->tkr.base_mono, *offset); 719 base = ktime_add(tk->tkr_mono.base, *offset);
607 nsecs = timekeeping_get_ns(&tk->tkr); 720 nsecs = timekeeping_get_ns(&tk->tkr_mono);
608 721
609 } while (read_seqcount_retry(&tk_core.seq, seq)); 722 } while (read_seqcount_retry(&tk_core.seq, seq));
610 723
@@ -645,8 +758,8 @@ ktime_t ktime_get_raw(void)
645 758
646 do { 759 do {
647 seq = read_seqcount_begin(&tk_core.seq); 760 seq = read_seqcount_begin(&tk_core.seq);
648 base = tk->base_raw; 761 base = tk->tkr_raw.base;
649 nsecs = timekeeping_get_ns_raw(tk); 762 nsecs = timekeeping_get_ns(&tk->tkr_raw);
650 763
651 } while (read_seqcount_retry(&tk_core.seq, seq)); 764 } while (read_seqcount_retry(&tk_core.seq, seq));
652 765
@@ -674,7 +787,7 @@ void ktime_get_ts64(struct timespec64 *ts)
674 do { 787 do {
675 seq = read_seqcount_begin(&tk_core.seq); 788 seq = read_seqcount_begin(&tk_core.seq);
676 ts->tv_sec = tk->xtime_sec; 789 ts->tv_sec = tk->xtime_sec;
677 nsec = timekeeping_get_ns(&tk->tkr); 790 nsec = timekeeping_get_ns(&tk->tkr_mono);
678 tomono = tk->wall_to_monotonic; 791 tomono = tk->wall_to_monotonic;
679 792
680 } while (read_seqcount_retry(&tk_core.seq, seq)); 793 } while (read_seqcount_retry(&tk_core.seq, seq));
@@ -759,8 +872,8 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
759 ts_real->tv_sec = tk->xtime_sec; 872 ts_real->tv_sec = tk->xtime_sec;
760 ts_real->tv_nsec = 0; 873 ts_real->tv_nsec = 0;
761 874
762 nsecs_raw = timekeeping_get_ns_raw(tk); 875 nsecs_raw = timekeeping_get_ns(&tk->tkr_raw);
763 nsecs_real = timekeeping_get_ns(&tk->tkr); 876 nsecs_real = timekeeping_get_ns(&tk->tkr_mono);
764 877
765 } while (read_seqcount_retry(&tk_core.seq, seq)); 878 } while (read_seqcount_retry(&tk_core.seq, seq));
766 879
@@ -943,7 +1056,7 @@ static int change_clocksource(void *data)
943 */ 1056 */
944 if (try_module_get(new->owner)) { 1057 if (try_module_get(new->owner)) {
945 if (!new->enable || new->enable(new) == 0) { 1058 if (!new->enable || new->enable(new) == 0) {
946 old = tk->tkr.clock; 1059 old = tk->tkr_mono.clock;
947 tk_setup_internals(tk, new); 1060 tk_setup_internals(tk, new);
948 if (old->disable) 1061 if (old->disable)
949 old->disable(old); 1062 old->disable(old);
@@ -971,11 +1084,11 @@ int timekeeping_notify(struct clocksource *clock)
971{ 1084{
972 struct timekeeper *tk = &tk_core.timekeeper; 1085 struct timekeeper *tk = &tk_core.timekeeper;
973 1086
974 if (tk->tkr.clock == clock) 1087 if (tk->tkr_mono.clock == clock)
975 return 0; 1088 return 0;
976 stop_machine(change_clocksource, clock, NULL); 1089 stop_machine(change_clocksource, clock, NULL);
977 tick_clock_notify(); 1090 tick_clock_notify();
978 return tk->tkr.clock == clock ? 0 : -1; 1091 return tk->tkr_mono.clock == clock ? 0 : -1;
979} 1092}
980 1093
981/** 1094/**
@@ -993,7 +1106,7 @@ void getrawmonotonic64(struct timespec64 *ts)
993 1106
994 do { 1107 do {
995 seq = read_seqcount_begin(&tk_core.seq); 1108 seq = read_seqcount_begin(&tk_core.seq);
996 nsecs = timekeeping_get_ns_raw(tk); 1109 nsecs = timekeeping_get_ns(&tk->tkr_raw);
997 ts64 = tk->raw_time; 1110 ts64 = tk->raw_time;
998 1111
999 } while (read_seqcount_retry(&tk_core.seq, seq)); 1112 } while (read_seqcount_retry(&tk_core.seq, seq));
@@ -1016,7 +1129,7 @@ int timekeeping_valid_for_hres(void)
1016 do { 1129 do {
1017 seq = read_seqcount_begin(&tk_core.seq); 1130 seq = read_seqcount_begin(&tk_core.seq);
1018 1131
1019 ret = tk->tkr.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; 1132 ret = tk->tkr_mono.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
1020 1133
1021 } while (read_seqcount_retry(&tk_core.seq, seq)); 1134 } while (read_seqcount_retry(&tk_core.seq, seq));
1022 1135
@@ -1035,7 +1148,7 @@ u64 timekeeping_max_deferment(void)
1035 do { 1148 do {
1036 seq = read_seqcount_begin(&tk_core.seq); 1149 seq = read_seqcount_begin(&tk_core.seq);
1037 1150
1038 ret = tk->tkr.clock->max_idle_ns; 1151 ret = tk->tkr_mono.clock->max_idle_ns;
1039 1152
1040 } while (read_seqcount_retry(&tk_core.seq, seq)); 1153 } while (read_seqcount_retry(&tk_core.seq, seq));
1041 1154
@@ -1057,6 +1170,14 @@ void __weak read_persistent_clock(struct timespec *ts)
1057 ts->tv_nsec = 0; 1170 ts->tv_nsec = 0;
1058} 1171}
1059 1172
1173void __weak read_persistent_clock64(struct timespec64 *ts64)
1174{
1175 struct timespec ts;
1176
1177 read_persistent_clock(&ts);
1178 *ts64 = timespec_to_timespec64(ts);
1179}
1180
1060/** 1181/**
1061 * read_boot_clock - Return time of the system start. 1182 * read_boot_clock - Return time of the system start.
1062 * 1183 *
@@ -1072,6 +1193,20 @@ void __weak read_boot_clock(struct timespec *ts)
1072 ts->tv_nsec = 0; 1193 ts->tv_nsec = 0;
1073} 1194}
1074 1195
1196void __weak read_boot_clock64(struct timespec64 *ts64)
1197{
1198 struct timespec ts;
1199
1200 read_boot_clock(&ts);
1201 *ts64 = timespec_to_timespec64(ts);
1202}
1203
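The __weak wrappers keep existing architectures working unchanged while letting a platform supply the 64-bit interface directly. A hypothetical arch-side override, where the RTC accessor is a placeholder and not a real API:

void read_persistent_clock64(struct timespec64 *ts)
{
	/* Assumed platform helper returning seconds since the epoch. */
	ts->tv_sec  = my_platform_rtc_read_seconds();
	ts->tv_nsec = 0;
}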
1204/* Flag for if timekeeping_resume() has injected sleeptime */
1205static bool sleeptime_injected;
1206
1207/* Flag for if there is a persistent clock on this platform */
1208static bool persistent_clock_exists;
1209
1075/* 1210/*
1076 * timekeeping_init - Initializes the clocksource and common timekeeping values 1211 * timekeeping_init - Initializes the clocksource and common timekeeping values
1077 */ 1212 */
@@ -1081,20 +1216,17 @@ void __init timekeeping_init(void)
1081 struct clocksource *clock; 1216 struct clocksource *clock;
1082 unsigned long flags; 1217 unsigned long flags;
1083 struct timespec64 now, boot, tmp; 1218 struct timespec64 now, boot, tmp;
1084 struct timespec ts;
1085 1219
1086 read_persistent_clock(&ts); 1220 read_persistent_clock64(&now);
1087 now = timespec_to_timespec64(ts);
1088 if (!timespec64_valid_strict(&now)) { 1221 if (!timespec64_valid_strict(&now)) {
1089 pr_warn("WARNING: Persistent clock returned invalid value!\n" 1222 pr_warn("WARNING: Persistent clock returned invalid value!\n"
1090 " Check your CMOS/BIOS settings.\n"); 1223 " Check your CMOS/BIOS settings.\n");
1091 now.tv_sec = 0; 1224 now.tv_sec = 0;
1092 now.tv_nsec = 0; 1225 now.tv_nsec = 0;
1093 } else if (now.tv_sec || now.tv_nsec) 1226 } else if (now.tv_sec || now.tv_nsec)
1094 persistent_clock_exist = true; 1227 persistent_clock_exists = true;
1095 1228
1096 read_boot_clock(&ts); 1229 read_boot_clock64(&boot);
1097 boot = timespec_to_timespec64(ts);
1098 if (!timespec64_valid_strict(&boot)) { 1230 if (!timespec64_valid_strict(&boot)) {
1099 pr_warn("WARNING: Boot clock returned invalid value!\n" 1231 pr_warn("WARNING: Boot clock returned invalid value!\n"
1100 " Check your CMOS/BIOS settings.\n"); 1232 " Check your CMOS/BIOS settings.\n");
@@ -1114,7 +1246,6 @@ void __init timekeeping_init(void)
1114 tk_set_xtime(tk, &now); 1246 tk_set_xtime(tk, &now);
1115 tk->raw_time.tv_sec = 0; 1247 tk->raw_time.tv_sec = 0;
1116 tk->raw_time.tv_nsec = 0; 1248 tk->raw_time.tv_nsec = 0;
1117 tk->base_raw.tv64 = 0;
1118 if (boot.tv_sec == 0 && boot.tv_nsec == 0) 1249 if (boot.tv_sec == 0 && boot.tv_nsec == 0)
1119 boot = tk_xtime(tk); 1250 boot = tk_xtime(tk);
1120 1251
@@ -1127,7 +1258,7 @@ void __init timekeeping_init(void)
1127 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 1258 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
1128} 1259}
1129 1260
1130/* time in seconds when suspend began */ 1261/* time in seconds when suspend began for persistent clock */
1131static struct timespec64 timekeeping_suspend_time; 1262static struct timespec64 timekeeping_suspend_time;
1132 1263
1133/** 1264/**
@@ -1152,12 +1283,49 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
1152 tk_debug_account_sleep_time(delta); 1283 tk_debug_account_sleep_time(delta);
1153} 1284}
1154 1285
1286#if defined(CONFIG_PM_SLEEP) && defined(CONFIG_RTC_HCTOSYS_DEVICE)
1287/**
1288 * We have three kinds of time sources to use for sleep time
1289 * injection, the preference order is:
1290 * 1) non-stop clocksource
1291 * 2) persistent clock (ie: RTC accessible when irqs are off)
1292 * 3) RTC
1293 *
1294 * 1) and 2) are used by timekeeping, 3) by RTC subsystem.
1295 * If system has neither 1) nor 2), 3) will be used finally.
1296 *
1297 *
1298 * If timekeeping has injected sleeptime via either 1) or 2),
1299 * 3) becomes needless, so in this case we don't need to call
1300 * rtc_resume(), and this is what timekeeping_rtc_skipresume()
1301 * means.
1302 */
1303bool timekeeping_rtc_skipresume(void)
1304{
1305 return sleeptime_injected;
1306}
1307
1308/**
1309 * 1) can be determined whether to use or not only when doing
1310 * timekeeping_resume() which is invoked after rtc_suspend(),
1311 * so we can't skip rtc_suspend() surely if system has 1).
1312 *
1313 * But if system has 2), 2) will definitely be used, so in this
1314 * case we don't need to call rtc_suspend(), and this is what
1315 * timekeeping_rtc_skipsuspend() means.
1316 */
1317bool timekeeping_rtc_skipsuspend(void)
1318{
1319 return persistent_clock_exists;
1320}
1321
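Condensed, the resume-time policy these two helpers expose looks like this (restated for clarity, not actual kernel code):

/*
 *	if (clocksource has CLOCK_SOURCE_SUSPEND_NONSTOP and advanced
 *	    across suspend)
 *		timekeeping_resume() injects the sleep time	(case 1)
 *	else if (persistent clock moved forward)
 *		timekeeping_resume() injects the sleep time	(case 2)
 *	else
 *		rtc_resume() injects it				(case 3)
 *
 *	timekeeping_rtc_skipresume() tells the RTC core that case 1 or 2
 *	already happened; timekeeping_rtc_skipsuspend() tells it that a
 *	persistent clock makes the rtc_suspend() snapshot unnecessary.
 */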
1155/** 1322/**
1156 * timekeeping_inject_sleeptime64 - Adds suspend interval to timeekeeping values 1323 * timekeeping_inject_sleeptime64 - Adds suspend interval to timeekeeping values
1157 * @delta: pointer to a timespec64 delta value 1324 * @delta: pointer to a timespec64 delta value
1158 * 1325 *
1159 * This hook is for architectures that cannot support read_persistent_clock 1326 * This hook is for architectures that cannot support read_persistent_clock64
1160 * because their RTC/persistent clock is only accessible when irqs are enabled. 1327 * because their RTC/persistent clock is only accessible when irqs are enabled.
1328 * and also don't have an effective nonstop clocksource.
1161 * 1329 *
1162 * This function should only be called by rtc_resume(), and allows 1330 * This function should only be called by rtc_resume(), and allows
1163 * a suspend offset to be injected into the timekeeping values. 1331 * a suspend offset to be injected into the timekeeping values.
@@ -1167,13 +1335,6 @@ void timekeeping_inject_sleeptime64(struct timespec64 *delta)
1167 struct timekeeper *tk = &tk_core.timekeeper; 1335 struct timekeeper *tk = &tk_core.timekeeper;
1168 unsigned long flags; 1336 unsigned long flags;
1169 1337
1170 /*
1171 * Make sure we don't set the clock twice, as timekeeping_resume()
1172 * already did it
1173 */
1174 if (has_persistent_clock())
1175 return;
1176
1177 raw_spin_lock_irqsave(&timekeeper_lock, flags); 1338 raw_spin_lock_irqsave(&timekeeper_lock, flags);
1178 write_seqcount_begin(&tk_core.seq); 1339 write_seqcount_begin(&tk_core.seq);
1179 1340
@@ -1189,26 +1350,21 @@ void timekeeping_inject_sleeptime64(struct timespec64 *delta)
1189 /* signal hrtimers about time change */ 1350 /* signal hrtimers about time change */
1190 clock_was_set(); 1351 clock_was_set();
1191} 1352}
1353#endif
1192 1354
1193/** 1355/**
1194 * timekeeping_resume - Resumes the generic timekeeping subsystem. 1356 * timekeeping_resume - Resumes the generic timekeeping subsystem.
1195 *
1196 * This is for the generic clocksource timekeeping.
1197 * xtime/wall_to_monotonic/jiffies/etc are
1198 * still managed by arch specific suspend/resume code.
1199 */ 1357 */
1200void timekeeping_resume(void) 1358void timekeeping_resume(void)
1201{ 1359{
1202 struct timekeeper *tk = &tk_core.timekeeper; 1360 struct timekeeper *tk = &tk_core.timekeeper;
1203 struct clocksource *clock = tk->tkr.clock; 1361 struct clocksource *clock = tk->tkr_mono.clock;
1204 unsigned long flags; 1362 unsigned long flags;
1205 struct timespec64 ts_new, ts_delta; 1363 struct timespec64 ts_new, ts_delta;
1206 struct timespec tmp;
1207 cycle_t cycle_now, cycle_delta; 1364 cycle_t cycle_now, cycle_delta;
1208 bool suspendtime_found = false;
1209 1365
1210 read_persistent_clock(&tmp); 1366 sleeptime_injected = false;
1211 ts_new = timespec_to_timespec64(tmp); 1367 read_persistent_clock64(&ts_new);
1212 1368
1213 clockevents_resume(); 1369 clockevents_resume();
1214 clocksource_resume(); 1370 clocksource_resume();
@@ -1228,16 +1384,16 @@ void timekeeping_resume(void)
1228 * The less preferred source will only be tried if there is no better 1384 * The less preferred source will only be tried if there is no better
1229 * usable source. The rtc part is handled separately in rtc core code. 1385 * usable source. The rtc part is handled separately in rtc core code.
1230 */ 1386 */
1231 cycle_now = tk->tkr.read(clock); 1387 cycle_now = tk->tkr_mono.read(clock);
1232 if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) && 1388 if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) &&
1233 cycle_now > tk->tkr.cycle_last) { 1389 cycle_now > tk->tkr_mono.cycle_last) {
1234 u64 num, max = ULLONG_MAX; 1390 u64 num, max = ULLONG_MAX;
1235 u32 mult = clock->mult; 1391 u32 mult = clock->mult;
1236 u32 shift = clock->shift; 1392 u32 shift = clock->shift;
1237 s64 nsec = 0; 1393 s64 nsec = 0;
1238 1394
1239 cycle_delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, 1395 cycle_delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last,
1240 tk->tkr.mask); 1396 tk->tkr_mono.mask);
1241 1397
1242 /* 1398 /*
1243 * "cycle_delta * mutl" may cause 64 bits overflow, if the 1399 * "cycle_delta * mutl" may cause 64 bits overflow, if the
@@ -1253,17 +1409,19 @@ void timekeeping_resume(void)
1253 nsec += ((u64) cycle_delta * mult) >> shift; 1409 nsec += ((u64) cycle_delta * mult) >> shift;
1254 1410
1255 ts_delta = ns_to_timespec64(nsec); 1411 ts_delta = ns_to_timespec64(nsec);
1256 suspendtime_found = true; 1412 sleeptime_injected = true;
1257 } else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) { 1413 } else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) {
1258 ts_delta = timespec64_sub(ts_new, timekeeping_suspend_time); 1414 ts_delta = timespec64_sub(ts_new, timekeeping_suspend_time);
1259 suspendtime_found = true; 1415 sleeptime_injected = true;
1260 } 1416 }
1261 1417
1262 if (suspendtime_found) 1418 if (sleeptime_injected)
1263 __timekeeping_inject_sleeptime(tk, &ts_delta); 1419 __timekeeping_inject_sleeptime(tk, &ts_delta);
1264 1420
1265 /* Re-base the last cycle value */ 1421 /* Re-base the last cycle value */
1266 tk->tkr.cycle_last = cycle_now; 1422 tk->tkr_mono.cycle_last = cycle_now;
1423 tk->tkr_raw.cycle_last = cycle_now;
1424
1267 tk->ntp_error = 0; 1425 tk->ntp_error = 0;
1268 timekeeping_suspended = 0; 1426 timekeeping_suspended = 0;
1269 timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); 1427 timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
@@ -1272,9 +1430,7 @@ void timekeeping_resume(void)
1272 1430
1273 touch_softlockup_watchdog(); 1431 touch_softlockup_watchdog();
1274 1432
1275 clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL); 1433 tick_resume();
1276
1277 /* Resume hrtimers */
1278 hrtimers_resume(); 1434 hrtimers_resume();
1279} 1435}
1280 1436
@@ -1284,10 +1440,8 @@ int timekeeping_suspend(void)
1284 unsigned long flags; 1440 unsigned long flags;
1285 struct timespec64 delta, delta_delta; 1441 struct timespec64 delta, delta_delta;
1286 static struct timespec64 old_delta; 1442 static struct timespec64 old_delta;
1287 struct timespec tmp;
1288 1443
1289 read_persistent_clock(&tmp); 1444 read_persistent_clock64(&timekeeping_suspend_time);
1290 timekeeping_suspend_time = timespec_to_timespec64(tmp);
1291 1445
1292 /* 1446 /*
1293 * On some systems the persistent_clock can not be detected at 1447 * On some systems the persistent_clock can not be detected at
@@ -1295,31 +1449,33 @@ int timekeeping_suspend(void)
1295 * value returned, update the persistent_clock_exists flag. 1449 * value returned, update the persistent_clock_exists flag.
1296 */ 1450 */
1297 if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec) 1451 if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec)
1298 persistent_clock_exist = true; 1452 persistent_clock_exists = true;
1299 1453
1300 raw_spin_lock_irqsave(&timekeeper_lock, flags); 1454 raw_spin_lock_irqsave(&timekeeper_lock, flags);
1301 write_seqcount_begin(&tk_core.seq); 1455 write_seqcount_begin(&tk_core.seq);
1302 timekeeping_forward_now(tk); 1456 timekeeping_forward_now(tk);
1303 timekeeping_suspended = 1; 1457 timekeeping_suspended = 1;
1304 1458
1305 /* 1459 if (persistent_clock_exists) {
1306 * To avoid drift caused by repeated suspend/resumes,
1307 * which each can add ~1 second drift error,
1308 * try to compensate so the difference in system time
1309 * and persistent_clock time stays close to constant.
1310 */
1311 delta = timespec64_sub(tk_xtime(tk), timekeeping_suspend_time);
1312 delta_delta = timespec64_sub(delta, old_delta);
1313 if (abs(delta_delta.tv_sec) >= 2) {
1314 /* 1460 /*
1315 * if delta_delta is too large, assume time correction 1461 * To avoid drift caused by repeated suspend/resumes,
1316 * has occured and set old_delta to the current delta. 1462 * which each can add ~1 second drift error,
1463 * try to compensate so the difference in system time
1464 * and persistent_clock time stays close to constant.
1317 */ 1465 */
1318 old_delta = delta; 1466 delta = timespec64_sub(tk_xtime(tk), timekeeping_suspend_time);
1319 } else { 1467 delta_delta = timespec64_sub(delta, old_delta);
1320 /* Otherwise try to adjust old_system to compensate */ 1468 if (abs(delta_delta.tv_sec) >= 2) {
1321 timekeeping_suspend_time = 1469 /*
1322 timespec64_add(timekeeping_suspend_time, delta_delta); 1470 * if delta_delta is too large, assume time correction
1471 * has occurred and set old_delta to the current delta.
1472 */
1473 old_delta = delta;
1474 } else {
1475 /* Otherwise try to adjust old_system to compensate */
1476 timekeeping_suspend_time =
1477 timespec64_add(timekeeping_suspend_time, delta_delta);
1478 }
1323 } 1479 }
1324 1480
1325 timekeeping_update(tk, TK_MIRROR); 1481 timekeeping_update(tk, TK_MIRROR);
@@ -1327,7 +1483,7 @@ int timekeeping_suspend(void)
1327 write_seqcount_end(&tk_core.seq); 1483 write_seqcount_end(&tk_core.seq);
1328 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 1484 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
1329 1485
1330 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); 1486 tick_suspend();
1331 clocksource_suspend(); 1487 clocksource_suspend();
1332 clockevents_suspend(); 1488 clockevents_suspend();
1333 1489
@@ -1416,15 +1572,15 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk,
1416 * 1572 *
1417 * XXX - TODO: Doc ntp_error calculation. 1573 * XXX - TODO: Doc ntp_error calculation.
1418 */ 1574 */
1419 if ((mult_adj > 0) && (tk->tkr.mult + mult_adj < mult_adj)) { 1575 if ((mult_adj > 0) && (tk->tkr_mono.mult + mult_adj < mult_adj)) {
1420 /* NTP adjustment caused clocksource mult overflow */ 1576 /* NTP adjustment caused clocksource mult overflow */
1421 WARN_ON_ONCE(1); 1577 WARN_ON_ONCE(1);
1422 return; 1578 return;
1423 } 1579 }
1424 1580
1425 tk->tkr.mult += mult_adj; 1581 tk->tkr_mono.mult += mult_adj;
1426 tk->xtime_interval += interval; 1582 tk->xtime_interval += interval;
1427 tk->tkr.xtime_nsec -= offset; 1583 tk->tkr_mono.xtime_nsec -= offset;
1428 tk->ntp_error -= (interval - offset) << tk->ntp_error_shift; 1584 tk->ntp_error -= (interval - offset) << tk->ntp_error_shift;
1429} 1585}
1430 1586
@@ -1486,13 +1642,13 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
1486 tk->ntp_err_mult = 0; 1642 tk->ntp_err_mult = 0;
1487 } 1643 }
1488 1644
1489 if (unlikely(tk->tkr.clock->maxadj && 1645 if (unlikely(tk->tkr_mono.clock->maxadj &&
1490 (abs(tk->tkr.mult - tk->tkr.clock->mult) 1646 (abs(tk->tkr_mono.mult - tk->tkr_mono.clock->mult)
1491 > tk->tkr.clock->maxadj))) { 1647 > tk->tkr_mono.clock->maxadj))) {
1492 printk_once(KERN_WARNING 1648 printk_once(KERN_WARNING
1493 "Adjusting %s more than 11%% (%ld vs %ld)\n", 1649 "Adjusting %s more than 11%% (%ld vs %ld)\n",
1494 tk->tkr.clock->name, (long)tk->tkr.mult, 1650 tk->tkr_mono.clock->name, (long)tk->tkr_mono.mult,
1495 (long)tk->tkr.clock->mult + tk->tkr.clock->maxadj); 1651 (long)tk->tkr_mono.clock->mult + tk->tkr_mono.clock->maxadj);
1496 } 1652 }
1497 1653
1498 /* 1654 /*
@@ -1509,9 +1665,9 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
1509 * We'll correct this error next time through this function, when 1665 * We'll correct this error next time through this function, when
1510 * xtime_nsec is not as small. 1666 * xtime_nsec is not as small.
1511 */ 1667 */
1512 if (unlikely((s64)tk->tkr.xtime_nsec < 0)) { 1668 if (unlikely((s64)tk->tkr_mono.xtime_nsec < 0)) {
1513 s64 neg = -(s64)tk->tkr.xtime_nsec; 1669 s64 neg = -(s64)tk->tkr_mono.xtime_nsec;
1514 tk->tkr.xtime_nsec = 0; 1670 tk->tkr_mono.xtime_nsec = 0;
1515 tk->ntp_error += neg << tk->ntp_error_shift; 1671 tk->ntp_error += neg << tk->ntp_error_shift;
1516 } 1672 }
1517} 1673}
@@ -1526,13 +1682,13 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
1526 */ 1682 */
1527static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) 1683static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
1528{ 1684{
1529 u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr.shift; 1685 u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr_mono.shift;
1530 unsigned int clock_set = 0; 1686 unsigned int clock_set = 0;
1531 1687
1532 while (tk->tkr.xtime_nsec >= nsecps) { 1688 while (tk->tkr_mono.xtime_nsec >= nsecps) {
1533 int leap; 1689 int leap;
1534 1690
1535 tk->tkr.xtime_nsec -= nsecps; 1691 tk->tkr_mono.xtime_nsec -= nsecps;
1536 tk->xtime_sec++; 1692 tk->xtime_sec++;
1537 1693
1538 /* Figure out if its a leap sec and apply if needed */ 1694 /* Figure out if its a leap sec and apply if needed */
@@ -1577,9 +1733,10 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
1577 1733
1578 /* Accumulate one shifted interval */ 1734 /* Accumulate one shifted interval */
1579 offset -= interval; 1735 offset -= interval;
1580 tk->tkr.cycle_last += interval; 1736 tk->tkr_mono.cycle_last += interval;
1737 tk->tkr_raw.cycle_last += interval;
1581 1738
1582 tk->tkr.xtime_nsec += tk->xtime_interval << shift; 1739 tk->tkr_mono.xtime_nsec += tk->xtime_interval << shift;
1583 *clock_set |= accumulate_nsecs_to_secs(tk); 1740 *clock_set |= accumulate_nsecs_to_secs(tk);
1584 1741
1585 /* Accumulate raw time */ 1742 /* Accumulate raw time */
@@ -1622,14 +1779,17 @@ void update_wall_time(void)
1622#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET 1779#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
1623 offset = real_tk->cycle_interval; 1780 offset = real_tk->cycle_interval;
1624#else 1781#else
1625 offset = clocksource_delta(tk->tkr.read(tk->tkr.clock), 1782 offset = clocksource_delta(tk->tkr_mono.read(tk->tkr_mono.clock),
1626 tk->tkr.cycle_last, tk->tkr.mask); 1783 tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
1627#endif 1784#endif
1628 1785
1629 /* Check if there's really nothing to do */ 1786 /* Check if there's really nothing to do */
1630 if (offset < real_tk->cycle_interval) 1787 if (offset < real_tk->cycle_interval)
1631 goto out; 1788 goto out;
1632 1789
1790 /* Do some additional sanity checking */
1791 timekeeping_check_update(real_tk, offset);
1792
1633 /* 1793 /*
1634 * With NO_HZ we may have to accumulate many cycle_intervals 1794 * With NO_HZ we may have to accumulate many cycle_intervals
1635 * (think "ticks") worth of time at once. To do this efficiently, 1795 * (think "ticks") worth of time at once. To do this efficiently,
@@ -1784,8 +1944,8 @@ ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, ktime_t *offs_boot,
1784 do { 1944 do {
1785 seq = read_seqcount_begin(&tk_core.seq); 1945 seq = read_seqcount_begin(&tk_core.seq);
1786 1946
1787 base = tk->tkr.base_mono; 1947 base = tk->tkr_mono.base;
1788 nsecs = tk->tkr.xtime_nsec >> tk->tkr.shift; 1948 nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
1789 1949
1790 *offs_real = tk->offs_real; 1950 *offs_real = tk->offs_real;
1791 *offs_boot = tk->offs_boot; 1951 *offs_boot = tk->offs_boot;
@@ -1816,8 +1976,8 @@ ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot,
1816 do { 1976 do {
1817 seq = read_seqcount_begin(&tk_core.seq); 1977 seq = read_seqcount_begin(&tk_core.seq);
1818 1978
1819 base = tk->tkr.base_mono; 1979 base = tk->tkr_mono.base;
1820 nsecs = timekeeping_get_ns(&tk->tkr); 1980 nsecs = timekeeping_get_ns(&tk->tkr_mono);
1821 1981
1822 *offs_real = tk->offs_real; 1982 *offs_real = tk->offs_real;
1823 *offs_boot = tk->offs_boot; 1983 *offs_boot = tk->offs_boot;
diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
index 1d91416055d5..ead8794b9a4e 100644
--- a/kernel/time/timekeeping.h
+++ b/kernel/time/timekeeping.h
@@ -19,4 +19,11 @@ extern void timekeeping_clocktai(struct timespec *ts);
19extern int timekeeping_suspend(void); 19extern int timekeeping_suspend(void);
20extern void timekeeping_resume(void); 20extern void timekeeping_resume(void);
21 21
22extern void do_timer(unsigned long ticks);
23extern void update_wall_time(void);
24
25extern seqlock_t jiffies_lock;
26
27#define CS_NAME_LEN 32
28
22#endif 29#endif
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 2d3f5c504939..2ece3aa5069c 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -90,8 +90,18 @@ struct tvec_base {
90 struct tvec tv5; 90 struct tvec tv5;
91} ____cacheline_aligned; 91} ____cacheline_aligned;
92 92
93/*
94 * __TIMER_INITIALIZER() needs to set ->base to a valid pointer (because we've
95 * made NULL special, hint: lock_timer_base()) and we cannot get a compile time
96 * pointer to per-cpu entries because we don't know where we'll map the section,
97 * even for the boot cpu.
98 *
99 * And so we use boot_tvec_bases for boot CPU and per-cpu __tvec_bases for the
100 * rest of them.
101 */
93struct tvec_base boot_tvec_bases; 102struct tvec_base boot_tvec_bases;
94EXPORT_SYMBOL(boot_tvec_bases); 103EXPORT_SYMBOL(boot_tvec_bases);
104
95static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; 105static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
96 106
97/* Functions below help us manage 'deferrable' flag */ 107/* Functions below help us manage 'deferrable' flag */
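The new comment is easiest to see with a statically defined timer: its initializer must point ->base at something that exists at compile time, and boot_tvec_bases is that something. An illustrative sketch using this kernel's DEFINE_TIMER() (names are made up):

static void example_timeout(unsigned long data)
{
	pr_info("example timer fired\n");
}

/* Expands to __TIMER_INITIALIZER(), which points ->base at boot_tvec_bases. */
static DEFINE_TIMER(example_timer, example_timeout, 0, 0);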
@@ -1027,6 +1037,8 @@ int try_to_del_timer_sync(struct timer_list *timer)
1027EXPORT_SYMBOL(try_to_del_timer_sync); 1037EXPORT_SYMBOL(try_to_del_timer_sync);
1028 1038
1029#ifdef CONFIG_SMP 1039#ifdef CONFIG_SMP
1040static DEFINE_PER_CPU(struct tvec_base, __tvec_bases);
1041
1030/** 1042/**
1031 * del_timer_sync - deactivate a timer and wait for the handler to finish. 1043 * del_timer_sync - deactivate a timer and wait for the handler to finish.
1032 * @timer: the timer to be deactivated 1044 * @timer: the timer to be deactivated
@@ -1532,64 +1544,6 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout)
1532} 1544}
1533EXPORT_SYMBOL(schedule_timeout_uninterruptible); 1545EXPORT_SYMBOL(schedule_timeout_uninterruptible);
1534 1546
1535static int init_timers_cpu(int cpu)
1536{
1537 int j;
1538 struct tvec_base *base;
1539 static char tvec_base_done[NR_CPUS];
1540
1541 if (!tvec_base_done[cpu]) {
1542 static char boot_done;
1543
1544 if (boot_done) {
1545 /*
1546 * The APs use this path later in boot
1547 */
1548 base = kzalloc_node(sizeof(*base), GFP_KERNEL,
1549 cpu_to_node(cpu));
1550 if (!base)
1551 return -ENOMEM;
1552
1553 /* Make sure tvec_base has TIMER_FLAG_MASK bits free */
1554 if (WARN_ON(base != tbase_get_base(base))) {
1555 kfree(base);
1556 return -ENOMEM;
1557 }
1558 per_cpu(tvec_bases, cpu) = base;
1559 } else {
1560 /*
1561 * This is for the boot CPU - we use compile-time
1562 * static initialisation because per-cpu memory isn't
1563 * ready yet and because the memory allocators are not
1564 * initialised either.
1565 */
1566 boot_done = 1;
1567 base = &boot_tvec_bases;
1568 }
1569 spin_lock_init(&base->lock);
1570 tvec_base_done[cpu] = 1;
1571 base->cpu = cpu;
1572 } else {
1573 base = per_cpu(tvec_bases, cpu);
1574 }
1575
1576
1577 for (j = 0; j < TVN_SIZE; j++) {
1578 INIT_LIST_HEAD(base->tv5.vec + j);
1579 INIT_LIST_HEAD(base->tv4.vec + j);
1580 INIT_LIST_HEAD(base->tv3.vec + j);
1581 INIT_LIST_HEAD(base->tv2.vec + j);
1582 }
1583 for (j = 0; j < TVR_SIZE; j++)
1584 INIT_LIST_HEAD(base->tv1.vec + j);
1585
1586 base->timer_jiffies = jiffies;
1587 base->next_timer = base->timer_jiffies;
1588 base->active_timers = 0;
1589 base->all_timers = 0;
1590 return 0;
1591}
1592
1593#ifdef CONFIG_HOTPLUG_CPU 1547#ifdef CONFIG_HOTPLUG_CPU
1594static void migrate_timer_list(struct tvec_base *new_base, struct list_head *head) 1548static void migrate_timer_list(struct tvec_base *new_base, struct list_head *head)
1595{ 1549{
@@ -1631,55 +1585,86 @@ static void migrate_timers(int cpu)
1631 migrate_timer_list(new_base, old_base->tv5.vec + i); 1585 migrate_timer_list(new_base, old_base->tv5.vec + i);
1632 } 1586 }
1633 1587
1588 old_base->active_timers = 0;
1589 old_base->all_timers = 0;
1590
1634 spin_unlock(&old_base->lock); 1591 spin_unlock(&old_base->lock);
1635 spin_unlock_irq(&new_base->lock); 1592 spin_unlock_irq(&new_base->lock);
1636 put_cpu_var(tvec_bases); 1593 put_cpu_var(tvec_bases);
1637} 1594}
1638#endif /* CONFIG_HOTPLUG_CPU */
1639 1595
1640static int timer_cpu_notify(struct notifier_block *self, 1596static int timer_cpu_notify(struct notifier_block *self,
1641 unsigned long action, void *hcpu) 1597 unsigned long action, void *hcpu)
1642{ 1598{
1643 long cpu = (long)hcpu; 1599 switch (action) {
1644 int err;
1645
1646 switch(action) {
1647 case CPU_UP_PREPARE:
1648 case CPU_UP_PREPARE_FROZEN:
1649 err = init_timers_cpu(cpu);
1650 if (err < 0)
1651 return notifier_from_errno(err);
1652 break;
1653#ifdef CONFIG_HOTPLUG_CPU
1654 case CPU_DEAD: 1600 case CPU_DEAD:
1655 case CPU_DEAD_FROZEN: 1601 case CPU_DEAD_FROZEN:
1656 migrate_timers(cpu); 1602 migrate_timers((long)hcpu);
1657 break; 1603 break;
1658#endif
1659 default: 1604 default:
1660 break; 1605 break;
1661 } 1606 }
1607
1662 return NOTIFY_OK; 1608 return NOTIFY_OK;
1663} 1609}
1664 1610
1665static struct notifier_block timers_nb = { 1611static inline void timer_register_cpu_notifier(void)
1666 .notifier_call = timer_cpu_notify, 1612{
1667}; 1613 cpu_notifier(timer_cpu_notify, 0);
1614}
1615#else
1616static inline void timer_register_cpu_notifier(void) { }
1617#endif /* CONFIG_HOTPLUG_CPU */
1668 1618
1619static void __init init_timer_cpu(struct tvec_base *base, int cpu)
1620{
1621 int j;
1669 1622
1670void __init init_timers(void) 1623 BUG_ON(base != tbase_get_base(base));
1624
1625 base->cpu = cpu;
1626 per_cpu(tvec_bases, cpu) = base;
1627 spin_lock_init(&base->lock);
1628
1629 for (j = 0; j < TVN_SIZE; j++) {
1630 INIT_LIST_HEAD(base->tv5.vec + j);
1631 INIT_LIST_HEAD(base->tv4.vec + j);
1632 INIT_LIST_HEAD(base->tv3.vec + j);
1633 INIT_LIST_HEAD(base->tv2.vec + j);
1634 }
1635 for (j = 0; j < TVR_SIZE; j++)
1636 INIT_LIST_HEAD(base->tv1.vec + j);
1637
1638 base->timer_jiffies = jiffies;
1639 base->next_timer = base->timer_jiffies;
1640}
1641
1642static void __init init_timer_cpus(void)
1671{ 1643{
1672 int err; 1644 struct tvec_base *base;
1645 int local_cpu = smp_processor_id();
1646 int cpu;
1673 1647
1648 for_each_possible_cpu(cpu) {
1649 if (cpu == local_cpu)
1650 base = &boot_tvec_bases;
1651#ifdef CONFIG_SMP
1652 else
1653 base = per_cpu_ptr(&__tvec_bases, cpu);
1654#endif
1655
1656 init_timer_cpu(base, cpu);
1657 }
1658}
1659
1660void __init init_timers(void)
1661{
1674 /* ensure there are enough low bits for flags in timer->base pointer */ 1662 /* ensure there are enough low bits for flags in timer->base pointer */
1675 BUILD_BUG_ON(__alignof__(struct tvec_base) & TIMER_FLAG_MASK); 1663 BUILD_BUG_ON(__alignof__(struct tvec_base) & TIMER_FLAG_MASK);
1676 1664
1677 err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, 1665 init_timer_cpus();
1678 (void *)(long)smp_processor_id());
1679 BUG_ON(err != NOTIFY_OK);
1680
1681 init_timer_stats(); 1666 init_timer_stats();
1682 register_cpu_notifier(&timers_nb); 1667 timer_register_cpu_notifier();
1683 open_softirq(TIMER_SOFTIRQ, run_timer_softirq); 1668 open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
1684} 1669}
1685 1670
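The new init_timer_cpu() above asserts BUG_ON(base != tbase_get_base(base)), and init_timers() keeps the BUILD_BUG_ON(__alignof__(struct tvec_base) & TIMER_FLAG_MASK) check, because the timer code stores flag bits in the low bits of the tvec_base pointer. A stand-alone user-space sketch of that low-bit packing trick (struct base, FLAG_MASK and the flag value are illustrative names, not kernel code):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define FLAG_MASK 0x3UL			/* stand-in for TIMER_FLAG_MASK */

struct base { long pad; } __attribute__((aligned(4)));

int main(void)
{
	static struct base b;
	uintptr_t packed;

	/* the alignment guarantee: the low flag bits of &b are zero */
	assert(((uintptr_t)&b & FLAG_MASK) == 0);

	packed = (uintptr_t)&b | 0x1;	/* borrow a low bit for a flag */
	printf("base=%p flag=%lu\n",
	       (void *)(packed & ~FLAG_MASK),
	       (unsigned long)(packed & FLAG_MASK));
	return 0;
}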
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 61ed862cdd37..e878c2e0ba45 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -16,10 +16,10 @@
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/seq_file.h> 17#include <linux/seq_file.h>
18#include <linux/kallsyms.h> 18#include <linux/kallsyms.h>
19#include <linux/tick.h>
20 19
21#include <asm/uaccess.h> 20#include <asm/uaccess.h>
22 21
22#include "tick-internal.h"
23 23
24struct timer_list_iter { 24struct timer_list_iter {
25 int cpu; 25 int cpu;
@@ -228,9 +228,35 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
228 print_name_offset(m, dev->set_next_event); 228 print_name_offset(m, dev->set_next_event);
229 SEQ_printf(m, "\n"); 229 SEQ_printf(m, "\n");
230 230
231 SEQ_printf(m, " set_mode: "); 231 if (dev->set_mode) {
232 print_name_offset(m, dev->set_mode); 232 SEQ_printf(m, " set_mode: ");
233 SEQ_printf(m, "\n"); 233 print_name_offset(m, dev->set_mode);
234 SEQ_printf(m, "\n");
235 } else {
236 if (dev->set_state_shutdown) {
237 SEQ_printf(m, " shutdown: ");
238 print_name_offset(m, dev->set_state_shutdown);
239 SEQ_printf(m, "\n");
240 }
241
242 if (dev->set_state_periodic) {
243 SEQ_printf(m, " periodic: ");
244 print_name_offset(m, dev->set_state_periodic);
245 SEQ_printf(m, "\n");
246 }
247
248 if (dev->set_state_oneshot) {
249 SEQ_printf(m, " oneshot: ");
250 print_name_offset(m, dev->set_state_oneshot);
251 SEQ_printf(m, "\n");
252 }
253
254 if (dev->tick_resume) {
255 SEQ_printf(m, " resume: ");
256 print_name_offset(m, dev->tick_resume);
257 SEQ_printf(m, "\n");
258 }
259 }
234 260
235 SEQ_printf(m, " event_handler: "); 261 SEQ_printf(m, " event_handler: ");
236 print_name_offset(m, dev->event_handler); 262 print_name_offset(m, dev->event_handler);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index a5da09c899dd..3b9a48ae153a 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -432,6 +432,14 @@ config UPROBE_EVENT
432 This option is required if you plan to use perf-probe subcommand 432 This option is required if you plan to use perf-probe subcommand
433 of perf tools on user space applications. 433 of perf tools on user space applications.
434 434
435config BPF_EVENTS
436 depends on BPF_SYSCALL
437 depends on KPROBE_EVENT
438 bool
439 default y
440 help
441 This allows the user to attach BPF programs to kprobe events.
442
435config PROBE_EVENTS 443config PROBE_EVENTS
436 def_bool n 444 def_bool n
437 445
@@ -599,6 +607,34 @@ config RING_BUFFER_STARTUP_TEST
599 607
600 If unsure, say N 608 If unsure, say N
601 609
610config TRACE_ENUM_MAP_FILE
611 bool "Show enum mappings for trace events"
612 depends on TRACING
613 help
614 The "print fmt" of the trace events will show the enum names instead
615 of their values. This can cause problems for user space tools that
616 use this string to parse the raw data as user space does not know
617 how to convert the string to its value.
618
619 To fix this, there's a special macro in the kernel that can be used
620 to convert the enum into its value. If this macro is used, then the
621 print fmt strings will have the enums converted to their values.
622
623 If something does not get converted properly, this option can be
624 used to show what enums the kernel tried to convert.
625
626 This option is for debugging the enum conversions. A file is created
627 in the tracing directory called "enum_map" that will show the enum
628 names matched with their values and what trace event system they
 629 belong to.
630
631 Normally, the mapping of the strings to values will be freed after
632 boot up or module load. With this option, they will not be freed, as
633 they are needed for the "enum_map" file. Enabling this option will
634 increase the memory footprint of the running kernel.
635
636 If unsure, say N
637
602endif # FTRACE 638endif # FTRACE
603 639
604endif # TRACING_SUPPORT 640endif # TRACING_SUPPORT
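The enum_map file that TRACE_ENUM_MAP_FILE describes prints one "NAME VALUE (system)" line per saved mapping (see enum_map_show() in the trace.c hunk further down). A minimal user-space reader, assuming the usual debugfs/tracing location; the path is a convention, not something this Kconfig entry guarantees:

#include <stdio.h>

int main(void)
{
	/* path assumes the debugfs/tracing automount added in this series */
	FILE *f = fopen("/sys/kernel/debug/tracing/enum_map", "r");
	char line[256];

	if (!f) {
		perror("enum_map");
		return 1;
	}
	while (fgets(line, sizeof(line), f))	/* "NAME VALUE (system)" */
		fputs(line, stdout);
	fclose(f);
	return 0;
}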
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 98f26588255e..9b1044e936a6 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -53,6 +53,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
53endif 53endif
54obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o 54obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
55obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o 55obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o
56obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o
56obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o 57obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
57obj-$(CONFIG_TRACEPOINTS) += power-traces.o 58obj-$(CONFIG_TRACEPOINTS) += power-traces.o
58ifeq ($(CONFIG_PM),y) 59ifeq ($(CONFIG_PM),y)
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
new file mode 100644
index 000000000000..2d56ce501632
--- /dev/null
+++ b/kernel/trace/bpf_trace.c
@@ -0,0 +1,222 @@
1/* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 */
7#include <linux/kernel.h>
8#include <linux/types.h>
9#include <linux/slab.h>
10#include <linux/bpf.h>
11#include <linux/filter.h>
12#include <linux/uaccess.h>
13#include <linux/ctype.h>
14#include "trace.h"
15
16static DEFINE_PER_CPU(int, bpf_prog_active);
17
18/**
19 * trace_call_bpf - invoke BPF program
20 * @prog: BPF program
21 * @ctx: opaque context pointer
22 *
23 * kprobe handlers execute BPF programs via this helper.
24 * Can be used from static tracepoints in the future.
25 *
26 * Return: BPF programs always return an integer which is interpreted by
27 * kprobe handler as:
28 * 0 - return from kprobe (event is filtered out)
29 * 1 - store kprobe event into ring buffer
30 * Other values are reserved and currently alias to 1
31 */
32unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
33{
34 unsigned int ret;
35
36 if (in_nmi()) /* not supported yet */
37 return 1;
38
39 preempt_disable();
40
41 if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
42 /*
43 * since some bpf program is already running on this cpu,
44 * don't call into another bpf program (same or different)
45 * and don't send kprobe event into ring-buffer,
46 * so return zero here
47 */
48 ret = 0;
49 goto out;
50 }
51
52 rcu_read_lock();
53 ret = BPF_PROG_RUN(prog, ctx);
54 rcu_read_unlock();
55
56 out:
57 __this_cpu_dec(bpf_prog_active);
58 preempt_enable();
59
60 return ret;
61}
62EXPORT_SYMBOL_GPL(trace_call_bpf);
63
64static u64 bpf_probe_read(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
65{
66 void *dst = (void *) (long) r1;
67 int size = (int) r2;
68 void *unsafe_ptr = (void *) (long) r3;
69
70 return probe_kernel_read(dst, unsafe_ptr, size);
71}
72
73static const struct bpf_func_proto bpf_probe_read_proto = {
74 .func = bpf_probe_read,
75 .gpl_only = true,
76 .ret_type = RET_INTEGER,
77 .arg1_type = ARG_PTR_TO_STACK,
78 .arg2_type = ARG_CONST_STACK_SIZE,
79 .arg3_type = ARG_ANYTHING,
80};
81
82static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
83{
84 /* NMI safe access to clock monotonic */
85 return ktime_get_mono_fast_ns();
86}
87
88static const struct bpf_func_proto bpf_ktime_get_ns_proto = {
89 .func = bpf_ktime_get_ns,
90 .gpl_only = true,
91 .ret_type = RET_INTEGER,
92};
93
94/*
95 * limited trace_printk()
96 * only %d %u %x %ld %lu %lx %lld %llu %llx %p conversion specifiers allowed
97 */
98static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5)
99{
100 char *fmt = (char *) (long) r1;
101 int mod[3] = {};
102 int fmt_cnt = 0;
103 int i;
104
105 /*
106 * bpf_check()->check_func_arg()->check_stack_boundary()
107 * guarantees that fmt points to bpf program stack,
108 * fmt_size bytes of it were initialized and fmt_size > 0
109 */
110 if (fmt[--fmt_size] != 0)
111 return -EINVAL;
112
113 /* check format string for allowed specifiers */
114 for (i = 0; i < fmt_size; i++) {
115 if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i]))
116 return -EINVAL;
117
118 if (fmt[i] != '%')
119 continue;
120
121 if (fmt_cnt >= 3)
122 return -EINVAL;
123
124 /* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */
125 i++;
126 if (fmt[i] == 'l') {
127 mod[fmt_cnt]++;
128 i++;
129 } else if (fmt[i] == 'p') {
130 mod[fmt_cnt]++;
131 i++;
132 if (!isspace(fmt[i]) && !ispunct(fmt[i]) && fmt[i] != 0)
133 return -EINVAL;
134 fmt_cnt++;
135 continue;
136 }
137
138 if (fmt[i] == 'l') {
139 mod[fmt_cnt]++;
140 i++;
141 }
142
143 if (fmt[i] != 'd' && fmt[i] != 'u' && fmt[i] != 'x')
144 return -EINVAL;
145 fmt_cnt++;
146 }
147
148 return __trace_printk(1/* fake ip will not be printed */, fmt,
149 mod[0] == 2 ? r3 : mod[0] == 1 ? (long) r3 : (u32) r3,
150 mod[1] == 2 ? r4 : mod[1] == 1 ? (long) r4 : (u32) r4,
151 mod[2] == 2 ? r5 : mod[2] == 1 ? (long) r5 : (u32) r5);
152}
153
154static const struct bpf_func_proto bpf_trace_printk_proto = {
155 .func = bpf_trace_printk,
156 .gpl_only = true,
157 .ret_type = RET_INTEGER,
158 .arg1_type = ARG_PTR_TO_STACK,
159 .arg2_type = ARG_CONST_STACK_SIZE,
160};
161
162static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
163{
164 switch (func_id) {
165 case BPF_FUNC_map_lookup_elem:
166 return &bpf_map_lookup_elem_proto;
167 case BPF_FUNC_map_update_elem:
168 return &bpf_map_update_elem_proto;
169 case BPF_FUNC_map_delete_elem:
170 return &bpf_map_delete_elem_proto;
171 case BPF_FUNC_probe_read:
172 return &bpf_probe_read_proto;
173 case BPF_FUNC_ktime_get_ns:
174 return &bpf_ktime_get_ns_proto;
175
176 case BPF_FUNC_trace_printk:
177 /*
178 * this program might be calling bpf_trace_printk,
179 * so allocate per-cpu printk buffers
180 */
181 trace_printk_init_buffers();
182
183 return &bpf_trace_printk_proto;
184 default:
185 return NULL;
186 }
187}
188
189/* bpf+kprobe programs can access fields of 'struct pt_regs' */
190static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type)
191{
192 /* check bounds */
193 if (off < 0 || off >= sizeof(struct pt_regs))
194 return false;
195
196 /* only read is allowed */
197 if (type != BPF_READ)
198 return false;
199
200 /* disallow misaligned access */
201 if (off % size != 0)
202 return false;
203
204 return true;
205}
206
207static struct bpf_verifier_ops kprobe_prog_ops = {
208 .get_func_proto = kprobe_prog_func_proto,
209 .is_valid_access = kprobe_prog_is_valid_access,
210};
211
212static struct bpf_prog_type_list kprobe_tl = {
213 .ops = &kprobe_prog_ops,
214 .type = BPF_PROG_TYPE_KPROBE,
215};
216
217static int __init register_kprobe_prog_ops(void)
218{
219 bpf_register_prog_type(&kprobe_tl);
220 return 0;
221}
222late_initcall(register_kprobe_prog_ops);
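For orientation, a sketch of the kind of program this file lets kprobes run, modeled on the samples/bpf style of the same era. The section naming, the version record, the loader conventions and the x86-64 ctx->di access are assumptions; the helper signature and the 0/1 return convention come from the bpf_trace_printk prototype and the trace_call_bpf() comment above:

/* build (illustrative): clang -O2 -target bpf -c kprobe_write.c */
#include <linux/version.h>
#include <linux/ptrace.h>
#include <uapi/linux/bpf.h>

/* helper resolved by its ID, matching bpf_trace_printk_proto above */
static int (*bpf_trace_printk)(const char *fmt, int fmt_size, ...) =
	(void *) BPF_FUNC_trace_printk;

__attribute__((section("kprobe/sys_write"), used))
int trace_sys_write(struct pt_regs *ctx)
{
	char fmt[] = "write fd %d\n";
	long fd = ctx->di;		/* first argument register on x86-64 */

	bpf_trace_printk(fmt, sizeof(fmt), fd);
	return 1;			/* 1 = store the kprobe event */
}

char _license[] __attribute__((section("license"), used)) = "GPL";
unsigned int _version __attribute__((section("version"), used)) = LINUX_VERSION_CODE;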
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 4f228024055b..02bece4a99ea 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -18,7 +18,7 @@
18#include <linux/kallsyms.h> 18#include <linux/kallsyms.h>
19#include <linux/seq_file.h> 19#include <linux/seq_file.h>
20#include <linux/suspend.h> 20#include <linux/suspend.h>
21#include <linux/debugfs.h> 21#include <linux/tracefs.h>
22#include <linux/hardirq.h> 22#include <linux/hardirq.h>
23#include <linux/kthread.h> 23#include <linux/kthread.h>
24#include <linux/uaccess.h> 24#include <linux/uaccess.h>
@@ -249,6 +249,19 @@ static void update_function_graph_func(void);
249static inline void update_function_graph_func(void) { } 249static inline void update_function_graph_func(void) { }
250#endif 250#endif
251 251
252
253static ftrace_func_t ftrace_ops_get_list_func(struct ftrace_ops *ops)
254{
255 /*
256 * If this is a dynamic ops or we force list func,
257 * then it needs to call the list anyway.
258 */
259 if (ops->flags & FTRACE_OPS_FL_DYNAMIC || FTRACE_FORCE_LIST_FUNC)
260 return ftrace_ops_list_func;
261
262 return ftrace_ops_get_func(ops);
263}
264
252static void update_ftrace_function(void) 265static void update_ftrace_function(void)
253{ 266{
254 ftrace_func_t func; 267 ftrace_func_t func;
@@ -270,7 +283,7 @@ static void update_ftrace_function(void)
270 * then have the mcount trampoline call the function directly. 283 * then have the mcount trampoline call the function directly.
271 */ 284 */
272 } else if (ftrace_ops_list->next == &ftrace_list_end) { 285 } else if (ftrace_ops_list->next == &ftrace_list_end) {
273 func = ftrace_ops_get_func(ftrace_ops_list); 286 func = ftrace_ops_get_list_func(ftrace_ops_list);
274 287
275 } else { 288 } else {
276 /* Just use the default ftrace_ops */ 289 /* Just use the default ftrace_ops */
@@ -1008,7 +1021,7 @@ static struct tracer_stat function_stats __initdata = {
1008 .stat_show = function_stat_show 1021 .stat_show = function_stat_show
1009}; 1022};
1010 1023
1011static __init void ftrace_profile_debugfs(struct dentry *d_tracer) 1024static __init void ftrace_profile_tracefs(struct dentry *d_tracer)
1012{ 1025{
1013 struct ftrace_profile_stat *stat; 1026 struct ftrace_profile_stat *stat;
1014 struct dentry *entry; 1027 struct dentry *entry;
@@ -1044,15 +1057,15 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
1044 } 1057 }
1045 } 1058 }
1046 1059
1047 entry = debugfs_create_file("function_profile_enabled", 0644, 1060 entry = tracefs_create_file("function_profile_enabled", 0644,
1048 d_tracer, NULL, &ftrace_profile_fops); 1061 d_tracer, NULL, &ftrace_profile_fops);
1049 if (!entry) 1062 if (!entry)
1050 pr_warning("Could not create debugfs " 1063 pr_warning("Could not create tracefs "
1051 "'function_profile_enabled' entry\n"); 1064 "'function_profile_enabled' entry\n");
1052} 1065}
1053 1066
1054#else /* CONFIG_FUNCTION_PROFILER */ 1067#else /* CONFIG_FUNCTION_PROFILER */
1055static __init void ftrace_profile_debugfs(struct dentry *d_tracer) 1068static __init void ftrace_profile_tracefs(struct dentry *d_tracer)
1056{ 1069{
1057} 1070}
1058#endif /* CONFIG_FUNCTION_PROFILER */ 1071#endif /* CONFIG_FUNCTION_PROFILER */
@@ -4712,7 +4725,7 @@ void ftrace_destroy_filter_files(struct ftrace_ops *ops)
4712 mutex_unlock(&ftrace_lock); 4725 mutex_unlock(&ftrace_lock);
4713} 4726}
4714 4727
4715static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) 4728static __init int ftrace_init_dyn_tracefs(struct dentry *d_tracer)
4716{ 4729{
4717 4730
4718 trace_create_file("available_filter_functions", 0444, 4731 trace_create_file("available_filter_functions", 0444,
@@ -5020,7 +5033,7 @@ static int __init ftrace_nodyn_init(void)
5020} 5033}
5021core_initcall(ftrace_nodyn_init); 5034core_initcall(ftrace_nodyn_init);
5022 5035
5023static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } 5036static inline int ftrace_init_dyn_tracefs(struct dentry *d_tracer) { return 0; }
5024static inline void ftrace_startup_enable(int command) { } 5037static inline void ftrace_startup_enable(int command) { }
5025static inline void ftrace_startup_all(int command) { } 5038static inline void ftrace_startup_all(int command) { }
5026/* Keep as macros so we do not need to define the commands */ 5039/* Keep as macros so we do not need to define the commands */
@@ -5209,13 +5222,6 @@ static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip,
5209ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops) 5222ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops)
5210{ 5223{
5211 /* 5224 /*
5212 * If this is a dynamic ops or we force list func,
5213 * then it needs to call the list anyway.
5214 */
5215 if (ops->flags & FTRACE_OPS_FL_DYNAMIC || FTRACE_FORCE_LIST_FUNC)
5216 return ftrace_ops_list_func;
5217
5218 /*
5219 * If the func handles its own recursion, call it directly. 5225 * If the func handles its own recursion, call it directly.
5220 * Otherwise call the recursion protected function that 5226 * Otherwise call the recursion protected function that
5221 * will call the ftrace ops function. 5227 * will call the ftrace ops function.
@@ -5473,7 +5479,7 @@ static const struct file_operations ftrace_pid_fops = {
5473 .release = ftrace_pid_release, 5479 .release = ftrace_pid_release,
5474}; 5480};
5475 5481
5476static __init int ftrace_init_debugfs(void) 5482static __init int ftrace_init_tracefs(void)
5477{ 5483{
5478 struct dentry *d_tracer; 5484 struct dentry *d_tracer;
5479 5485
@@ -5481,16 +5487,16 @@ static __init int ftrace_init_debugfs(void)
5481 if (IS_ERR(d_tracer)) 5487 if (IS_ERR(d_tracer))
5482 return 0; 5488 return 0;
5483 5489
5484 ftrace_init_dyn_debugfs(d_tracer); 5490 ftrace_init_dyn_tracefs(d_tracer);
5485 5491
5486 trace_create_file("set_ftrace_pid", 0644, d_tracer, 5492 trace_create_file("set_ftrace_pid", 0644, d_tracer,
5487 NULL, &ftrace_pid_fops); 5493 NULL, &ftrace_pid_fops);
5488 5494
5489 ftrace_profile_debugfs(d_tracer); 5495 ftrace_profile_tracefs(d_tracer);
5490 5496
5491 return 0; 5497 return 0;
5492} 5498}
5493fs_initcall(ftrace_init_debugfs); 5499fs_initcall(ftrace_init_tracefs);
5494 5500
5495/** 5501/**
5496 * ftrace_kill - kill ftrace 5502 * ftrace_kill - kill ftrace
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 5040d44fe5a3..0315d43176d8 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2679,7 +2679,7 @@ static DEFINE_PER_CPU(unsigned int, current_context);
2679 2679
2680static __always_inline int trace_recursive_lock(void) 2680static __always_inline int trace_recursive_lock(void)
2681{ 2681{
2682 unsigned int val = this_cpu_read(current_context); 2682 unsigned int val = __this_cpu_read(current_context);
2683 int bit; 2683 int bit;
2684 2684
2685 if (in_interrupt()) { 2685 if (in_interrupt()) {
@@ -2696,18 +2696,14 @@ static __always_inline int trace_recursive_lock(void)
2696 return 1; 2696 return 1;
2697 2697
2698 val |= (1 << bit); 2698 val |= (1 << bit);
2699 this_cpu_write(current_context, val); 2699 __this_cpu_write(current_context, val);
2700 2700
2701 return 0; 2701 return 0;
2702} 2702}
2703 2703
2704static __always_inline void trace_recursive_unlock(void) 2704static __always_inline void trace_recursive_unlock(void)
2705{ 2705{
2706 unsigned int val = this_cpu_read(current_context); 2706 __this_cpu_and(current_context, __this_cpu_read(current_context) - 1);
2707
2708 val--;
2709 val &= this_cpu_read(current_context);
2710 this_cpu_write(current_context, val);
2711} 2707}
2712 2708
2713#else 2709#else
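The one-line trace_recursive_unlock() above works because val & (val - 1) clears the lowest set bit, and the matching trace_recursive_lock() hands lower bit numbers to deeper contexts (NMI lowest, normal context highest in this version), so the bit cleared is always the one the most recent lock set. A stand-alone check of that arithmetic; the bit numbers are illustrative:

#include <assert.h>

int main(void)
{
	unsigned int val = 0;

	val |= 1u << 3;		/* outer lock: normal context, bit 3 */
	val |= 1u << 1;		/* nested lock: irq context, bit 1 */

	val &= val - 1;		/* unlock: clears the lowest set bit (irq) */
	assert(val == 1u << 3);

	val &= val - 1;		/* unlock: clears the remaining (normal) bit */
	assert(val == 0);
	return 0;
}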
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 62c6506d663f..91eecaaa43e0 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -20,6 +20,7 @@
20#include <linux/notifier.h> 20#include <linux/notifier.h>
21#include <linux/irqflags.h> 21#include <linux/irqflags.h>
22#include <linux/debugfs.h> 22#include <linux/debugfs.h>
23#include <linux/tracefs.h>
23#include <linux/pagemap.h> 24#include <linux/pagemap.h>
24#include <linux/hardirq.h> 25#include <linux/hardirq.h>
25#include <linux/linkage.h> 26#include <linux/linkage.h>
@@ -31,6 +32,7 @@
31#include <linux/splice.h> 32#include <linux/splice.h>
32#include <linux/kdebug.h> 33#include <linux/kdebug.h>
33#include <linux/string.h> 34#include <linux/string.h>
35#include <linux/mount.h>
34#include <linux/rwsem.h> 36#include <linux/rwsem.h>
35#include <linux/slab.h> 37#include <linux/slab.h>
36#include <linux/ctype.h> 38#include <linux/ctype.h>
@@ -123,6 +125,42 @@ enum ftrace_dump_mode ftrace_dump_on_oops;
123/* When set, tracing will stop when a WARN*() is hit */ 125/* When set, tracing will stop when a WARN*() is hit */
124int __disable_trace_on_warning; 126int __disable_trace_on_warning;
125 127
128#ifdef CONFIG_TRACE_ENUM_MAP_FILE
129/* Map of enums to their values, for "enum_map" file */
130struct trace_enum_map_head {
131 struct module *mod;
132 unsigned long length;
133};
134
135union trace_enum_map_item;
136
137struct trace_enum_map_tail {
138 /*
139 * "end" is first and points to NULL as it must be different
140 * than "mod" or "enum_string"
141 */
142 union trace_enum_map_item *next;
143 const char *end; /* points to NULL */
144};
145
146static DEFINE_MUTEX(trace_enum_mutex);
147
148/*
149 * The trace_enum_maps are saved in an array with two extra elements,
150 * one at the beginning, and one at the end. The beginning item contains
151 * the count of the saved maps (head.length), and the module they
152 * belong to if not built in (head.mod). The ending item contains a
153 * pointer to the next array of saved enum_map items.
154 */
155union trace_enum_map_item {
156 struct trace_enum_map map;
157 struct trace_enum_map_head head;
158 struct trace_enum_map_tail tail;
159};
160
161static union trace_enum_map_item *trace_enum_maps;
162#endif /* CONFIG_TRACE_ENUM_MAP_FILE */
163
126static int tracing_set_tracer(struct trace_array *tr, const char *buf); 164static int tracing_set_tracer(struct trace_array *tr, const char *buf);
127 165
128#define MAX_TRACER_SIZE 100 166#define MAX_TRACER_SIZE 100
@@ -3908,6 +3946,182 @@ static const struct file_operations tracing_saved_cmdlines_size_fops = {
3908 .write = tracing_saved_cmdlines_size_write, 3946 .write = tracing_saved_cmdlines_size_write,
3909}; 3947};
3910 3948
3949#ifdef CONFIG_TRACE_ENUM_MAP_FILE
3950static union trace_enum_map_item *
3951update_enum_map(union trace_enum_map_item *ptr)
3952{
3953 if (!ptr->map.enum_string) {
3954 if (ptr->tail.next) {
3955 ptr = ptr->tail.next;
3956 /* Set ptr to the next real item (skip head) */
3957 ptr++;
3958 } else
3959 return NULL;
3960 }
3961 return ptr;
3962}
3963
3964static void *enum_map_next(struct seq_file *m, void *v, loff_t *pos)
3965{
3966 union trace_enum_map_item *ptr = v;
3967
3968 /*
3969 * Paranoid! If ptr points to end, we don't want to increment past it.
3970 * This really should never happen.
3971 */
3972 ptr = update_enum_map(ptr);
3973 if (WARN_ON_ONCE(!ptr))
3974 return NULL;
3975
3976 ptr++;
3977
3978 (*pos)++;
3979
3980 ptr = update_enum_map(ptr);
3981
3982 return ptr;
3983}
3984
3985static void *enum_map_start(struct seq_file *m, loff_t *pos)
3986{
3987 union trace_enum_map_item *v;
3988 loff_t l = 0;
3989
3990 mutex_lock(&trace_enum_mutex);
3991
3992 v = trace_enum_maps;
3993 if (v)
3994 v++;
3995
3996 while (v && l < *pos) {
3997 v = enum_map_next(m, v, &l);
3998 }
3999
4000 return v;
4001}
4002
4003static void enum_map_stop(struct seq_file *m, void *v)
4004{
4005 mutex_unlock(&trace_enum_mutex);
4006}
4007
4008static int enum_map_show(struct seq_file *m, void *v)
4009{
4010 union trace_enum_map_item *ptr = v;
4011
4012 seq_printf(m, "%s %ld (%s)\n",
4013 ptr->map.enum_string, ptr->map.enum_value,
4014 ptr->map.system);
4015
4016 return 0;
4017}
4018
4019static const struct seq_operations tracing_enum_map_seq_ops = {
4020 .start = enum_map_start,
4021 .next = enum_map_next,
4022 .stop = enum_map_stop,
4023 .show = enum_map_show,
4024};
4025
4026static int tracing_enum_map_open(struct inode *inode, struct file *filp)
4027{
4028 if (tracing_disabled)
4029 return -ENODEV;
4030
4031 return seq_open(filp, &tracing_enum_map_seq_ops);
4032}
4033
4034static const struct file_operations tracing_enum_map_fops = {
4035 .open = tracing_enum_map_open,
4036 .read = seq_read,
4037 .llseek = seq_lseek,
4038 .release = seq_release,
4039};
4040
4041static inline union trace_enum_map_item *
4042trace_enum_jmp_to_tail(union trace_enum_map_item *ptr)
4043{
4044 /* Return tail of array given the head */
4045 return ptr + ptr->head.length + 1;
4046}
4047
4048static void
4049trace_insert_enum_map_file(struct module *mod, struct trace_enum_map **start,
4050 int len)
4051{
4052 struct trace_enum_map **stop;
4053 struct trace_enum_map **map;
4054 union trace_enum_map_item *map_array;
4055 union trace_enum_map_item *ptr;
4056
4057 stop = start + len;
4058
4059 /*
4060 * The trace_enum_maps contains the map plus a head and tail item,
4061 * where the head holds the module and length of array, and the
4062 * tail holds a pointer to the next list.
4063 */
4064 map_array = kmalloc(sizeof(*map_array) * (len + 2), GFP_KERNEL);
4065 if (!map_array) {
4066 pr_warning("Unable to allocate trace enum mapping\n");
4067 return;
4068 }
4069
4070 mutex_lock(&trace_enum_mutex);
4071
4072 if (!trace_enum_maps)
4073 trace_enum_maps = map_array;
4074 else {
4075 ptr = trace_enum_maps;
4076 for (;;) {
4077 ptr = trace_enum_jmp_to_tail(ptr);
4078 if (!ptr->tail.next)
4079 break;
4080 ptr = ptr->tail.next;
4081
4082 }
4083 ptr->tail.next = map_array;
4084 }
4085 map_array->head.mod = mod;
4086 map_array->head.length = len;
4087 map_array++;
4088
4089 for (map = start; (unsigned long)map < (unsigned long)stop; map++) {
4090 map_array->map = **map;
4091 map_array++;
4092 }
4093 memset(map_array, 0, sizeof(*map_array));
4094
4095 mutex_unlock(&trace_enum_mutex);
4096}
4097
4098static void trace_create_enum_file(struct dentry *d_tracer)
4099{
4100 trace_create_file("enum_map", 0444, d_tracer,
4101 NULL, &tracing_enum_map_fops);
4102}
4103
4104#else /* CONFIG_TRACE_ENUM_MAP_FILE */
4105static inline void trace_create_enum_file(struct dentry *d_tracer) { }
4106static inline void trace_insert_enum_map_file(struct module *mod,
4107 struct trace_enum_map **start, int len) { }
4108#endif /* !CONFIG_TRACE_ENUM_MAP_FILE */
4109
4110static void trace_insert_enum_map(struct module *mod,
4111 struct trace_enum_map **start, int len)
4112{
4113 struct trace_enum_map **map;
4114
4115 if (len <= 0)
4116 return;
4117
4118 map = start;
4119
4120 trace_event_enum_update(map, len);
4121
4122 trace_insert_enum_map_file(mod, start, len);
4123}
4124
3911static ssize_t 4125static ssize_t
3912tracing_set_trace_read(struct file *filp, char __user *ubuf, 4126tracing_set_trace_read(struct file *filp, char __user *ubuf,
3913 size_t cnt, loff_t *ppos) 4127 size_t cnt, loff_t *ppos)
@@ -4105,9 +4319,24 @@ static void tracing_set_nop(struct trace_array *tr)
4105 tr->current_trace = &nop_trace; 4319 tr->current_trace = &nop_trace;
4106} 4320}
4107 4321
4108static int tracing_set_tracer(struct trace_array *tr, const char *buf) 4322static void update_tracer_options(struct trace_array *tr, struct tracer *t)
4109{ 4323{
4110 static struct trace_option_dentry *topts; 4324 static struct trace_option_dentry *topts;
4325
4326 /* Only enable if the directory has been created already. */
4327 if (!tr->dir)
4328 return;
4329
4330 /* Currently, only the top instance has options */
4331 if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL))
4332 return;
4333
4334 destroy_trace_option_files(topts);
4335 topts = create_trace_option_files(tr, t);
4336}
4337
4338static int tracing_set_tracer(struct trace_array *tr, const char *buf)
4339{
4111 struct tracer *t; 4340 struct tracer *t;
4112#ifdef CONFIG_TRACER_MAX_TRACE 4341#ifdef CONFIG_TRACER_MAX_TRACE
4113 bool had_max_tr; 4342 bool had_max_tr;
@@ -4172,11 +4401,7 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf)
4172 free_snapshot(tr); 4401 free_snapshot(tr);
4173 } 4402 }
4174#endif 4403#endif
4175 /* Currently, only the top instance has options */ 4404 update_tracer_options(tr, t);
4176 if (tr->flags & TRACE_ARRAY_FL_GLOBAL) {
4177 destroy_trace_option_files(topts);
4178 topts = create_trace_option_files(tr, t);
4179 }
4180 4405
4181#ifdef CONFIG_TRACER_MAX_TRACE 4406#ifdef CONFIG_TRACER_MAX_TRACE
4182 if (t->use_max_tr && !had_max_tr) { 4407 if (t->use_max_tr && !had_max_tr) {
@@ -5817,6 +6042,14 @@ static inline __init int register_snapshot_cmd(void) { return 0; }
5817 6042
5818static struct dentry *tracing_get_dentry(struct trace_array *tr) 6043static struct dentry *tracing_get_dentry(struct trace_array *tr)
5819{ 6044{
6045 if (WARN_ON(!tr->dir))
6046 return ERR_PTR(-ENODEV);
6047
6048 /* Top directory uses NULL as the parent */
6049 if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
6050 return NULL;
6051
6052 /* All sub buffers have a descriptor */
5820 return tr->dir; 6053 return tr->dir;
5821} 6054}
5822 6055
@@ -5831,10 +6064,10 @@ static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu)
5831 if (IS_ERR(d_tracer)) 6064 if (IS_ERR(d_tracer))
5832 return NULL; 6065 return NULL;
5833 6066
5834 tr->percpu_dir = debugfs_create_dir("per_cpu", d_tracer); 6067 tr->percpu_dir = tracefs_create_dir("per_cpu", d_tracer);
5835 6068
5836 WARN_ONCE(!tr->percpu_dir, 6069 WARN_ONCE(!tr->percpu_dir,
5837 "Could not create debugfs directory 'per_cpu/%d'\n", cpu); 6070 "Could not create tracefs directory 'per_cpu/%d'\n", cpu);
5838 6071
5839 return tr->percpu_dir; 6072 return tr->percpu_dir;
5840} 6073}
@@ -5851,7 +6084,7 @@ trace_create_cpu_file(const char *name, umode_t mode, struct dentry *parent,
5851} 6084}
5852 6085
5853static void 6086static void
5854tracing_init_debugfs_percpu(struct trace_array *tr, long cpu) 6087tracing_init_tracefs_percpu(struct trace_array *tr, long cpu)
5855{ 6088{
5856 struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu); 6089 struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu);
5857 struct dentry *d_cpu; 6090 struct dentry *d_cpu;
@@ -5861,9 +6094,9 @@ tracing_init_debugfs_percpu(struct trace_array *tr, long cpu)
5861 return; 6094 return;
5862 6095
5863 snprintf(cpu_dir, 30, "cpu%ld", cpu); 6096 snprintf(cpu_dir, 30, "cpu%ld", cpu);
5864 d_cpu = debugfs_create_dir(cpu_dir, d_percpu); 6097 d_cpu = tracefs_create_dir(cpu_dir, d_percpu);
5865 if (!d_cpu) { 6098 if (!d_cpu) {
5866 pr_warning("Could not create debugfs '%s' entry\n", cpu_dir); 6099 pr_warning("Could not create tracefs '%s' entry\n", cpu_dir);
5867 return; 6100 return;
5868 } 6101 }
5869 6102
@@ -6015,9 +6248,9 @@ struct dentry *trace_create_file(const char *name,
6015{ 6248{
6016 struct dentry *ret; 6249 struct dentry *ret;
6017 6250
6018 ret = debugfs_create_file(name, mode, parent, data, fops); 6251 ret = tracefs_create_file(name, mode, parent, data, fops);
6019 if (!ret) 6252 if (!ret)
6020 pr_warning("Could not create debugfs '%s' entry\n", name); 6253 pr_warning("Could not create tracefs '%s' entry\n", name);
6021 6254
6022 return ret; 6255 return ret;
6023} 6256}
@@ -6034,9 +6267,9 @@ static struct dentry *trace_options_init_dentry(struct trace_array *tr)
6034 if (IS_ERR(d_tracer)) 6267 if (IS_ERR(d_tracer))
6035 return NULL; 6268 return NULL;
6036 6269
6037 tr->options = debugfs_create_dir("options", d_tracer); 6270 tr->options = tracefs_create_dir("options", d_tracer);
6038 if (!tr->options) { 6271 if (!tr->options) {
6039 pr_warning("Could not create debugfs directory 'options'\n"); 6272 pr_warning("Could not create tracefs directory 'options'\n");
6040 return NULL; 6273 return NULL;
6041 } 6274 }
6042 6275
@@ -6105,7 +6338,7 @@ destroy_trace_option_files(struct trace_option_dentry *topts)
6105 return; 6338 return;
6106 6339
6107 for (cnt = 0; topts[cnt].opt; cnt++) 6340 for (cnt = 0; topts[cnt].opt; cnt++)
6108 debugfs_remove(topts[cnt].entry); 6341 tracefs_remove(topts[cnt].entry);
6109 6342
6110 kfree(topts); 6343 kfree(topts);
6111} 6344}
@@ -6194,7 +6427,7 @@ static const struct file_operations rb_simple_fops = {
6194struct dentry *trace_instance_dir; 6427struct dentry *trace_instance_dir;
6195 6428
6196static void 6429static void
6197init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer); 6430init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer);
6198 6431
6199static int 6432static int
6200allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size) 6433allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size)
@@ -6271,7 +6504,7 @@ static void free_trace_buffers(struct trace_array *tr)
6271#endif 6504#endif
6272} 6505}
6273 6506
6274static int new_instance_create(const char *name) 6507static int instance_mkdir(const char *name)
6275{ 6508{
6276 struct trace_array *tr; 6509 struct trace_array *tr;
6277 int ret; 6510 int ret;
@@ -6310,17 +6543,17 @@ static int new_instance_create(const char *name)
6310 if (allocate_trace_buffers(tr, trace_buf_size) < 0) 6543 if (allocate_trace_buffers(tr, trace_buf_size) < 0)
6311 goto out_free_tr; 6544 goto out_free_tr;
6312 6545
6313 tr->dir = debugfs_create_dir(name, trace_instance_dir); 6546 tr->dir = tracefs_create_dir(name, trace_instance_dir);
6314 if (!tr->dir) 6547 if (!tr->dir)
6315 goto out_free_tr; 6548 goto out_free_tr;
6316 6549
6317 ret = event_trace_add_tracer(tr->dir, tr); 6550 ret = event_trace_add_tracer(tr->dir, tr);
6318 if (ret) { 6551 if (ret) {
6319 debugfs_remove_recursive(tr->dir); 6552 tracefs_remove_recursive(tr->dir);
6320 goto out_free_tr; 6553 goto out_free_tr;
6321 } 6554 }
6322 6555
6323 init_tracer_debugfs(tr, tr->dir); 6556 init_tracer_tracefs(tr, tr->dir);
6324 6557
6325 list_add(&tr->list, &ftrace_trace_arrays); 6558 list_add(&tr->list, &ftrace_trace_arrays);
6326 6559
@@ -6341,7 +6574,7 @@ static int new_instance_create(const char *name)
6341 6574
6342} 6575}
6343 6576
6344static int instance_delete(const char *name) 6577static int instance_rmdir(const char *name)
6345{ 6578{
6346 struct trace_array *tr; 6579 struct trace_array *tr;
6347 int found = 0; 6580 int found = 0;
@@ -6382,82 +6615,17 @@ static int instance_delete(const char *name)
6382 return ret; 6615 return ret;
6383} 6616}
6384 6617
6385static int instance_mkdir (struct inode *inode, struct dentry *dentry, umode_t mode)
6386{
6387 struct dentry *parent;
6388 int ret;
6389
6390 /* Paranoid: Make sure the parent is the "instances" directory */
6391 parent = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
6392 if (WARN_ON_ONCE(parent != trace_instance_dir))
6393 return -ENOENT;
6394
6395 /*
6396 * The inode mutex is locked, but debugfs_create_dir() will also
6397 * take the mutex. As the instances directory can not be destroyed
6398 * or changed in any other way, it is safe to unlock it, and
6399 * let the dentry try. If two users try to make the same dir at
6400 * the same time, then the new_instance_create() will determine the
6401 * winner.
6402 */
6403 mutex_unlock(&inode->i_mutex);
6404
6405 ret = new_instance_create(dentry->d_iname);
6406
6407 mutex_lock(&inode->i_mutex);
6408
6409 return ret;
6410}
6411
6412static int instance_rmdir(struct inode *inode, struct dentry *dentry)
6413{
6414 struct dentry *parent;
6415 int ret;
6416
6417 /* Paranoid: Make sure the parent is the "instances" directory */
6418 parent = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
6419 if (WARN_ON_ONCE(parent != trace_instance_dir))
6420 return -ENOENT;
6421
6422 /* The caller did a dget() on dentry */
6423 mutex_unlock(&dentry->d_inode->i_mutex);
6424
6425 /*
6426 * The inode mutex is locked, but debugfs_create_dir() will also
6427 * take the mutex. As the instances directory can not be destroyed
6428 * or changed in any other way, it is safe to unlock it, and
6429 * let the dentry try. If two users try to make the same dir at
6430 * the same time, then the instance_delete() will determine the
6431 * winner.
6432 */
6433 mutex_unlock(&inode->i_mutex);
6434
6435 ret = instance_delete(dentry->d_iname);
6436
6437 mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
6438 mutex_lock(&dentry->d_inode->i_mutex);
6439
6440 return ret;
6441}
6442
6443static const struct inode_operations instance_dir_inode_operations = {
6444 .lookup = simple_lookup,
6445 .mkdir = instance_mkdir,
6446 .rmdir = instance_rmdir,
6447};
6448
6449static __init void create_trace_instances(struct dentry *d_tracer) 6618static __init void create_trace_instances(struct dentry *d_tracer)
6450{ 6619{
6451 trace_instance_dir = debugfs_create_dir("instances", d_tracer); 6620 trace_instance_dir = tracefs_create_instance_dir("instances", d_tracer,
6621 instance_mkdir,
6622 instance_rmdir);
6452 if (WARN_ON(!trace_instance_dir)) 6623 if (WARN_ON(!trace_instance_dir))
6453 return; 6624 return;
6454
6455 /* Hijack the dir inode operations, to allow mkdir */
6456 trace_instance_dir->d_inode->i_op = &instance_dir_inode_operations;
6457} 6625}
6458 6626
6459static void 6627static void
6460init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) 6628init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
6461{ 6629{
6462 int cpu; 6630 int cpu;
6463 6631
@@ -6511,10 +6679,32 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer)
6511#endif 6679#endif
6512 6680
6513 for_each_tracing_cpu(cpu) 6681 for_each_tracing_cpu(cpu)
6514 tracing_init_debugfs_percpu(tr, cpu); 6682 tracing_init_tracefs_percpu(tr, cpu);
6515 6683
6516} 6684}
6517 6685
 6686static struct vfsmount *trace_automount(void *ignore)
6687{
6688 struct vfsmount *mnt;
6689 struct file_system_type *type;
6690
6691 /*
6692 * To maintain backward compatibility for tools that mount
6693 * debugfs to get to the tracing facility, tracefs is automatically
6694 * mounted to the debugfs/tracing directory.
6695 */
6696 type = get_fs_type("tracefs");
6697 if (!type)
6698 return NULL;
6699 mnt = vfs_kern_mount(type, 0, "tracefs", NULL);
6700 put_filesystem(type);
6701 if (IS_ERR(mnt))
6702 return NULL;
6703 mntget(mnt);
6704
6705 return mnt;
6706}
6707
6518/** 6708/**
6519 * tracing_init_dentry - initialize top level trace array 6709 * tracing_init_dentry - initialize top level trace array
6520 * 6710 *
@@ -6526,23 +6716,112 @@ struct dentry *tracing_init_dentry(void)
6526{ 6716{
6527 struct trace_array *tr = &global_trace; 6717 struct trace_array *tr = &global_trace;
6528 6718
6719 /* The top level trace array uses NULL as parent */
6529 if (tr->dir) 6720 if (tr->dir)
6530 return tr->dir; 6721 return NULL;
6531 6722
6532 if (WARN_ON(!debugfs_initialized())) 6723 if (WARN_ON(!debugfs_initialized()))
6533 return ERR_PTR(-ENODEV); 6724 return ERR_PTR(-ENODEV);
6534 6725
6535 tr->dir = debugfs_create_dir("tracing", NULL); 6726 /*
6536 6727 * As there may still be users that expect the tracing
6728 * files to exist in debugfs/tracing, we must automount
6729 * the tracefs file system there, so older tools still
 6730 * work with the newer kernel.
6731 */
6732 tr->dir = debugfs_create_automount("tracing", NULL,
6733 trace_automount, NULL);
6537 if (!tr->dir) { 6734 if (!tr->dir) {
6538 pr_warn_once("Could not create debugfs directory 'tracing'\n"); 6735 pr_warn_once("Could not create debugfs directory 'tracing'\n");
6539 return ERR_PTR(-ENOMEM); 6736 return ERR_PTR(-ENOMEM);
6540 } 6737 }
6541 6738
6542 return tr->dir; 6739 return NULL;
6740}
6741
6742extern struct trace_enum_map *__start_ftrace_enum_maps[];
6743extern struct trace_enum_map *__stop_ftrace_enum_maps[];
6744
6745static void __init trace_enum_init(void)
6746{
6747 int len;
6748
6749 len = __stop_ftrace_enum_maps - __start_ftrace_enum_maps;
6750 trace_insert_enum_map(NULL, __start_ftrace_enum_maps, len);
6751}
6752
6753#ifdef CONFIG_MODULES
6754static void trace_module_add_enums(struct module *mod)
6755{
6756 if (!mod->num_trace_enums)
6757 return;
6758
6759 /*
6760 * Modules with bad taint do not have events created, do
6761 * not bother with enums either.
6762 */
6763 if (trace_module_has_bad_taint(mod))
6764 return;
6765
6766 trace_insert_enum_map(mod, mod->trace_enums, mod->num_trace_enums);
6543} 6767}
6544 6768
6545static __init int tracer_init_debugfs(void) 6769#ifdef CONFIG_TRACE_ENUM_MAP_FILE
6770static void trace_module_remove_enums(struct module *mod)
6771{
6772 union trace_enum_map_item *map;
6773 union trace_enum_map_item **last = &trace_enum_maps;
6774
6775 if (!mod->num_trace_enums)
6776 return;
6777
6778 mutex_lock(&trace_enum_mutex);
6779
6780 map = trace_enum_maps;
6781
6782 while (map) {
6783 if (map->head.mod == mod)
6784 break;
6785 map = trace_enum_jmp_to_tail(map);
6786 last = &map->tail.next;
6787 map = map->tail.next;
6788 }
6789 if (!map)
6790 goto out;
6791
6792 *last = trace_enum_jmp_to_tail(map)->tail.next;
6793 kfree(map);
6794 out:
6795 mutex_unlock(&trace_enum_mutex);
6796}
6797#else
6798static inline void trace_module_remove_enums(struct module *mod) { }
6799#endif /* CONFIG_TRACE_ENUM_MAP_FILE */
6800
6801static int trace_module_notify(struct notifier_block *self,
6802 unsigned long val, void *data)
6803{
6804 struct module *mod = data;
6805
6806 switch (val) {
6807 case MODULE_STATE_COMING:
6808 trace_module_add_enums(mod);
6809 break;
6810 case MODULE_STATE_GOING:
6811 trace_module_remove_enums(mod);
6812 break;
6813 }
6814
6815 return 0;
6816}
6817
6818static struct notifier_block trace_module_nb = {
6819 .notifier_call = trace_module_notify,
6820 .priority = 0,
6821};
6822#endif /* CONFIG_MODULES */
6823
6824static __init int tracer_init_tracefs(void)
6546{ 6825{
6547 struct dentry *d_tracer; 6826 struct dentry *d_tracer;
6548 6827
@@ -6552,7 +6831,7 @@ static __init int tracer_init_debugfs(void)
6552 if (IS_ERR(d_tracer)) 6831 if (IS_ERR(d_tracer))
6553 return 0; 6832 return 0;
6554 6833
6555 init_tracer_debugfs(&global_trace, d_tracer); 6834 init_tracer_tracefs(&global_trace, d_tracer);
6556 6835
6557 trace_create_file("tracing_thresh", 0644, d_tracer, 6836 trace_create_file("tracing_thresh", 0644, d_tracer,
6558 &global_trace, &tracing_thresh_fops); 6837 &global_trace, &tracing_thresh_fops);
@@ -6566,6 +6845,14 @@ static __init int tracer_init_debugfs(void)
6566 trace_create_file("saved_cmdlines_size", 0644, d_tracer, 6845 trace_create_file("saved_cmdlines_size", 0644, d_tracer,
6567 NULL, &tracing_saved_cmdlines_size_fops); 6846 NULL, &tracing_saved_cmdlines_size_fops);
6568 6847
6848 trace_enum_init();
6849
6850 trace_create_enum_file(d_tracer);
6851
6852#ifdef CONFIG_MODULES
6853 register_module_notifier(&trace_module_nb);
6854#endif
6855
6569#ifdef CONFIG_DYNAMIC_FTRACE 6856#ifdef CONFIG_DYNAMIC_FTRACE
6570 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, 6857 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
6571 &ftrace_update_tot_cnt, &tracing_dyn_info_fops); 6858 &ftrace_update_tot_cnt, &tracing_dyn_info_fops);
@@ -6575,6 +6862,10 @@ static __init int tracer_init_debugfs(void)
6575 6862
6576 create_trace_options_dir(&global_trace); 6863 create_trace_options_dir(&global_trace);
6577 6864
6865 /* If the tracer was started via cmdline, create options for it here */
6866 if (global_trace.current_trace != &nop_trace)
6867 update_tracer_options(&global_trace, global_trace.current_trace);
6868
6578 return 0; 6869 return 0;
6579} 6870}
6580 6871
@@ -6888,7 +7179,7 @@ void __init trace_init(void)
6888 tracepoint_printk = 0; 7179 tracepoint_printk = 0;
6889 } 7180 }
6890 tracer_alloc_buffers(); 7181 tracer_alloc_buffers();
6891 trace_event_init(); 7182 trace_event_init();
6892} 7183}
6893 7184
6894__init static int clear_boot_tracer(void) 7185__init static int clear_boot_tracer(void)
@@ -6910,5 +7201,5 @@ __init static int clear_boot_tracer(void)
6910 return 0; 7201 return 0;
6911} 7202}
6912 7203
6913fs_initcall(tracer_init_debugfs); 7204fs_initcall(tracer_init_tracefs);
6914late_initcall(clear_boot_tracer); 7205late_initcall(clear_boot_tracer);
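trace_automount() above exists so that tools which still open debugfs/tracing keep working once the files move to tracefs. A hedged sketch of the probe order a user-space tool might use; both paths are conventions and neither is mandated by this patch:

#include <stdio.h>
#include <unistd.h>

static const char *tracing_dir(void)
{
	/* prefer a native tracefs mount, else the debugfs automount */
	if (access("/sys/kernel/tracing/trace", F_OK) == 0)
		return "/sys/kernel/tracing";
	return "/sys/kernel/debug/tracing";
}

int main(void)
{
	printf("tracing directory: %s\n", tracing_dir());
	return 0;
}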
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index dd8205a35760..d2612016de94 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -334,7 +334,7 @@ struct tracer_flags {
334 334
335 335
336/** 336/**
337 * struct tracer - a specific tracer and its callbacks to interact with debugfs 337 * struct tracer - a specific tracer and its callbacks to interact with tracefs
338 * @name: the name chosen to select it on the available_tracers file 338 * @name: the name chosen to select it on the available_tracers file
339 * @init: called when one switches to this tracer (echo name > current_tracer) 339 * @init: called when one switches to this tracer (echo name > current_tracer)
340 * @reset: called when one switches to another tracer 340 * @reset: called when one switches to another tracer
@@ -1309,8 +1309,10 @@ static inline void init_ftrace_syscalls(void) { }
1309 1309
1310#ifdef CONFIG_EVENT_TRACING 1310#ifdef CONFIG_EVENT_TRACING
1311void trace_event_init(void); 1311void trace_event_init(void);
1312void trace_event_enum_update(struct trace_enum_map **map, int len);
1312#else 1313#else
1313static inline void __init trace_event_init(void) { } 1314static inline void __init trace_event_init(void) { }
 1315static inline void trace_event_enum_update(struct trace_enum_map **map, int len) { }
1314#endif 1316#endif
1315 1317
1316extern struct trace_iterator *tracepoint_print_iter; 1318extern struct trace_iterator *tracepoint_print_iter;
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index e2d027ac66a2..ee7b94a4810a 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -223,7 +223,7 @@ FTRACE_ENTRY(bprint, bprint_entry,
223 __dynamic_array( u32, buf ) 223 __dynamic_array( u32, buf )
224 ), 224 ),
225 225
226 F_printk("%pf: %s", 226 F_printk("%ps: %s",
227 (void *)__entry->ip, __entry->fmt), 227 (void *)__entry->ip, __entry->fmt),
228 228
229 FILTER_OTHER 229 FILTER_OTHER
@@ -238,7 +238,7 @@ FTRACE_ENTRY(print, print_entry,
238 __dynamic_array( char, buf ) 238 __dynamic_array( char, buf )
239 ), 239 ),
240 240
241 F_printk("%pf: %s", 241 F_printk("%ps: %s",
242 (void *)__entry->ip, __entry->buf), 242 (void *)__entry->ip, __entry->buf),
243 243
244 FILTER_OTHER 244 FILTER_OTHER
@@ -253,7 +253,7 @@ FTRACE_ENTRY(bputs, bputs_entry,
253 __field( const char *, str ) 253 __field( const char *, str )
254 ), 254 ),
255 255
256 F_printk("%pf: %s", 256 F_printk("%ps: %s",
257 (void *)__entry->ip, __entry->str), 257 (void *)__entry->ip, __entry->str),
258 258
259 FILTER_OTHER 259 FILTER_OTHER
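The %pf to %ps change above matters because the saved ip fields are plain text addresses: %ps resolves an address directly, while %pf expects a function pointer and dereferences the function descriptor on architectures that have them (ia64, ppc64 ABIv1). A tiny module sketch of the same usage; the module boilerplate and names are illustrative:

#include <linux/kernel.h>
#include <linux/module.h>

static int __init ps_demo_init(void)
{
	/* _RET_IP_ is a plain instruction address, so %ps is the right form */
	pr_info("ps_demo loaded, called from %ps\n", (void *)_RET_IP_);
	return 0;
}

static void __exit ps_demo_exit(void)
{
}

module_init(ps_demo_init);
module_exit(ps_demo_exit);
MODULE_LICENSE("GPL");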
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index db54dda10ccc..7da1dfeb322e 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -13,7 +13,7 @@
13#include <linux/workqueue.h> 13#include <linux/workqueue.h>
14#include <linux/spinlock.h> 14#include <linux/spinlock.h>
15#include <linux/kthread.h> 15#include <linux/kthread.h>
16#include <linux/debugfs.h> 16#include <linux/tracefs.h>
17#include <linux/uaccess.h> 17#include <linux/uaccess.h>
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/ctype.h> 19#include <linux/ctype.h>
@@ -480,7 +480,7 @@ static void remove_subsystem(struct ftrace_subsystem_dir *dir)
480 return; 480 return;
481 481
482 if (!--dir->nr_events) { 482 if (!--dir->nr_events) {
483 debugfs_remove_recursive(dir->entry); 483 tracefs_remove_recursive(dir->entry);
484 list_del(&dir->list); 484 list_del(&dir->list);
485 __put_system_dir(dir); 485 __put_system_dir(dir);
486 } 486 }
@@ -499,7 +499,7 @@ static void remove_event_file_dir(struct ftrace_event_file *file)
499 } 499 }
500 spin_unlock(&dir->d_lock); 500 spin_unlock(&dir->d_lock);
501 501
502 debugfs_remove_recursive(dir); 502 tracefs_remove_recursive(dir);
503 } 503 }
504 504
505 list_del(&file->list); 505 list_del(&file->list);
@@ -1526,7 +1526,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
1526 } else 1526 } else
1527 __get_system(system); 1527 __get_system(system);
1528 1528
1529 dir->entry = debugfs_create_dir(name, parent); 1529 dir->entry = tracefs_create_dir(name, parent);
1530 if (!dir->entry) { 1530 if (!dir->entry) {
1531 pr_warn("Failed to create system directory %s\n", name); 1531 pr_warn("Failed to create system directory %s\n", name);
1532 __put_system(system); 1532 __put_system(system);
@@ -1539,12 +1539,12 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
1539 dir->subsystem = system; 1539 dir->subsystem = system;
1540 file->system = dir; 1540 file->system = dir;
1541 1541
1542 entry = debugfs_create_file("filter", 0644, dir->entry, dir, 1542 entry = tracefs_create_file("filter", 0644, dir->entry, dir,
1543 &ftrace_subsystem_filter_fops); 1543 &ftrace_subsystem_filter_fops);
1544 if (!entry) { 1544 if (!entry) {
1545 kfree(system->filter); 1545 kfree(system->filter);
1546 system->filter = NULL; 1546 system->filter = NULL;
1547 pr_warn("Could not create debugfs '%s/filter' entry\n", name); 1547 pr_warn("Could not create tracefs '%s/filter' entry\n", name);
1548 } 1548 }
1549 1549
1550 trace_create_file("enable", 0644, dir->entry, dir, 1550 trace_create_file("enable", 0644, dir->entry, dir,
@@ -1585,9 +1585,9 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file)
1585 d_events = parent; 1585 d_events = parent;
1586 1586
1587 name = ftrace_event_name(call); 1587 name = ftrace_event_name(call);
1588 file->dir = debugfs_create_dir(name, d_events); 1588 file->dir = tracefs_create_dir(name, d_events);
1589 if (!file->dir) { 1589 if (!file->dir) {
1590 pr_warn("Could not create debugfs '%s' directory\n", name); 1590 pr_warn("Could not create tracefs '%s' directory\n", name);
1591 return -1; 1591 return -1;
1592 } 1592 }
1593 1593
@@ -1704,6 +1704,125 @@ __register_event(struct ftrace_event_call *call, struct module *mod)
1704 return 0; 1704 return 0;
1705} 1705}
1706 1706
1707static char *enum_replace(char *ptr, struct trace_enum_map *map, int len)
1708{
1709 int rlen;
1710 int elen;
1711
1712 /* Find the length of the enum value as a string */
1713 elen = snprintf(ptr, 0, "%ld", map->enum_value);
1714 /* Make sure there's enough room to replace the string with the value */
1715 if (len < elen)
1716 return NULL;
1717
1718 snprintf(ptr, elen + 1, "%ld", map->enum_value);
1719
1720 /* Get the rest of the string of ptr */
1721 rlen = strlen(ptr + len);
1722 memmove(ptr + elen, ptr + len, rlen);
1723 /* Make sure we end the new string */
1724 ptr[elen + rlen] = 0;
1725
1726 return ptr + elen;
1727}
1728
1729static void update_event_printk(struct ftrace_event_call *call,
1730 struct trace_enum_map *map)
1731{
1732 char *ptr;
1733 int quote = 0;
1734 int len = strlen(map->enum_string);
1735
1736 for (ptr = call->print_fmt; *ptr; ptr++) {
1737 if (*ptr == '\\') {
1738 ptr++;
1739 /* paranoid */
1740 if (!*ptr)
1741 break;
1742 continue;
1743 }
1744 if (*ptr == '"') {
1745 quote ^= 1;
1746 continue;
1747 }
1748 if (quote)
1749 continue;
1750 if (isdigit(*ptr)) {
1751 /* skip numbers */
1752 do {
1753 ptr++;
1754 /* Check for alpha chars like ULL */
1755 } while (isalnum(*ptr));
1756 /*
1757 * A number must have some kind of delimiter after
1758 * it, and we can ignore that too.
1759 */
1760 continue;
1761 }
1762 if (isalpha(*ptr) || *ptr == '_') {
1763 if (strncmp(map->enum_string, ptr, len) == 0 &&
1764 !isalnum(ptr[len]) && ptr[len] != '_') {
1765 ptr = enum_replace(ptr, map, len);
1766 /* Hmm, enum string smaller than value */
1767 if (WARN_ON_ONCE(!ptr))
1768 return;
1769 /*
1770 * No need to decrement here, as enum_replace()
 1771 * returns the pointer to the character past
1772 * the enum, and two enums can not be placed
1773 * back to back without something in between.
1774 * We can skip that something in between.
1775 */
1776 continue;
1777 }
1778 skip_more:
1779 do {
1780 ptr++;
1781 } while (isalnum(*ptr) || *ptr == '_');
1782 /*
1783 * If what comes after this variable is a '.' or
1784 * '->' then we can continue to ignore that string.
1785 */
1786 if (*ptr == '.' || (ptr[0] == '-' && ptr[1] == '>')) {
1787 ptr += *ptr == '.' ? 1 : 2;
1788 goto skip_more;
1789 }
1790 /*
1791 * Once again, we can skip the delimiter that came
1792 * after the string.
1793 */
1794 continue;
1795 }
1796 }
1797}
1798
1799void trace_event_enum_update(struct trace_enum_map **map, int len)
1800{
1801 struct ftrace_event_call *call, *p;
1802 const char *last_system = NULL;
1803 int last_i;
1804 int i;
1805
1806 down_write(&trace_event_sem);
1807 list_for_each_entry_safe(call, p, &ftrace_events, list) {
1808 /* events are usually grouped together with systems */
1809 if (!last_system || call->class->system != last_system) {
1810 last_i = 0;
1811 last_system = call->class->system;
1812 }
1813
1814 for (i = last_i; i < len; i++) {
1815 if (call->class->system == map[i]->system) {
1816 /* Save the first system if need be */
1817 if (!last_i)
1818 last_i = i;
1819 update_event_printk(call, map[i]);
1820 }
1821 }
1822 }
1823 up_write(&trace_event_sem);
1824}
1825
1707static struct ftrace_event_file * 1826static struct ftrace_event_file *
1708trace_create_new_event(struct ftrace_event_call *call, 1827trace_create_new_event(struct ftrace_event_call *call,
1709 struct trace_array *tr) 1828 struct trace_array *tr)
@@ -1915,7 +2034,7 @@ static int trace_module_notify(struct notifier_block *self,
1915 2034
1916static struct notifier_block trace_module_nb = { 2035static struct notifier_block trace_module_nb = {
1917 .notifier_call = trace_module_notify, 2036 .notifier_call = trace_module_notify,
1918 .priority = 0, 2037 .priority = 1, /* higher than trace.c module notify */
1919}; 2038};
1920#endif /* CONFIG_MODULES */ 2039#endif /* CONFIG_MODULES */
1921 2040
@@ -2228,7 +2347,7 @@ static inline int register_event_cmds(void) { return 0; }
2228/* 2347/*
2229 * The top level array has already had its ftrace_event_file 2348 * The top level array has already had its ftrace_event_file
2230 * descriptors created in order to allow for early events to 2349 * descriptors created in order to allow for early events to
2231 * be recorded. This function is called after the debugfs has been 2350 * be recorded. This function is called after the tracefs has been
2232 * initialized, and we now have to create the files associated 2351 * initialized, and we now have to create the files associated
2233 * to the events. 2352 * to the events.
2234 */ 2353 */
@@ -2311,16 +2430,16 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
2311 struct dentry *d_events; 2430 struct dentry *d_events;
2312 struct dentry *entry; 2431 struct dentry *entry;
2313 2432
2314 entry = debugfs_create_file("set_event", 0644, parent, 2433 entry = tracefs_create_file("set_event", 0644, parent,
2315 tr, &ftrace_set_event_fops); 2434 tr, &ftrace_set_event_fops);
2316 if (!entry) { 2435 if (!entry) {
2317 pr_warn("Could not create debugfs 'set_event' entry\n"); 2436 pr_warn("Could not create tracefs 'set_event' entry\n");
2318 return -ENOMEM; 2437 return -ENOMEM;
2319 } 2438 }
2320 2439
2321 d_events = debugfs_create_dir("events", parent); 2440 d_events = tracefs_create_dir("events", parent);
2322 if (!d_events) { 2441 if (!d_events) {
2323 pr_warn("Could not create debugfs 'events' directory\n"); 2442 pr_warn("Could not create tracefs 'events' directory\n");
2324 return -ENOMEM; 2443 return -ENOMEM;
2325 } 2444 }
2326 2445
@@ -2412,7 +2531,7 @@ int event_trace_del_tracer(struct trace_array *tr)
2412 2531
2413 down_write(&trace_event_sem); 2532 down_write(&trace_event_sem);
2414 __trace_remove_event_dirs(tr); 2533 __trace_remove_event_dirs(tr);
2415 debugfs_remove_recursive(tr->event_dir); 2534 tracefs_remove_recursive(tr->event_dir);
2416 up_write(&trace_event_sem); 2535 up_write(&trace_event_sem);
2417 2536
2418 tr->event_dir = NULL; 2537 tr->event_dir = NULL;
@@ -2534,10 +2653,10 @@ static __init int event_trace_init(void)
2534 if (IS_ERR(d_tracer)) 2653 if (IS_ERR(d_tracer))
2535 return 0; 2654 return 0;
2536 2655
2537 entry = debugfs_create_file("available_events", 0444, d_tracer, 2656 entry = tracefs_create_file("available_events", 0444, d_tracer,
2538 tr, &ftrace_avail_fops); 2657 tr, &ftrace_avail_fops);
2539 if (!entry) 2658 if (!entry)
2540 pr_warn("Could not create debugfs 'available_events' entry\n"); 2659 pr_warn("Could not create tracefs 'available_events' entry\n");
2541 2660
2542 if (trace_define_common_fields()) 2661 if (trace_define_common_fields())
2543 pr_warn("tracing: Failed to allocate common fields"); 2662 pr_warn("tracing: Failed to allocate common fields");
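One detail worth calling out from this file's hunks is the trace_module_nb priority bump (0 -> 1): kernel notifier chains invoke registrants in descending priority order, so the events notifier now runs before trace.c's module notifier when a module loads. The toy user-space C sketch below shows only that ordering rule; the types and helpers are invented and are not the kernel notifier API.

#include <stdio.h>

struct notifier {
	int priority;
	void (*call)(const char *mod);
	struct notifier *next;
};

/* Insert so the chain stays sorted by descending priority. */
static void chain_register(struct notifier **head, struct notifier *nb)
{
	while (*head && (*head)->priority >= nb->priority)
		head = &(*head)->next;
	nb->next = *head;
	*head = nb;
}

static void chain_call(struct notifier *head, const char *mod)
{
	for (; head; head = head->next)
		head->call(mod);
}

static void events_notify(const char *mod) { printf("events: %s\n", mod); }
static void trace_notify(const char *mod)  { printf("trace:  %s\n", mod); }

int main(void)
{
	struct notifier *chain = NULL;
	struct notifier ev = { .priority = 1, .call = events_notify };
	struct notifier tr = { .priority = 0, .call = trace_notify };

	chain_register(&chain, &tr);
	chain_register(&chain, &ev);	/* higher priority ends up first */
	chain_call(chain, "my_module");	/* prints "events" before "trace" */
	return 0;
}
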
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 12e2b99be862..174a6a71146c 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -177,7 +177,7 @@ struct ftrace_event_call __used event_##call = { \
177 }, \ 177 }, \
178 .event.type = etype, \ 178 .event.type = etype, \
179 .print_fmt = print, \ 179 .print_fmt = print, \
180 .flags = TRACE_EVENT_FL_IGNORE_ENABLE | TRACE_EVENT_FL_USE_CALL_FILTER, \ 180 .flags = TRACE_EVENT_FL_IGNORE_ENABLE, \
181}; \ 181}; \
182struct ftrace_event_call __used \ 182struct ftrace_event_call __used \
183__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; 183__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call;
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 2d25ad1526bb..9cfea4c6d314 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -6,7 +6,6 @@
6 * is Copyright (c) Steven Rostedt <srostedt@redhat.com> 6 * is Copyright (c) Steven Rostedt <srostedt@redhat.com>
7 * 7 *
8 */ 8 */
9#include <linux/debugfs.h>
10#include <linux/uaccess.h> 9#include <linux/uaccess.h>
11#include <linux/ftrace.h> 10#include <linux/ftrace.h>
12#include <linux/slab.h> 11#include <linux/slab.h>
@@ -151,7 +150,7 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,
151	 * The curr_ret_stack is initialized to -1 and gets increased 150	 * The curr_ret_stack is initialized to -1 and gets increased
152 * in this function. So it can be less than -1 only if it was 151 * in this function. So it can be less than -1 only if it was
153 * filtered out via ftrace_graph_notrace_addr() which can be 152 * filtered out via ftrace_graph_notrace_addr() which can be
154 * set from set_graph_notrace file in debugfs by user. 153 * set from set_graph_notrace file in tracefs by user.
155 */ 154 */
156 if (current->curr_ret_stack < -1) 155 if (current->curr_ret_stack < -1)
157 return -EBUSY; 156 return -EBUSY;
@@ -1432,7 +1431,7 @@ static const struct file_operations graph_depth_fops = {
1432 .llseek = generic_file_llseek, 1431 .llseek = generic_file_llseek,
1433}; 1432};
1434 1433
1435static __init int init_graph_debugfs(void) 1434static __init int init_graph_tracefs(void)
1436{ 1435{
1437 struct dentry *d_tracer; 1436 struct dentry *d_tracer;
1438 1437
@@ -1445,7 +1444,7 @@ static __init int init_graph_debugfs(void)
1445 1444
1446 return 0; 1445 return 0;
1447} 1446}
1448fs_initcall(init_graph_debugfs); 1447fs_initcall(init_graph_tracefs);
1449 1448
1450static __init int init_graph_trace(void) 1449static __init int init_graph_trace(void)
1451{ 1450{
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index d73f565b4e06..d0ce590f06e1 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -250,7 +250,7 @@ DEFINE_FETCH_symbol(string_size)
250#define fetch_file_offset_string_size NULL 250#define fetch_file_offset_string_size NULL
251 251
252/* Fetch type information table */ 252/* Fetch type information table */
253const struct fetch_type kprobes_fetch_type_table[] = { 253static const struct fetch_type kprobes_fetch_type_table[] = {
254 /* Special types */ 254 /* Special types */
255 [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, 255 [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string,
256 sizeof(u32), 1, "__data_loc char[]"), 256 sizeof(u32), 1, "__data_loc char[]"),
@@ -760,7 +760,8 @@ static int create_trace_kprobe(int argc, char **argv)
760 760
761 /* Parse fetch argument */ 761 /* Parse fetch argument */
762 ret = traceprobe_parse_probe_arg(arg, &tk->tp.size, parg, 762 ret = traceprobe_parse_probe_arg(arg, &tk->tp.size, parg,
763 is_return, true); 763 is_return, true,
764 kprobes_fetch_type_table);
764 if (ret) { 765 if (ret) {
765 pr_info("Parse error at argument[%d]. (%d)\n", i, ret); 766 pr_info("Parse error at argument[%d]. (%d)\n", i, ret);
766 goto error; 767 goto error;
@@ -1134,11 +1135,15 @@ static void
1134kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) 1135kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
1135{ 1136{
1136 struct ftrace_event_call *call = &tk->tp.call; 1137 struct ftrace_event_call *call = &tk->tp.call;
1138 struct bpf_prog *prog = call->prog;
1137 struct kprobe_trace_entry_head *entry; 1139 struct kprobe_trace_entry_head *entry;
1138 struct hlist_head *head; 1140 struct hlist_head *head;
1139 int size, __size, dsize; 1141 int size, __size, dsize;
1140 int rctx; 1142 int rctx;
1141 1143
1144 if (prog && !trace_call_bpf(prog, regs))
1145 return;
1146
1142 head = this_cpu_ptr(call->perf_events); 1147 head = this_cpu_ptr(call->perf_events);
1143 if (hlist_empty(head)) 1148 if (hlist_empty(head))
1144 return; 1149 return;
@@ -1165,11 +1170,15 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
1165 struct pt_regs *regs) 1170 struct pt_regs *regs)
1166{ 1171{
1167 struct ftrace_event_call *call = &tk->tp.call; 1172 struct ftrace_event_call *call = &tk->tp.call;
1173 struct bpf_prog *prog = call->prog;
1168 struct kretprobe_trace_entry_head *entry; 1174 struct kretprobe_trace_entry_head *entry;
1169 struct hlist_head *head; 1175 struct hlist_head *head;
1170 int size, __size, dsize; 1176 int size, __size, dsize;
1171 int rctx; 1177 int rctx;
1172 1178
1179 if (prog && !trace_call_bpf(prog, regs))
1180 return;
1181
1173 head = this_cpu_ptr(call->perf_events); 1182 head = this_cpu_ptr(call->perf_events);
1174 if (hlist_empty(head)) 1183 if (hlist_empty(head))
1175 return; 1184 return;
@@ -1286,7 +1295,7 @@ static int register_kprobe_event(struct trace_kprobe *tk)
1286 kfree(call->print_fmt); 1295 kfree(call->print_fmt);
1287 return -ENODEV; 1296 return -ENODEV;
1288 } 1297 }
1289 call->flags = 0; 1298 call->flags = TRACE_EVENT_FL_KPROBE;
1290 call->class->reg = kprobe_register; 1299 call->class->reg = kprobe_register;
1291 call->data = tk; 1300 call->data = tk;
1292 ret = trace_add_event_call(call); 1301 ret = trace_add_event_call(call);
@@ -1310,7 +1319,7 @@ static int unregister_kprobe_event(struct trace_kprobe *tk)
1310 return ret; 1319 return ret;
1311} 1320}
1312 1321
1313/* Make a debugfs interface for controlling probe points */ 1322/* Make a tracefs interface for controlling probe points */
1314static __init int init_kprobe_trace(void) 1323static __init int init_kprobe_trace(void)
1315{ 1324{
1316 struct dentry *d_tracer; 1325 struct dentry *d_tracer;
@@ -1323,20 +1332,20 @@ static __init int init_kprobe_trace(void)
1323 if (IS_ERR(d_tracer)) 1332 if (IS_ERR(d_tracer))
1324 return 0; 1333 return 0;
1325 1334
1326 entry = debugfs_create_file("kprobe_events", 0644, d_tracer, 1335 entry = tracefs_create_file("kprobe_events", 0644, d_tracer,
1327 NULL, &kprobe_events_ops); 1336 NULL, &kprobe_events_ops);
1328 1337
1329 /* Event list interface */ 1338 /* Event list interface */
1330 if (!entry) 1339 if (!entry)
1331 pr_warning("Could not create debugfs " 1340 pr_warning("Could not create tracefs "
1332 "'kprobe_events' entry\n"); 1341 "'kprobe_events' entry\n");
1333 1342
1334 /* Profile interface */ 1343 /* Profile interface */
1335 entry = debugfs_create_file("kprobe_profile", 0444, d_tracer, 1344 entry = tracefs_create_file("kprobe_profile", 0444, d_tracer,
1336 NULL, &kprobe_profile_ops); 1345 NULL, &kprobe_profile_ops);
1337 1346
1338 if (!entry) 1347 if (!entry)
1339 pr_warning("Could not create debugfs " 1348 pr_warning("Could not create tracefs "
1340 "'kprobe_profile' entry\n"); 1349 "'kprobe_profile' entry\n");
1341 return 0; 1350 return 0;
1342} 1351}
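The hunk above is the caller side of a small refactor that the trace_probe.c and trace_probe.h hunks below complete: the fetch-type tables become static and are passed to traceprobe_parse_probe_arg() as an argument instead of being reached through weak extern symbols. A rough user-space C sketch of that pattern follows, with invented names and a trivial lookup standing in for the real parser.

#include <stdio.h>
#include <string.h>

struct fetch_type { const char *name; };

/* The parser no longer knows about any global table; each caller
 * supplies the table that matches its probe flavor. */
static int parse_arg(const char *arg, const struct fetch_type *ftbl,
		     size_t ntypes)
{
	for (size_t i = 0; i < ntypes; i++)
		if (!strcmp(arg, ftbl[i].name))
			return (int)i;
	return -1;		/* unknown type */
}

static const struct fetch_type kprobe_types[] = { { "u32" }, { "string" } };
static const struct fetch_type uprobe_types[] = { { "u32" } };

int main(void)
{
	printf("kprobe 'string' -> %d\n", parse_arg("string", kprobe_types, 2));
	printf("uprobe 'string' -> %d\n", parse_arg("string", uprobe_types, 1));
	return 0;
}
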
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index b983b2fd2ca1..1769a81da8a7 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -356,17 +356,14 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t,
356 356
357/* Recursive argument parser */ 357/* Recursive argument parser */
358static int parse_probe_arg(char *arg, const struct fetch_type *t, 358static int parse_probe_arg(char *arg, const struct fetch_type *t,
359 struct fetch_param *f, bool is_return, bool is_kprobe) 359 struct fetch_param *f, bool is_return, bool is_kprobe,
360 const struct fetch_type *ftbl)
360{ 361{
361 const struct fetch_type *ftbl;
362 unsigned long param; 362 unsigned long param;
363 long offset; 363 long offset;
364 char *tmp; 364 char *tmp;
365 int ret = 0; 365 int ret = 0;
366 366
367 ftbl = is_kprobe ? kprobes_fetch_type_table : uprobes_fetch_type_table;
368 BUG_ON(ftbl == NULL);
369
370 switch (arg[0]) { 367 switch (arg[0]) {
371 case '$': 368 case '$':
372 ret = parse_probe_vars(arg + 1, t, f, is_return, is_kprobe); 369 ret = parse_probe_vars(arg + 1, t, f, is_return, is_kprobe);
@@ -447,7 +444,7 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t,
447 dprm->fetch_size = get_fetch_size_function(t, 444 dprm->fetch_size = get_fetch_size_function(t,
448 dprm->fetch, ftbl); 445 dprm->fetch, ftbl);
449 ret = parse_probe_arg(arg, t2, &dprm->orig, is_return, 446 ret = parse_probe_arg(arg, t2, &dprm->orig, is_return,
450 is_kprobe); 447 is_kprobe, ftbl);
451 if (ret) 448 if (ret)
452 kfree(dprm); 449 kfree(dprm);
453 else { 450 else {
@@ -505,15 +502,12 @@ static int __parse_bitfield_probe_arg(const char *bf,
505 502
506/* String length checking wrapper */ 503/* String length checking wrapper */
507int traceprobe_parse_probe_arg(char *arg, ssize_t *size, 504int traceprobe_parse_probe_arg(char *arg, ssize_t *size,
508 struct probe_arg *parg, bool is_return, bool is_kprobe) 505 struct probe_arg *parg, bool is_return, bool is_kprobe,
506 const struct fetch_type *ftbl)
509{ 507{
510 const struct fetch_type *ftbl;
511 const char *t; 508 const char *t;
512 int ret; 509 int ret;
513 510
514 ftbl = is_kprobe ? kprobes_fetch_type_table : uprobes_fetch_type_table;
515 BUG_ON(ftbl == NULL);
516
517 if (strlen(arg) > MAX_ARGSTR_LEN) { 511 if (strlen(arg) > MAX_ARGSTR_LEN) {
518 pr_info("Argument is too long.: %s\n", arg); 512 pr_info("Argument is too long.: %s\n", arg);
519 return -ENOSPC; 513 return -ENOSPC;
@@ -535,7 +529,8 @@ int traceprobe_parse_probe_arg(char *arg, ssize_t *size,
535 } 529 }
536 parg->offset = *size; 530 parg->offset = *size;
537 *size += parg->type->size; 531 *size += parg->type->size;
538 ret = parse_probe_arg(arg, parg->type, &parg->fetch, is_return, is_kprobe); 532 ret = parse_probe_arg(arg, parg->type, &parg->fetch, is_return,
533 is_kprobe, ftbl);
539 534
540 if (ret >= 0 && t != NULL) 535 if (ret >= 0 && t != NULL)
541 ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch); 536 ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch);
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 4f815fbce16d..ab283e146b70 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -25,7 +25,7 @@
25#include <linux/seq_file.h> 25#include <linux/seq_file.h>
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/smp.h> 27#include <linux/smp.h>
28#include <linux/debugfs.h> 28#include <linux/tracefs.h>
29#include <linux/types.h> 29#include <linux/types.h>
30#include <linux/string.h> 30#include <linux/string.h>
31#include <linux/ctype.h> 31#include <linux/ctype.h>
@@ -229,13 +229,6 @@ ASSIGN_FETCH_FUNC(file_offset, ftype), \
229#define FETCH_TYPE_STRING 0 229#define FETCH_TYPE_STRING 0
230#define FETCH_TYPE_STRSIZE 1 230#define FETCH_TYPE_STRSIZE 1
231 231
232/*
233 * Fetch type information table.
234 * It's declared as a weak symbol due to conditional compilation.
235 */
236extern __weak const struct fetch_type kprobes_fetch_type_table[];
237extern __weak const struct fetch_type uprobes_fetch_type_table[];
238
239#ifdef CONFIG_KPROBE_EVENT 232#ifdef CONFIG_KPROBE_EVENT
240struct symbol_cache; 233struct symbol_cache;
241unsigned long update_symbol_cache(struct symbol_cache *sc); 234unsigned long update_symbol_cache(struct symbol_cache *sc);
@@ -333,7 +326,8 @@ find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file)
333} 326}
334 327
335extern int traceprobe_parse_probe_arg(char *arg, ssize_t *size, 328extern int traceprobe_parse_probe_arg(char *arg, ssize_t *size,
336 struct probe_arg *parg, bool is_return, bool is_kprobe); 329 struct probe_arg *parg, bool is_return, bool is_kprobe,
330 const struct fetch_type *ftbl);
337 331
338extern int traceprobe_conflict_field_name(const char *name, 332extern int traceprobe_conflict_field_name(const char *name,
339 struct probe_arg *args, int narg); 333 struct probe_arg *args, int narg);
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index c3e4fcfddd45..3f34496244e9 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -327,11 +327,11 @@ static void t_stop(struct seq_file *m, void *p)
327 local_irq_enable(); 327 local_irq_enable();
328} 328}
329 329
330static int trace_lookup_stack(struct seq_file *m, long i) 330static void trace_lookup_stack(struct seq_file *m, long i)
331{ 331{
332 unsigned long addr = stack_dump_trace[i]; 332 unsigned long addr = stack_dump_trace[i];
333 333
334 return seq_printf(m, "%pS\n", (void *)addr); 334 seq_printf(m, "%pS\n", (void *)addr);
335} 335}
336 336
337static void print_disabled(struct seq_file *m) 337static void print_disabled(struct seq_file *m)
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 75e19e86c954..6cf935316769 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -12,7 +12,7 @@
12#include <linux/list.h> 12#include <linux/list.h>
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/rbtree.h> 14#include <linux/rbtree.h>
15#include <linux/debugfs.h> 15#include <linux/tracefs.h>
16#include "trace_stat.h" 16#include "trace_stat.h"
17#include "trace.h" 17#include "trace.h"
18 18
@@ -65,7 +65,7 @@ static void reset_stat_session(struct stat_session *session)
65 65
66static void destroy_session(struct stat_session *session) 66static void destroy_session(struct stat_session *session)
67{ 67{
68 debugfs_remove(session->file); 68 tracefs_remove(session->file);
69 __reset_stat_session(session); 69 __reset_stat_session(session);
70 mutex_destroy(&session->stat_mutex); 70 mutex_destroy(&session->stat_mutex);
71 kfree(session); 71 kfree(session);
@@ -279,9 +279,9 @@ static int tracing_stat_init(void)
279 if (IS_ERR(d_tracing)) 279 if (IS_ERR(d_tracing))
280 return 0; 280 return 0;
281 281
282 stat_dir = debugfs_create_dir("trace_stat", d_tracing); 282 stat_dir = tracefs_create_dir("trace_stat", d_tracing);
283 if (!stat_dir) 283 if (!stat_dir)
284 pr_warning("Could not create debugfs " 284 pr_warning("Could not create tracefs "
285 "'trace_stat' entry\n"); 285 "'trace_stat' entry\n");
286 return 0; 286 return 0;
287} 287}
@@ -291,7 +291,7 @@ static int init_stat_file(struct stat_session *session)
291 if (!stat_dir && tracing_stat_init()) 291 if (!stat_dir && tracing_stat_init())
292 return -ENODEV; 292 return -ENODEV;
293 293
294 session->file = debugfs_create_file(session->ts->name, 0644, 294 session->file = tracefs_create_file(session->ts->name, 0644,
295 stat_dir, 295 stat_dir,
296 session, &tracing_stat_fops); 296 session, &tracing_stat_fops);
297 if (!session->file) 297 if (!session->file)
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 7dc1c8abecd6..d60fe62ec4fa 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -196,7 +196,7 @@ DEFINE_FETCH_file_offset(string)
196DEFINE_FETCH_file_offset(string_size) 196DEFINE_FETCH_file_offset(string_size)
197 197
198/* Fetch type information table */ 198/* Fetch type information table */
199const struct fetch_type uprobes_fetch_type_table[] = { 199static const struct fetch_type uprobes_fetch_type_table[] = {
200 /* Special types */ 200 /* Special types */
201 [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, 201 [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string,
202 sizeof(u32), 1, "__data_loc char[]"), 202 sizeof(u32), 1, "__data_loc char[]"),
@@ -535,7 +535,8 @@ static int create_trace_uprobe(int argc, char **argv)
535 535
536 /* Parse fetch argument */ 536 /* Parse fetch argument */
537 ret = traceprobe_parse_probe_arg(arg, &tu->tp.size, parg, 537 ret = traceprobe_parse_probe_arg(arg, &tu->tp.size, parg,
538 is_return, false); 538 is_return, false,
539 uprobes_fetch_type_table);
539 if (ret) { 540 if (ret) {
540 pr_info("Parse error at argument[%d]. (%d)\n", i, ret); 541 pr_info("Parse error at argument[%d]. (%d)\n", i, ret);
541 goto error; 542 goto error;
@@ -1005,7 +1006,7 @@ __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm)
1005 return true; 1006 return true;
1006 1007
1007 list_for_each_entry(event, &filter->perf_events, hw.tp_list) { 1008 list_for_each_entry(event, &filter->perf_events, hw.tp_list) {
1008 if (event->hw.tp_target->mm == mm) 1009 if (event->hw.target->mm == mm)
1009 return true; 1010 return true;
1010 } 1011 }
1011 1012
@@ -1015,7 +1016,7 @@ __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm)
1015static inline bool 1016static inline bool
1016uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event) 1017uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event)
1017{ 1018{
1018 return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm); 1019 return __uprobe_perf_filter(&tu->filter, event->hw.target->mm);
1019} 1020}
1020 1021
1021static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) 1022static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event)
@@ -1023,10 +1024,10 @@ static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event)
1023 bool done; 1024 bool done;
1024 1025
1025 write_lock(&tu->filter.rwlock); 1026 write_lock(&tu->filter.rwlock);
1026 if (event->hw.tp_target) { 1027 if (event->hw.target) {
1027 list_del(&event->hw.tp_list); 1028 list_del(&event->hw.tp_list);
1028 done = tu->filter.nr_systemwide || 1029 done = tu->filter.nr_systemwide ||
1029 (event->hw.tp_target->flags & PF_EXITING) || 1030 (event->hw.target->flags & PF_EXITING) ||
1030 uprobe_filter_event(tu, event); 1031 uprobe_filter_event(tu, event);
1031 } else { 1032 } else {
1032 tu->filter.nr_systemwide--; 1033 tu->filter.nr_systemwide--;
@@ -1046,7 +1047,7 @@ static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event)
1046 int err; 1047 int err;
1047 1048
1048 write_lock(&tu->filter.rwlock); 1049 write_lock(&tu->filter.rwlock);
1049 if (event->hw.tp_target) { 1050 if (event->hw.target) {
1050 /* 1051 /*
1051 * event->parent != NULL means copy_process(), we can avoid 1052 * event->parent != NULL means copy_process(), we can avoid
1052 * uprobe_apply(). current->mm must be probed and we can rely 1053 * uprobe_apply(). current->mm must be probed and we can rely
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 3174bf8e3538..2316f50b07a4 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -24,8 +24,33 @@
24#include <linux/kvm_para.h> 24#include <linux/kvm_para.h>
25#include <linux/perf_event.h> 25#include <linux/perf_event.h>
26 26
27int watchdog_user_enabled = 1; 27/*
28 * The run state of the lockup detectors is controlled by the content of the
29 * 'watchdog_enabled' variable. Each lockup detector has its dedicated bit -
30 * bit 0 for the hard lockup detector and bit 1 for the soft lockup detector.
31 *
32 * 'watchdog_user_enabled', 'nmi_watchdog_enabled' and 'soft_watchdog_enabled'
33 * are variables that are only used as an 'interface' between the parameters
34 * in /proc/sys/kernel and the internal state bits in 'watchdog_enabled'. The
35 * 'watchdog_thresh' variable is handled differently because its value is not
36 * boolean, and the lockup detectors are 'suspended' while 'watchdog_thresh'
37 * is equal zero.
38 */
39#define NMI_WATCHDOG_ENABLED_BIT 0
40#define SOFT_WATCHDOG_ENABLED_BIT 1
41#define NMI_WATCHDOG_ENABLED (1 << NMI_WATCHDOG_ENABLED_BIT)
42#define SOFT_WATCHDOG_ENABLED (1 << SOFT_WATCHDOG_ENABLED_BIT)
43
44#ifdef CONFIG_HARDLOCKUP_DETECTOR
45static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED;
46#else
47static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED;
48#endif
49int __read_mostly nmi_watchdog_enabled;
50int __read_mostly soft_watchdog_enabled;
51int __read_mostly watchdog_user_enabled;
28int __read_mostly watchdog_thresh = 10; 52int __read_mostly watchdog_thresh = 10;
53
29#ifdef CONFIG_SMP 54#ifdef CONFIG_SMP
30int __read_mostly sysctl_softlockup_all_cpu_backtrace; 55int __read_mostly sysctl_softlockup_all_cpu_backtrace;
31#else 56#else
@@ -58,8 +83,6 @@ static unsigned long soft_lockup_nmi_warn;
58#ifdef CONFIG_HARDLOCKUP_DETECTOR 83#ifdef CONFIG_HARDLOCKUP_DETECTOR
59static int hardlockup_panic = 84static int hardlockup_panic =
60 CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; 85 CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
61
62static bool hardlockup_detector_enabled = true;
63/* 86/*
64 * We may not want to enable hard lockup detection by default in all cases, 87 * We may not want to enable hard lockup detection by default in all cases,
65 * for example when running the kernel as a guest on a hypervisor. In these 88 * for example when running the kernel as a guest on a hypervisor. In these
@@ -68,14 +91,9 @@ static bool hardlockup_detector_enabled = true;
68 * kernel command line parameters are parsed, because otherwise it is not 91 * kernel command line parameters are parsed, because otherwise it is not
69 * possible to override this in hardlockup_panic_setup(). 92 * possible to override this in hardlockup_panic_setup().
70 */ 93 */
71void watchdog_enable_hardlockup_detector(bool val) 94void hardlockup_detector_disable(void)
72{
73 hardlockup_detector_enabled = val;
74}
75
76bool watchdog_hardlockup_detector_is_enabled(void)
77{ 95{
78 return hardlockup_detector_enabled; 96 watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
79} 97}
80 98
81static int __init hardlockup_panic_setup(char *str) 99static int __init hardlockup_panic_setup(char *str)
@@ -85,15 +103,9 @@ static int __init hardlockup_panic_setup(char *str)
85 else if (!strncmp(str, "nopanic", 7)) 103 else if (!strncmp(str, "nopanic", 7))
86 hardlockup_panic = 0; 104 hardlockup_panic = 0;
87 else if (!strncmp(str, "0", 1)) 105 else if (!strncmp(str, "0", 1))
88 watchdog_user_enabled = 0; 106 watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
89 else if (!strncmp(str, "1", 1) || !strncmp(str, "2", 1)) { 107 else if (!strncmp(str, "1", 1))
90 /* 108 watchdog_enabled |= NMI_WATCHDOG_ENABLED;
91 * Setting 'nmi_watchdog=1' or 'nmi_watchdog=2' (legacy option)
92 * has the same effect.
93 */
94 watchdog_user_enabled = 1;
95 watchdog_enable_hardlockup_detector(true);
96 }
97 return 1; 109 return 1;
98} 110}
99__setup("nmi_watchdog=", hardlockup_panic_setup); 111__setup("nmi_watchdog=", hardlockup_panic_setup);
@@ -112,19 +124,18 @@ __setup("softlockup_panic=", softlockup_panic_setup);
112 124
113static int __init nowatchdog_setup(char *str) 125static int __init nowatchdog_setup(char *str)
114{ 126{
115 watchdog_user_enabled = 0; 127 watchdog_enabled = 0;
116 return 1; 128 return 1;
117} 129}
118__setup("nowatchdog", nowatchdog_setup); 130__setup("nowatchdog", nowatchdog_setup);
119 131
120/* deprecated */
121static int __init nosoftlockup_setup(char *str) 132static int __init nosoftlockup_setup(char *str)
122{ 133{
123 watchdog_user_enabled = 0; 134 watchdog_enabled &= ~SOFT_WATCHDOG_ENABLED;
124 return 1; 135 return 1;
125} 136}
126__setup("nosoftlockup", nosoftlockup_setup); 137__setup("nosoftlockup", nosoftlockup_setup);
127/* */ 138
128#ifdef CONFIG_SMP 139#ifdef CONFIG_SMP
129static int __init softlockup_all_cpu_backtrace_setup(char *str) 140static int __init softlockup_all_cpu_backtrace_setup(char *str)
130{ 141{
@@ -239,10 +250,11 @@ static int is_softlockup(unsigned long touch_ts)
239{ 250{
240 unsigned long now = get_timestamp(); 251 unsigned long now = get_timestamp();
241 252
242 /* Warn about unreasonable delays: */ 253 if (watchdog_enabled & SOFT_WATCHDOG_ENABLED) {
243 if (time_after(now, touch_ts + get_softlockup_thresh())) 254 /* Warn about unreasonable delays. */
244 return now - touch_ts; 255 if (time_after(now, touch_ts + get_softlockup_thresh()))
245 256 return now - touch_ts;
257 }
246 return 0; 258 return 0;
247} 259}
248 260
@@ -477,6 +489,21 @@ static void watchdog(unsigned int cpu)
477 __this_cpu_write(soft_lockup_hrtimer_cnt, 489 __this_cpu_write(soft_lockup_hrtimer_cnt,
478 __this_cpu_read(hrtimer_interrupts)); 490 __this_cpu_read(hrtimer_interrupts));
479 __touch_watchdog(); 491 __touch_watchdog();
492
493 /*
494 * watchdog_nmi_enable() clears the NMI_WATCHDOG_ENABLED bit in the
495 * failure path. Check for failures that can occur asynchronously -
496 * for example, when CPUs are on-lined - and shut down the hardware
497 * perf event on each CPU accordingly.
498 *
499 * The only non-obvious place this bit can be cleared is through
500 * watchdog_nmi_enable(), so a pr_info() is placed there. Placing a
501 * pr_info here would be too noisy as it would result in a message
502 * every few seconds if the hardlockup was disabled but the softlockup
503 * enabled.
504 */
505 if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
506 watchdog_nmi_disable(cpu);
480} 507}
481 508
482#ifdef CONFIG_HARDLOCKUP_DETECTOR 509#ifdef CONFIG_HARDLOCKUP_DETECTOR
@@ -492,14 +519,9 @@ static int watchdog_nmi_enable(unsigned int cpu)
492 struct perf_event_attr *wd_attr; 519 struct perf_event_attr *wd_attr;
493 struct perf_event *event = per_cpu(watchdog_ev, cpu); 520 struct perf_event *event = per_cpu(watchdog_ev, cpu);
494 521
495 /* 522 /* nothing to do if the hard lockup detector is disabled */
496 * Some kernels need to default hard lockup detection to 523 if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
497 * 'disabled', for example a guest on a hypervisor. 524 goto out;
498 */
499 if (!watchdog_hardlockup_detector_is_enabled()) {
500 event = ERR_PTR(-ENOENT);
501 goto handle_err;
502 }
503 525
504 /* is it already setup and enabled? */ 526 /* is it already setup and enabled? */
505 if (event && event->state > PERF_EVENT_STATE_OFF) 527 if (event && event->state > PERF_EVENT_STATE_OFF)
@@ -515,7 +537,6 @@ static int watchdog_nmi_enable(unsigned int cpu)
515 /* Try to register using hardware perf events */ 537 /* Try to register using hardware perf events */
516 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); 538 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
517 539
518handle_err:
519	/* save cpu0 error for future comparison */ 540	/* save cpu0 error for future comparison */
520 if (cpu == 0 && IS_ERR(event)) 541 if (cpu == 0 && IS_ERR(event))
521 cpu0_err = PTR_ERR(event); 542 cpu0_err = PTR_ERR(event);
@@ -527,6 +548,18 @@ handle_err:
527 goto out_save; 548 goto out_save;
528 } 549 }
529 550
551 /*
551	 * Disable the hard lockup detector if _any_ CPU fails to
552	 * set up the hardware perf event. The watchdog() function checks
554 * the NMI_WATCHDOG_ENABLED bit periodically.
555 *
556 * The barriers are for syncing up watchdog_enabled across all the
557 * cpus, as clear_bit() does not use barriers.
558 */
559 smp_mb__before_atomic();
560 clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled);
561 smp_mb__after_atomic();
562
530 /* skip displaying the same error again */ 563 /* skip displaying the same error again */
531 if (cpu > 0 && (PTR_ERR(event) == cpu0_err)) 564 if (cpu > 0 && (PTR_ERR(event) == cpu0_err))
532 return PTR_ERR(event); 565 return PTR_ERR(event);
@@ -540,6 +573,9 @@ handle_err:
540 else 573 else
541 pr_err("disabled (cpu%i): unable to create perf event: %ld\n", 574 pr_err("disabled (cpu%i): unable to create perf event: %ld\n",
542 cpu, PTR_ERR(event)); 575 cpu, PTR_ERR(event));
576
577 pr_info("Shutting down hard lockup detector on all cpus\n");
578
543 return PTR_ERR(event); 579 return PTR_ERR(event);
544 580
545 /* success path */ 581 /* success path */
@@ -567,9 +603,37 @@ static void watchdog_nmi_disable(unsigned int cpu)
567 cpu0_err = 0; 603 cpu0_err = 0;
568 } 604 }
569} 605}
606
607void watchdog_nmi_enable_all(void)
608{
609 int cpu;
610
611 if (!watchdog_user_enabled)
612 return;
613
614 get_online_cpus();
615 for_each_online_cpu(cpu)
616 watchdog_nmi_enable(cpu);
617 put_online_cpus();
618}
619
620void watchdog_nmi_disable_all(void)
621{
622 int cpu;
623
624 if (!watchdog_running)
625 return;
626
627 get_online_cpus();
628 for_each_online_cpu(cpu)
629 watchdog_nmi_disable(cpu);
630 put_online_cpus();
631}
570#else 632#else
571static int watchdog_nmi_enable(unsigned int cpu) { return 0; } 633static int watchdog_nmi_enable(unsigned int cpu) { return 0; }
572static void watchdog_nmi_disable(unsigned int cpu) { return; } 634static void watchdog_nmi_disable(unsigned int cpu) { return; }
635void watchdog_nmi_enable_all(void) {}
636void watchdog_nmi_disable_all(void) {}
573#endif /* CONFIG_HARDLOCKUP_DETECTOR */ 637#endif /* CONFIG_HARDLOCKUP_DETECTOR */
574 638
575static struct smp_hotplug_thread watchdog_threads = { 639static struct smp_hotplug_thread watchdog_threads = {
@@ -600,7 +664,7 @@ static void restart_watchdog_hrtimer(void *info)
600 HRTIMER_MODE_REL_PINNED); 664 HRTIMER_MODE_REL_PINNED);
601} 665}
602 666
603static void update_timers(int cpu) 667static void update_watchdog(int cpu)
604{ 668{
605 /* 669 /*
606	 * Make sure that perf event counter will adapt to a new 670	 * Make sure that perf event counter will adapt to a new
@@ -615,17 +679,17 @@ static void update_timers(int cpu)
615 watchdog_nmi_enable(cpu); 679 watchdog_nmi_enable(cpu);
616} 680}
617 681
618static void update_timers_all_cpus(void) 682static void update_watchdog_all_cpus(void)
619{ 683{
620 int cpu; 684 int cpu;
621 685
622 get_online_cpus(); 686 get_online_cpus();
623 for_each_online_cpu(cpu) 687 for_each_online_cpu(cpu)
624 update_timers(cpu); 688 update_watchdog(cpu);
625 put_online_cpus(); 689 put_online_cpus();
626} 690}
627 691
628static int watchdog_enable_all_cpus(bool sample_period_changed) 692static int watchdog_enable_all_cpus(void)
629{ 693{
630 int err = 0; 694 int err = 0;
631 695
@@ -635,8 +699,12 @@ static int watchdog_enable_all_cpus(bool sample_period_changed)
635 pr_err("Failed to create watchdog threads, disabled\n"); 699 pr_err("Failed to create watchdog threads, disabled\n");
636 else 700 else
637 watchdog_running = 1; 701 watchdog_running = 1;
638 } else if (sample_period_changed) { 702 } else {
639 update_timers_all_cpus(); 703 /*
704 * Enable/disable the lockup detectors or
705 * change the sample period 'on the fly'.
706 */
707 update_watchdog_all_cpus();
640 } 708 }
641 709
642 return err; 710 return err;
@@ -654,58 +722,159 @@ static void watchdog_disable_all_cpus(void)
654} 722}
655 723
656/* 724/*
657 * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh 725 * Update the run state of the lockup detectors.
658 */ 726 */
727static int proc_watchdog_update(void)
728{
729 int err = 0;
659 730
660int proc_dowatchdog(struct ctl_table *table, int write, 731 /*
661 void __user *buffer, size_t *lenp, loff_t *ppos) 732 * Watchdog threads won't be started if they are already active.
733 * The 'watchdog_running' variable in watchdog_*_all_cpus() takes
734 * care of this. If those threads are already active, the sample
735 * period will be updated and the lockup detectors will be enabled
736 * or disabled 'on the fly'.
737 */
738 if (watchdog_enabled && watchdog_thresh)
739 err = watchdog_enable_all_cpus();
740 else
741 watchdog_disable_all_cpus();
742
743 return err;
744
745}
746
747static DEFINE_MUTEX(watchdog_proc_mutex);
748
749/*
750 * common function for watchdog, nmi_watchdog and soft_watchdog parameter
751 *
752 * caller | table->data points to | 'which' contains the flag(s)
753 * -------------------|-----------------------|-----------------------------
754 * proc_watchdog | watchdog_user_enabled | NMI_WATCHDOG_ENABLED or'ed
755 * | | with SOFT_WATCHDOG_ENABLED
756 * -------------------|-----------------------|-----------------------------
757 * proc_nmi_watchdog | nmi_watchdog_enabled | NMI_WATCHDOG_ENABLED
758 * -------------------|-----------------------|-----------------------------
759 * proc_soft_watchdog | soft_watchdog_enabled | SOFT_WATCHDOG_ENABLED
760 */
761static int proc_watchdog_common(int which, struct ctl_table *table, int write,
762 void __user *buffer, size_t *lenp, loff_t *ppos)
662{ 763{
663 int err, old_thresh, old_enabled; 764 int err, old, new;
664 bool old_hardlockup; 765 int *watchdog_param = (int *)table->data;
665 static DEFINE_MUTEX(watchdog_proc_mutex);
666 766
667 mutex_lock(&watchdog_proc_mutex); 767 mutex_lock(&watchdog_proc_mutex);
668 old_thresh = ACCESS_ONCE(watchdog_thresh);
669 old_enabled = ACCESS_ONCE(watchdog_user_enabled);
670 old_hardlockup = watchdog_hardlockup_detector_is_enabled();
671 768
672 err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
673 if (err || !write)
674 goto out;
675
676 set_sample_period();
677 /* 769 /*
678 * Watchdog threads shouldn't be enabled if they are 770 * If the parameter is being read return the state of the corresponding
679 * disabled. The 'watchdog_running' variable check in 771 * bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the
680 * watchdog_*_all_cpus() function takes care of this. 772 * run state of the lockup detectors.
681 */ 773 */
682 if (watchdog_user_enabled && watchdog_thresh) { 774 if (!write) {
775 *watchdog_param = (watchdog_enabled & which) != 0;
776 err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
777 } else {
778 err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
779 if (err)
780 goto out;
781
683 /* 782 /*
684 * Prevent a change in watchdog_thresh accidentally overriding 783 * There is a race window between fetching the current value
685 * the enablement of the hardlockup detector. 784 * from 'watchdog_enabled' and storing the new value. During
785 * this race window, watchdog_nmi_enable() can sneak in and
786 * clear the NMI_WATCHDOG_ENABLED bit in 'watchdog_enabled'.
787 * The 'cmpxchg' detects this race and the loop retries.
686 */ 788 */
687 if (watchdog_user_enabled != old_enabled) 789 do {
688 watchdog_enable_hardlockup_detector(true); 790 old = watchdog_enabled;
689 err = watchdog_enable_all_cpus(old_thresh != watchdog_thresh); 791 /*
690 } else 792 * If the parameter value is not zero set the
691 watchdog_disable_all_cpus(); 793 * corresponding bit(s), else clear it(them).
794 */
795 if (*watchdog_param)
796 new = old | which;
797 else
798 new = old & ~which;
799 } while (cmpxchg(&watchdog_enabled, old, new) != old);
692 800
693 /* Restore old values on failure */ 801 /*
694 if (err) { 802 * Update the run state of the lockup detectors.
695 watchdog_thresh = old_thresh; 803 * Restore 'watchdog_enabled' on failure.
696 watchdog_user_enabled = old_enabled; 804 */
697 watchdog_enable_hardlockup_detector(old_hardlockup); 805 err = proc_watchdog_update();
806 if (err)
807 watchdog_enabled = old;
698 } 808 }
699out: 809out:
700 mutex_unlock(&watchdog_proc_mutex); 810 mutex_unlock(&watchdog_proc_mutex);
701 return err; 811 return err;
702} 812}
813
814/*
815 * /proc/sys/kernel/watchdog
816 */
817int proc_watchdog(struct ctl_table *table, int write,
818 void __user *buffer, size_t *lenp, loff_t *ppos)
819{
820 return proc_watchdog_common(NMI_WATCHDOG_ENABLED|SOFT_WATCHDOG_ENABLED,
821 table, write, buffer, lenp, ppos);
822}
823
824/*
825 * /proc/sys/kernel/nmi_watchdog
826 */
827int proc_nmi_watchdog(struct ctl_table *table, int write,
828 void __user *buffer, size_t *lenp, loff_t *ppos)
829{
830 return proc_watchdog_common(NMI_WATCHDOG_ENABLED,
831 table, write, buffer, lenp, ppos);
832}
833
834/*
835 * /proc/sys/kernel/soft_watchdog
836 */
837int proc_soft_watchdog(struct ctl_table *table, int write,
838 void __user *buffer, size_t *lenp, loff_t *ppos)
839{
840 return proc_watchdog_common(SOFT_WATCHDOG_ENABLED,
841 table, write, buffer, lenp, ppos);
842}
843
844/*
845 * /proc/sys/kernel/watchdog_thresh
846 */
847int proc_watchdog_thresh(struct ctl_table *table, int write,
848 void __user *buffer, size_t *lenp, loff_t *ppos)
849{
850 int err, old;
851
852 mutex_lock(&watchdog_proc_mutex);
853
854 old = ACCESS_ONCE(watchdog_thresh);
855 err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
856
857 if (err || !write)
858 goto out;
859
860 /*
861 * Update the sample period.
862 * Restore 'watchdog_thresh' on failure.
863 */
864 set_sample_period();
865 err = proc_watchdog_update();
866 if (err)
867 watchdog_thresh = old;
868out:
869 mutex_unlock(&watchdog_proc_mutex);
870 return err;
871}
703#endif /* CONFIG_SYSCTL */ 872#endif /* CONFIG_SYSCTL */
704 873
705void __init lockup_detector_init(void) 874void __init lockup_detector_init(void)
706{ 875{
707 set_sample_period(); 876 set_sample_period();
708 877
709 if (watchdog_user_enabled) 878 if (watchdog_enabled)
710 watchdog_enable_all_cpus(false); 879 watchdog_enable_all_cpus();
711} 880}
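Taken together, the watchdog.c changes replace several booleans with one word of flag bits plus a set of /proc handlers that toggle their bit and then call a single update function. The user-space C fragment below is only a hedged approximation of that flow: apply_state() and write_knob() are invented names, and GCC/Clang's __sync_val_compare_and_swap stands in for the cmpxchg() retry loop in proc_watchdog_common().

#include <stdio.h>

#define NMI_WATCHDOG_ENABLED  (1UL << 0)
#define SOFT_WATCHDOG_ENABLED (1UL << 1)

static unsigned long watchdog_enabled =
	NMI_WATCHDOG_ENABLED | SOFT_WATCHDOG_ENABLED;

/* Stand-in for proc_watchdog_update(): react to the current flag word. */
static void apply_state(void)
{
	printf("hard:%d soft:%d\n",
	       !!(watchdog_enabled & NMI_WATCHDOG_ENABLED),
	       !!(watchdog_enabled & SOFT_WATCHDOG_ENABLED));
}

/* Set or clear 'which' atomically, retrying if another path (such as a
 * failed perf-event setup clearing the NMI bit) raced with us. */
static void write_knob(unsigned long which, int value)
{
	unsigned long old, new;

	do {
		old = watchdog_enabled;
		new = value ? (old | which) : (old & ~which);
	} while (__sync_val_compare_and_swap(&watchdog_enabled, old, new) != old);

	apply_state();
}

int main(void)
{
	write_knob(NMI_WATCHDOG_ENABLED, 0);	/* nmi_watchdog = 0 */
	write_knob(SOFT_WATCHDOG_ENABLED, 0);	/* soft_watchdog = 0 */
	write_knob(NMI_WATCHDOG_ENABLED | SOFT_WATCHDOG_ENABLED, 1); /* watchdog = 1 */
	return 0;
}
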
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 41ff75b478c6..586ad91300b0 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -159,6 +159,7 @@ struct worker_pool {
159 159
160 /* see manage_workers() for details on the two manager mutexes */ 160 /* see manage_workers() for details on the two manager mutexes */
161 struct mutex manager_arb; /* manager arbitration */ 161 struct mutex manager_arb; /* manager arbitration */
162 struct worker *manager; /* L: purely informational */
162 struct mutex attach_mutex; /* attach/detach exclusion */ 163 struct mutex attach_mutex; /* attach/detach exclusion */
163 struct list_head workers; /* A: attached workers */ 164 struct list_head workers; /* A: attached workers */
164 struct completion *detach_completion; /* all workers detached */ 165 struct completion *detach_completion; /* all workers detached */
@@ -230,7 +231,7 @@ struct wq_device;
230 */ 231 */
231struct workqueue_struct { 232struct workqueue_struct {
232 struct list_head pwqs; /* WR: all pwqs of this wq */ 233 struct list_head pwqs; /* WR: all pwqs of this wq */
233 struct list_head list; /* PL: list of all workqueues */ 234 struct list_head list; /* PR: list of all workqueues */
234 235
235 struct mutex mutex; /* protects this wq */ 236 struct mutex mutex; /* protects this wq */
236 int work_color; /* WQ: current work color */ 237 int work_color; /* WQ: current work color */
@@ -257,6 +258,13 @@ struct workqueue_struct {
257#endif 258#endif
258 char name[WQ_NAME_LEN]; /* I: workqueue name */ 259 char name[WQ_NAME_LEN]; /* I: workqueue name */
259 260
261 /*
262 * Destruction of workqueue_struct is sched-RCU protected to allow
263 * walking the workqueues list without grabbing wq_pool_mutex.
264 * This is used to dump all workqueues from sysrq.
265 */
266 struct rcu_head rcu;
267
260 /* hot fields used during command issue, aligned to cacheline */ 268 /* hot fields used during command issue, aligned to cacheline */
261 unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */ 269 unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */
262 struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */ 270 struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */
@@ -288,7 +296,7 @@ static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf;
288static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ 296static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */
289static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ 297static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */
290 298
291static LIST_HEAD(workqueues); /* PL: list of all workqueues */ 299static LIST_HEAD(workqueues); /* PR: list of all workqueues */
292static bool workqueue_freezing; /* PL: have wqs started freezing? */ 300static bool workqueue_freezing; /* PL: have wqs started freezing? */
293 301
294/* the per-cpu worker pools */ 302/* the per-cpu worker pools */
@@ -324,6 +332,7 @@ EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
324static int worker_thread(void *__worker); 332static int worker_thread(void *__worker);
325static void copy_workqueue_attrs(struct workqueue_attrs *to, 333static void copy_workqueue_attrs(struct workqueue_attrs *to,
326 const struct workqueue_attrs *from); 334 const struct workqueue_attrs *from);
335static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
327 336
328#define CREATE_TRACE_POINTS 337#define CREATE_TRACE_POINTS
329#include <trace/events/workqueue.h> 338#include <trace/events/workqueue.h>
@@ -1911,9 +1920,11 @@ static bool manage_workers(struct worker *worker)
1911 */ 1920 */
1912 if (!mutex_trylock(&pool->manager_arb)) 1921 if (!mutex_trylock(&pool->manager_arb))
1913 return false; 1922 return false;
1923 pool->manager = worker;
1914 1924
1915 maybe_create_worker(pool); 1925 maybe_create_worker(pool);
1916 1926
1927 pool->manager = NULL;
1917 mutex_unlock(&pool->manager_arb); 1928 mutex_unlock(&pool->manager_arb);
1918 return true; 1929 return true;
1919} 1930}
@@ -2303,6 +2314,7 @@ repeat:
2303struct wq_barrier { 2314struct wq_barrier {
2304 struct work_struct work; 2315 struct work_struct work;
2305 struct completion done; 2316 struct completion done;
2317 struct task_struct *task; /* purely informational */
2306}; 2318};
2307 2319
2308static void wq_barrier_func(struct work_struct *work) 2320static void wq_barrier_func(struct work_struct *work)
@@ -2351,6 +2363,7 @@ static void insert_wq_barrier(struct pool_workqueue *pwq,
2351 INIT_WORK_ONSTACK(&barr->work, wq_barrier_func); 2363 INIT_WORK_ONSTACK(&barr->work, wq_barrier_func);
2352 __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); 2364 __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
2353 init_completion(&barr->done); 2365 init_completion(&barr->done);
2366 barr->task = current;
2354 2367
2355 /* 2368 /*
2356 * If @target is currently being executed, schedule the 2369 * If @target is currently being executed, schedule the
@@ -2989,323 +3002,6 @@ int execute_in_process_context(work_func_t fn, struct execute_work *ew)
2989} 3002}
2990EXPORT_SYMBOL_GPL(execute_in_process_context); 3003EXPORT_SYMBOL_GPL(execute_in_process_context);
2991 3004
2992#ifdef CONFIG_SYSFS
2993/*
2994 * Workqueues with WQ_SYSFS flag set is visible to userland via
2995 * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the
2996 * following attributes.
2997 *
2998 * per_cpu RO bool : whether the workqueue is per-cpu or unbound
2999 * max_active RW int : maximum number of in-flight work items
3000 *
3001 * Unbound workqueues have the following extra attributes.
3002 *
3003 * id RO int : the associated pool ID
3004 * nice RW int : nice value of the workers
3005 * cpumask RW mask : bitmask of allowed CPUs for the workers
3006 */
3007struct wq_device {
3008 struct workqueue_struct *wq;
3009 struct device dev;
3010};
3011
3012static struct workqueue_struct *dev_to_wq(struct device *dev)
3013{
3014 struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
3015
3016 return wq_dev->wq;
3017}
3018
3019static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr,
3020 char *buf)
3021{
3022 struct workqueue_struct *wq = dev_to_wq(dev);
3023
3024 return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND));
3025}
3026static DEVICE_ATTR_RO(per_cpu);
3027
3028static ssize_t max_active_show(struct device *dev,
3029 struct device_attribute *attr, char *buf)
3030{
3031 struct workqueue_struct *wq = dev_to_wq(dev);
3032
3033 return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active);
3034}
3035
3036static ssize_t max_active_store(struct device *dev,
3037 struct device_attribute *attr, const char *buf,
3038 size_t count)
3039{
3040 struct workqueue_struct *wq = dev_to_wq(dev);
3041 int val;
3042
3043 if (sscanf(buf, "%d", &val) != 1 || val <= 0)
3044 return -EINVAL;
3045
3046 workqueue_set_max_active(wq, val);
3047 return count;
3048}
3049static DEVICE_ATTR_RW(max_active);
3050
3051static struct attribute *wq_sysfs_attrs[] = {
3052 &dev_attr_per_cpu.attr,
3053 &dev_attr_max_active.attr,
3054 NULL,
3055};
3056ATTRIBUTE_GROUPS(wq_sysfs);
3057
3058static ssize_t wq_pool_ids_show(struct device *dev,
3059 struct device_attribute *attr, char *buf)
3060{
3061 struct workqueue_struct *wq = dev_to_wq(dev);
3062 const char *delim = "";
3063 int node, written = 0;
3064
3065 rcu_read_lock_sched();
3066 for_each_node(node) {
3067 written += scnprintf(buf + written, PAGE_SIZE - written,
3068 "%s%d:%d", delim, node,
3069 unbound_pwq_by_node(wq, node)->pool->id);
3070 delim = " ";
3071 }
3072 written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
3073 rcu_read_unlock_sched();
3074
3075 return written;
3076}
3077
3078static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr,
3079 char *buf)
3080{
3081 struct workqueue_struct *wq = dev_to_wq(dev);
3082 int written;
3083
3084 mutex_lock(&wq->mutex);
3085 written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice);
3086 mutex_unlock(&wq->mutex);
3087
3088 return written;
3089}
3090
3091/* prepare workqueue_attrs for sysfs store operations */
3092static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq)
3093{
3094 struct workqueue_attrs *attrs;
3095
3096 attrs = alloc_workqueue_attrs(GFP_KERNEL);
3097 if (!attrs)
3098 return NULL;
3099
3100 mutex_lock(&wq->mutex);
3101 copy_workqueue_attrs(attrs, wq->unbound_attrs);
3102 mutex_unlock(&wq->mutex);
3103 return attrs;
3104}
3105
3106static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr,
3107 const char *buf, size_t count)
3108{
3109 struct workqueue_struct *wq = dev_to_wq(dev);
3110 struct workqueue_attrs *attrs;
3111 int ret;
3112
3113 attrs = wq_sysfs_prep_attrs(wq);
3114 if (!attrs)
3115 return -ENOMEM;
3116
3117 if (sscanf(buf, "%d", &attrs->nice) == 1 &&
3118 attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE)
3119 ret = apply_workqueue_attrs(wq, attrs);
3120 else
3121 ret = -EINVAL;
3122
3123 free_workqueue_attrs(attrs);
3124 return ret ?: count;
3125}
3126
3127static ssize_t wq_cpumask_show(struct device *dev,
3128 struct device_attribute *attr, char *buf)
3129{
3130 struct workqueue_struct *wq = dev_to_wq(dev);
3131 int written;
3132
3133 mutex_lock(&wq->mutex);
3134 written = scnprintf(buf, PAGE_SIZE, "%*pb\n",
3135 cpumask_pr_args(wq->unbound_attrs->cpumask));
3136 mutex_unlock(&wq->mutex);
3137 return written;
3138}
3139
3140static ssize_t wq_cpumask_store(struct device *dev,
3141 struct device_attribute *attr,
3142 const char *buf, size_t count)
3143{
3144 struct workqueue_struct *wq = dev_to_wq(dev);
3145 struct workqueue_attrs *attrs;
3146 int ret;
3147
3148 attrs = wq_sysfs_prep_attrs(wq);
3149 if (!attrs)
3150 return -ENOMEM;
3151
3152 ret = cpumask_parse(buf, attrs->cpumask);
3153 if (!ret)
3154 ret = apply_workqueue_attrs(wq, attrs);
3155
3156 free_workqueue_attrs(attrs);
3157 return ret ?: count;
3158}
3159
3160static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr,
3161 char *buf)
3162{
3163 struct workqueue_struct *wq = dev_to_wq(dev);
3164 int written;
3165
3166 mutex_lock(&wq->mutex);
3167 written = scnprintf(buf, PAGE_SIZE, "%d\n",
3168 !wq->unbound_attrs->no_numa);
3169 mutex_unlock(&wq->mutex);
3170
3171 return written;
3172}
3173
3174static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr,
3175 const char *buf, size_t count)
3176{
3177 struct workqueue_struct *wq = dev_to_wq(dev);
3178 struct workqueue_attrs *attrs;
3179 int v, ret;
3180
3181 attrs = wq_sysfs_prep_attrs(wq);
3182 if (!attrs)
3183 return -ENOMEM;
3184
3185 ret = -EINVAL;
3186 if (sscanf(buf, "%d", &v) == 1) {
3187 attrs->no_numa = !v;
3188 ret = apply_workqueue_attrs(wq, attrs);
3189 }
3190
3191 free_workqueue_attrs(attrs);
3192 return ret ?: count;
3193}
3194
3195static struct device_attribute wq_sysfs_unbound_attrs[] = {
3196 __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL),
3197 __ATTR(nice, 0644, wq_nice_show, wq_nice_store),
3198 __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store),
3199 __ATTR(numa, 0644, wq_numa_show, wq_numa_store),
3200 __ATTR_NULL,
3201};
3202
3203static struct bus_type wq_subsys = {
3204 .name = "workqueue",
3205 .dev_groups = wq_sysfs_groups,
3206};
3207
3208static int __init wq_sysfs_init(void)
3209{
3210 return subsys_virtual_register(&wq_subsys, NULL);
3211}
3212core_initcall(wq_sysfs_init);
3213
3214static void wq_device_release(struct device *dev)
3215{
3216 struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
3217
3218 kfree(wq_dev);
3219}
3220
3221/**
3222 * workqueue_sysfs_register - make a workqueue visible in sysfs
3223 * @wq: the workqueue to register
3224 *
3225 * Expose @wq in sysfs under /sys/bus/workqueue/devices.
3226 * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set
3227 * which is the preferred method.
3228 *
3229 * Workqueue user should use this function directly iff it wants to apply
3230 * workqueue_attrs before making the workqueue visible in sysfs; otherwise,
3231 * apply_workqueue_attrs() may race against userland updating the
3232 * attributes.
3233 *
3234 * Return: 0 on success, -errno on failure.
3235 */
3236int workqueue_sysfs_register(struct workqueue_struct *wq)
3237{
3238 struct wq_device *wq_dev;
3239 int ret;
3240
3241 /*
3242	 * Adjusting max_active or creating new pwqs by applying
3243 * attributes breaks ordering guarantee. Disallow exposing ordered
3244 * workqueues.
3245 */
3246 if (WARN_ON(wq->flags & __WQ_ORDERED))
3247 return -EINVAL;
3248
3249 wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL);
3250 if (!wq_dev)
3251 return -ENOMEM;
3252
3253 wq_dev->wq = wq;
3254 wq_dev->dev.bus = &wq_subsys;
3255 wq_dev->dev.init_name = wq->name;
3256 wq_dev->dev.release = wq_device_release;
3257
3258 /*
3259 * unbound_attrs are created separately. Suppress uevent until
3260 * everything is ready.
3261 */
3262 dev_set_uevent_suppress(&wq_dev->dev, true);
3263
3264 ret = device_register(&wq_dev->dev);
3265 if (ret) {
3266 kfree(wq_dev);
3267 wq->wq_dev = NULL;
3268 return ret;
3269 }
3270
3271 if (wq->flags & WQ_UNBOUND) {
3272 struct device_attribute *attr;
3273
3274 for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) {
3275 ret = device_create_file(&wq_dev->dev, attr);
3276 if (ret) {
3277 device_unregister(&wq_dev->dev);
3278 wq->wq_dev = NULL;
3279 return ret;
3280 }
3281 }
3282 }
3283
3284 dev_set_uevent_suppress(&wq_dev->dev, false);
3285 kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD);
3286 return 0;
3287}
3288
3289/**
3290 * workqueue_sysfs_unregister - undo workqueue_sysfs_register()
3291 * @wq: the workqueue to unregister
3292 *
3293 * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister.
3294 */
3295static void workqueue_sysfs_unregister(struct workqueue_struct *wq)
3296{
3297 struct wq_device *wq_dev = wq->wq_dev;
3298
3299 if (!wq->wq_dev)
3300 return;
3301
3302 wq->wq_dev = NULL;
3303 device_unregister(&wq_dev->dev);
3304}
3305#else /* CONFIG_SYSFS */
3306static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { }
3307#endif /* CONFIG_SYSFS */
3308
3309/** 3005/**
3310 * free_workqueue_attrs - free a workqueue_attrs 3006 * free_workqueue_attrs - free a workqueue_attrs
3311 * @attrs: workqueue_attrs to free 3007 * @attrs: workqueue_attrs to free
@@ -3424,6 +3120,20 @@ static int init_worker_pool(struct worker_pool *pool)
3424 return 0; 3120 return 0;
3425} 3121}
3426 3122
3123static void rcu_free_wq(struct rcu_head *rcu)
3124{
3125 struct workqueue_struct *wq =
3126 container_of(rcu, struct workqueue_struct, rcu);
3127
3128 if (!(wq->flags & WQ_UNBOUND))
3129 free_percpu(wq->cpu_pwqs);
3130 else
3131 free_workqueue_attrs(wq->unbound_attrs);
3132
3133 kfree(wq->rescuer);
3134 kfree(wq);
3135}
3136
3427static void rcu_free_pool(struct rcu_head *rcu) 3137static void rcu_free_pool(struct rcu_head *rcu)
3428{ 3138{
3429 struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu); 3139 struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu);
@@ -3601,12 +3311,10 @@ static void pwq_unbound_release_workfn(struct work_struct *work)
3601 3311
3602 /* 3312 /*
3603 * If we're the last pwq going away, @wq is already dead and no one 3313 * If we're the last pwq going away, @wq is already dead and no one
3604 * is gonna access it anymore. Free it. 3314 * is gonna access it anymore. Schedule RCU free.
3605 */ 3315 */
3606 if (is_last) { 3316 if (is_last)
3607 free_workqueue_attrs(wq->unbound_attrs); 3317 call_rcu_sched(&wq->rcu, rcu_free_wq);
3608 kfree(wq);
3609 }
3610} 3318}
3611 3319
3612/** 3320/**
@@ -4143,7 +3851,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
4143 pwq_adjust_max_active(pwq); 3851 pwq_adjust_max_active(pwq);
4144 mutex_unlock(&wq->mutex); 3852 mutex_unlock(&wq->mutex);
4145 3853
4146 list_add(&wq->list, &workqueues); 3854 list_add_tail_rcu(&wq->list, &workqueues);
4147 3855
4148 mutex_unlock(&wq_pool_mutex); 3856 mutex_unlock(&wq_pool_mutex);
4149 3857
@@ -4199,24 +3907,20 @@ void destroy_workqueue(struct workqueue_struct *wq)
4199 * flushing is complete in case freeze races us. 3907 * flushing is complete in case freeze races us.
4200 */ 3908 */
4201 mutex_lock(&wq_pool_mutex); 3909 mutex_lock(&wq_pool_mutex);
4202 list_del_init(&wq->list); 3910 list_del_rcu(&wq->list);
4203 mutex_unlock(&wq_pool_mutex); 3911 mutex_unlock(&wq_pool_mutex);
4204 3912
4205 workqueue_sysfs_unregister(wq); 3913 workqueue_sysfs_unregister(wq);
4206 3914
4207 if (wq->rescuer) { 3915 if (wq->rescuer)
4208 kthread_stop(wq->rescuer->task); 3916 kthread_stop(wq->rescuer->task);
4209 kfree(wq->rescuer);
4210 wq->rescuer = NULL;
4211 }
4212 3917
4213 if (!(wq->flags & WQ_UNBOUND)) { 3918 if (!(wq->flags & WQ_UNBOUND)) {
4214 /* 3919 /*
4215 * The base ref is never dropped on per-cpu pwqs. Directly 3920 * The base ref is never dropped on per-cpu pwqs. Directly
4216 * free the pwqs and wq. 3921 * schedule RCU free.
4217 */ 3922 */
4218 free_percpu(wq->cpu_pwqs); 3923 call_rcu_sched(&wq->rcu, rcu_free_wq);
4219 kfree(wq);
4220 } else { 3924 } else {
4221 /* 3925 /*
4222 * We're the sole accessor of @wq at this point. Directly 3926 * We're the sole accessor of @wq at this point. Directly
@@ -4437,6 +4141,166 @@ void print_worker_info(const char *log_lvl, struct task_struct *task)
         }
 }
 
+static void pr_cont_pool_info(struct worker_pool *pool)
+{
+        pr_cont(" cpus=%*pbl", nr_cpumask_bits, pool->attrs->cpumask);
+        if (pool->node != NUMA_NO_NODE)
+                pr_cont(" node=%d", pool->node);
+        pr_cont(" flags=0x%x nice=%d", pool->flags, pool->attrs->nice);
+}
+
+static void pr_cont_work(bool comma, struct work_struct *work)
+{
+        if (work->func == wq_barrier_func) {
+                struct wq_barrier *barr;
+
+                barr = container_of(work, struct wq_barrier, work);
+
+                pr_cont("%s BAR(%d)", comma ? "," : "",
+                        task_pid_nr(barr->task));
+        } else {
+                pr_cont("%s %pf", comma ? "," : "", work->func);
+        }
+}
+
+static void show_pwq(struct pool_workqueue *pwq)
+{
+        struct worker_pool *pool = pwq->pool;
+        struct work_struct *work;
+        struct worker *worker;
+        bool has_in_flight = false, has_pending = false;
+        int bkt;
+
+        pr_info(" pwq %d:", pool->id);
+        pr_cont_pool_info(pool);
+
+        pr_cont(" active=%d/%d%s\n", pwq->nr_active, pwq->max_active,
+                !list_empty(&pwq->mayday_node) ? " MAYDAY" : "");
+
+        hash_for_each(pool->busy_hash, bkt, worker, hentry) {
+                if (worker->current_pwq == pwq) {
+                        has_in_flight = true;
+                        break;
+                }
+        }
+        if (has_in_flight) {
+                bool comma = false;
+
+                pr_info(" in-flight:");
+                hash_for_each(pool->busy_hash, bkt, worker, hentry) {
+                        if (worker->current_pwq != pwq)
+                                continue;
+
+                        pr_cont("%s %d%s:%pf", comma ? "," : "",
+                                task_pid_nr(worker->task),
+                                worker == pwq->wq->rescuer ? "(RESCUER)" : "",
+                                worker->current_func);
+                        list_for_each_entry(work, &worker->scheduled, entry)
+                                pr_cont_work(false, work);
+                        comma = true;
+                }
+                pr_cont("\n");
+        }
+
+        list_for_each_entry(work, &pool->worklist, entry) {
+                if (get_work_pwq(work) == pwq) {
+                        has_pending = true;
+                        break;
+                }
+        }
+        if (has_pending) {
+                bool comma = false;
+
+                pr_info(" pending:");
+                list_for_each_entry(work, &pool->worklist, entry) {
+                        if (get_work_pwq(work) != pwq)
+                                continue;
+
+                        pr_cont_work(comma, work);
+                        comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
+                }
+                pr_cont("\n");
+        }
+
+        if (!list_empty(&pwq->delayed_works)) {
+                bool comma = false;
+
+                pr_info(" delayed:");
+                list_for_each_entry(work, &pwq->delayed_works, entry) {
+                        pr_cont_work(comma, work);
+                        comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
+                }
+                pr_cont("\n");
+        }
+}
+
+/**
+ * show_workqueue_state - dump workqueue state
+ *
+ * Called from a sysrq handler and prints out all busy workqueues and
+ * pools.
+ */
+void show_workqueue_state(void)
+{
+        struct workqueue_struct *wq;
+        struct worker_pool *pool;
+        unsigned long flags;
+        int pi;
+
+        rcu_read_lock_sched();
+
+        pr_info("Showing busy workqueues and worker pools:\n");
+
+        list_for_each_entry_rcu(wq, &workqueues, list) {
+                struct pool_workqueue *pwq;
+                bool idle = true;
+
+                for_each_pwq(pwq, wq) {
+                        if (pwq->nr_active || !list_empty(&pwq->delayed_works)) {
+                                idle = false;
+                                break;
+                        }
+                }
+                if (idle)
+                        continue;
+
+                pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags);
+
+                for_each_pwq(pwq, wq) {
+                        spin_lock_irqsave(&pwq->pool->lock, flags);
+                        if (pwq->nr_active || !list_empty(&pwq->delayed_works))
+                                show_pwq(pwq);
+                        spin_unlock_irqrestore(&pwq->pool->lock, flags);
+                }
+        }
+
+        for_each_pool(pool, pi) {
+                struct worker *worker;
+                bool first = true;
+
+                spin_lock_irqsave(&pool->lock, flags);
+                if (pool->nr_workers == pool->nr_idle)
+                        goto next_pool;
+
+                pr_info("pool %d:", pool->id);
+                pr_cont_pool_info(pool);
+                pr_cont(" workers=%d", pool->nr_workers);
+                if (pool->manager)
+                        pr_cont(" manager: %d",
+                                task_pid_nr(pool->manager->task));
+                list_for_each_entry(worker, &pool->idle_list, entry) {
+                        pr_cont(" %s%d", first ? "idle: " : "",
+                                task_pid_nr(worker->task));
+                        first = false;
+                }
+                pr_cont("\n");
+        next_pool:
+                spin_unlock_irqrestore(&pool->lock, flags);
+        }
+
+        rcu_read_unlock_sched();
+}
+
 /*
  * CPU hotplug.
  *
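show_workqueue_state() is meant to be driven from a sysrq handler; this patch only provides the dump function and its RCU-safe list walk. A hypothetical way to wire it up, assuming the usual sysrq_key_op interface (the key choice, names and strings are made up):

#include <linux/init.h>
#include <linux/sysrq.h>
#include <linux/workqueue.h>

static void my_sysrq_show_workqueues(int key)
{
        show_workqueue_state();         /* dumps busy pwqs and worker pools */
}

static struct sysrq_key_op my_show_wq_op = {
        .handler        = my_sysrq_show_workqueues,
        .help_msg       = "show-workqueue-state(x)",
        .action_msg     = "Show workqueue state",
        .enable_mask    = SYSRQ_ENABLE_DUMP,
};

static int __init my_sysrq_wq_init(void)
{
        /* hypothetical: bind the dump to an otherwise unused sysrq key */
        return register_sysrq_key('x', &my_show_wq_op);
}
late_initcall(my_sysrq_wq_init);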
@@ -4834,6 +4698,323 @@ out_unlock:
 }
 #endif /* CONFIG_FREEZER */
 
+#ifdef CONFIG_SYSFS
+/*
+ * Workqueues with the WQ_SYSFS flag set are visible to userland via
+ * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the
+ * following attributes.
+ *
+ *  per_cpu     RO bool : whether the workqueue is per-cpu or unbound
+ *  max_active  RW int  : maximum number of in-flight work items
+ *
+ * Unbound workqueues have the following extra attributes.
+ *
+ *  id          RO int  : the associated pool ID
+ *  nice        RW int  : nice value of the workers
+ *  cpumask     RW mask : bitmask of allowed CPUs for the workers
+ */
+struct wq_device {
+        struct workqueue_struct *wq;
+        struct device dev;
+};
+
+static struct workqueue_struct *dev_to_wq(struct device *dev)
+{
+        struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
+
+        return wq_dev->wq;
+}
+
+static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr,
+                            char *buf)
+{
+        struct workqueue_struct *wq = dev_to_wq(dev);
+
+        return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND));
+}
+static DEVICE_ATTR_RO(per_cpu);
+
+static ssize_t max_active_show(struct device *dev,
+                               struct device_attribute *attr, char *buf)
+{
+        struct workqueue_struct *wq = dev_to_wq(dev);
+
+        return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active);
+}
+
+static ssize_t max_active_store(struct device *dev,
+                                struct device_attribute *attr, const char *buf,
+                                size_t count)
+{
+        struct workqueue_struct *wq = dev_to_wq(dev);
+        int val;
+
+        if (sscanf(buf, "%d", &val) != 1 || val <= 0)
+                return -EINVAL;
+
+        workqueue_set_max_active(wq, val);
+        return count;
+}
+static DEVICE_ATTR_RW(max_active);
+
+static struct attribute *wq_sysfs_attrs[] = {
+        &dev_attr_per_cpu.attr,
+        &dev_attr_max_active.attr,
+        NULL,
+};
+ATTRIBUTE_GROUPS(wq_sysfs);
+
+static ssize_t wq_pool_ids_show(struct device *dev,
+                                struct device_attribute *attr, char *buf)
+{
+        struct workqueue_struct *wq = dev_to_wq(dev);
+        const char *delim = "";
+        int node, written = 0;
+
+        rcu_read_lock_sched();
+        for_each_node(node) {
+                written += scnprintf(buf + written, PAGE_SIZE - written,
+                                     "%s%d:%d", delim, node,
+                                     unbound_pwq_by_node(wq, node)->pool->id);
+                delim = " ";
+        }
+        written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
+        rcu_read_unlock_sched();
+
+        return written;
+}
+
+static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr,
+                            char *buf)
+{
+        struct workqueue_struct *wq = dev_to_wq(dev);
+        int written;
+
+        mutex_lock(&wq->mutex);
+        written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice);
+        mutex_unlock(&wq->mutex);
+
+        return written;
+}
+
+/* prepare workqueue_attrs for sysfs store operations */
+static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq)
+{
+        struct workqueue_attrs *attrs;
+
+        attrs = alloc_workqueue_attrs(GFP_KERNEL);
+        if (!attrs)
+                return NULL;
+
+        mutex_lock(&wq->mutex);
+        copy_workqueue_attrs(attrs, wq->unbound_attrs);
+        mutex_unlock(&wq->mutex);
+        return attrs;
+}
+
+static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr,
+                             const char *buf, size_t count)
+{
+        struct workqueue_struct *wq = dev_to_wq(dev);
+        struct workqueue_attrs *attrs;
+        int ret;
+
+        attrs = wq_sysfs_prep_attrs(wq);
+        if (!attrs)
+                return -ENOMEM;
+
+        if (sscanf(buf, "%d", &attrs->nice) == 1 &&
+            attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE)
+                ret = apply_workqueue_attrs(wq, attrs);
+        else
+                ret = -EINVAL;
+
+        free_workqueue_attrs(attrs);
+        return ret ?: count;
+}
+
+static ssize_t wq_cpumask_show(struct device *dev,
+                               struct device_attribute *attr, char *buf)
+{
+        struct workqueue_struct *wq = dev_to_wq(dev);
+        int written;
+
+        mutex_lock(&wq->mutex);
+        written = scnprintf(buf, PAGE_SIZE, "%*pb\n",
+                            cpumask_pr_args(wq->unbound_attrs->cpumask));
+        mutex_unlock(&wq->mutex);
+        return written;
+}
+
+static ssize_t wq_cpumask_store(struct device *dev,
+                                struct device_attribute *attr,
+                                const char *buf, size_t count)
+{
+        struct workqueue_struct *wq = dev_to_wq(dev);
+        struct workqueue_attrs *attrs;
+        int ret;
+
+        attrs = wq_sysfs_prep_attrs(wq);
+        if (!attrs)
+                return -ENOMEM;
+
+        ret = cpumask_parse(buf, attrs->cpumask);
+        if (!ret)
+                ret = apply_workqueue_attrs(wq, attrs);
+
+        free_workqueue_attrs(attrs);
+        return ret ?: count;
+}
+
+static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr,
+                            char *buf)
+{
+        struct workqueue_struct *wq = dev_to_wq(dev);
+        int written;
+
+        mutex_lock(&wq->mutex);
+        written = scnprintf(buf, PAGE_SIZE, "%d\n",
+                            !wq->unbound_attrs->no_numa);
+        mutex_unlock(&wq->mutex);
+
+        return written;
+}
+
+static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr,
+                             const char *buf, size_t count)
+{
+        struct workqueue_struct *wq = dev_to_wq(dev);
+        struct workqueue_attrs *attrs;
+        int v, ret;
+
+        attrs = wq_sysfs_prep_attrs(wq);
+        if (!attrs)
+                return -ENOMEM;
+
+        ret = -EINVAL;
+        if (sscanf(buf, "%d", &v) == 1) {
+                attrs->no_numa = !v;
+                ret = apply_workqueue_attrs(wq, attrs);
+        }
+
+        free_workqueue_attrs(attrs);
+        return ret ?: count;
+}
+
+static struct device_attribute wq_sysfs_unbound_attrs[] = {
+        __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL),
+        __ATTR(nice, 0644, wq_nice_show, wq_nice_store),
+        __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store),
+        __ATTR(numa, 0644, wq_numa_show, wq_numa_store),
+        __ATTR_NULL,
+};
+
+static struct bus_type wq_subsys = {
+        .name = "workqueue",
+        .dev_groups = wq_sysfs_groups,
+};
+
+static int __init wq_sysfs_init(void)
+{
+        return subsys_virtual_register(&wq_subsys, NULL);
+}
+core_initcall(wq_sysfs_init);
+
+static void wq_device_release(struct device *dev)
+{
+        struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
+
+        kfree(wq_dev);
+}
+
+/**
+ * workqueue_sysfs_register - make a workqueue visible in sysfs
+ * @wq: the workqueue to register
+ *
+ * Expose @wq in sysfs under /sys/bus/workqueue/devices.
+ * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set
+ * which is the preferred method.
+ *
+ * Workqueue user should use this function directly iff it wants to apply
+ * workqueue_attrs before making the workqueue visible in sysfs; otherwise,
+ * apply_workqueue_attrs() may race against userland updating the
+ * attributes.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int workqueue_sysfs_register(struct workqueue_struct *wq)
+{
+        struct wq_device *wq_dev;
+        int ret;
+
+        /*
+         * Adjusting max_active or creating new pwqs by applying
+         * attributes breaks ordering guarantee. Disallow exposing ordered
+         * workqueues.
+         */
+        if (WARN_ON(wq->flags & __WQ_ORDERED))
+                return -EINVAL;
+
+        wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL);
+        if (!wq_dev)
+                return -ENOMEM;
+
+        wq_dev->wq = wq;
+        wq_dev->dev.bus = &wq_subsys;
+        wq_dev->dev.init_name = wq->name;
+        wq_dev->dev.release = wq_device_release;
+
+        /*
+         * unbound_attrs are created separately. Suppress uevent until
+         * everything is ready.
+         */
+        dev_set_uevent_suppress(&wq_dev->dev, true);
+
+        ret = device_register(&wq_dev->dev);
+        if (ret) {
+                kfree(wq_dev);
+                wq->wq_dev = NULL;
+                return ret;
+        }
+
+        if (wq->flags & WQ_UNBOUND) {
+                struct device_attribute *attr;
+
+                for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) {
+                        ret = device_create_file(&wq_dev->dev, attr);
+                        if (ret) {
+                                device_unregister(&wq_dev->dev);
+                                wq->wq_dev = NULL;
+                                return ret;
+                        }
+                }
+        }
+
+        dev_set_uevent_suppress(&wq_dev->dev, false);
+        kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD);
+        return 0;
+}
+
+/**
+ * workqueue_sysfs_unregister - undo workqueue_sysfs_register()
+ * @wq: the workqueue to unregister
+ *
+ * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister.
+ */
+static void workqueue_sysfs_unregister(struct workqueue_struct *wq)
+{
+        struct wq_device *wq_dev = wq->wq_dev;
+
+        if (!wq->wq_dev)
+                return;
+
+        wq->wq_dev = NULL;
+        device_unregister(&wq_dev->dev);
+}
+#else /* CONFIG_SYSFS */
+static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { }
+#endif /* CONFIG_SYSFS */
+
 static void __init wq_numa_init(void)
 {
         cpumask_var_t *tbl;
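On the userspace side, the attributes documented in the relocated CONFIG_SYSFS block above appear under /sys/bus/workqueue/devices/ for every workqueue created with WQ_SYSFS. A small stand-alone sketch that reads them; "writeback" is merely an example of a workqueue name that is commonly exported, and the per-cpu/unbound attribute split follows the comment above:

#include <stdio.h>

int main(void)
{
        const char *base = "/sys/bus/workqueue/devices/writeback";
        const char *attrs[] = { "per_cpu", "max_active", "pool_ids",
                                "nice", "cpumask", "numa", NULL };
        char path[256], line[256];
        FILE *f;

        for (int i = 0; attrs[i]; i++) {
                snprintf(path, sizeof(path), "%s/%s", base, attrs[i]);
                f = fopen(path, "r");
                if (!f)
                        continue;       /* unbound-only attrs are absent on per-cpu workqueues */
                if (fgets(line, sizeof(line), f))
                        printf("%-10s %s", attrs[i], line);
                fclose(f);
        }
        return 0;
}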