aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile2
-rw-r--r--kernel/audit.c8
-rw-r--r--kernel/auditsc.c12
-rw-r--r--kernel/bpf/arraymap.c2
-rw-r--r--kernel/bpf/core.c4
-rw-r--r--kernel/bpf/hashtab.c3
-rw-r--r--kernel/bpf/helpers.c55
-rw-r--r--kernel/bpf/inode.c2
-rw-r--r--kernel/bpf/stackmap.c5
-rw-r--r--kernel/bpf/syscall.c4
-rw-r--r--kernel/bpf/verifier.c943
-rw-r--r--kernel/cgroup.c140
-rw-r--r--kernel/configs/android-base.config7
-rw-r--r--kernel/configs/android-recommended.config4
-rw-r--r--kernel/configs/kvm_guest.config32
-rw-r--r--kernel/cpu.c524
-rw-r--r--kernel/cpuset.c32
-rw-r--r--kernel/events/core.c226
-rw-r--r--kernel/events/uprobes.c42
-rw-r--r--kernel/exit.c29
-rw-r--r--kernel/fork.c199
-rw-r--r--kernel/futex.c15
-rw-r--r--kernel/groups.c67
-rw-r--r--kernel/hung_task.c28
-rw-r--r--kernel/irq/affinity.c167
-rw-r--r--kernel/irq/chip.c29
-rw-r--r--kernel/irq/generic-chip.c72
-rw-r--r--kernel/irq/irqdesc.c224
-rw-r--r--kernel/irq/irqdomain.c11
-rw-r--r--kernel/irq/manage.c10
-rw-r--r--kernel/irq/msi.c26
-rw-r--r--kernel/kcov.c9
-rw-r--r--kernel/kprobes.c2
-rw-r--r--kernel/kthread.c585
-rw-r--r--kernel/livepatch/core.c19
-rw-r--r--kernel/locking/Makefile1
-rw-r--r--kernel/locking/lglock.c111
-rw-r--r--kernel/locking/lockdep_internals.h20
-rw-r--r--kernel/locking/percpu-rwsem.c228
-rw-r--r--kernel/locking/qspinlock_paravirt.h26
-rw-r--r--kernel/locking/qspinlock_stat.h4
-rw-r--r--kernel/locking/rwsem-xadd.c92
-rw-r--r--kernel/module.c13
-rw-r--r--kernel/padata.c88
-rw-r--r--kernel/panic.c47
-rw-r--r--kernel/pid_namespace.c50
-rw-r--r--kernel/power/Kconfig4
-rw-r--r--kernel/power/hibernate.c21
-rw-r--r--kernel/power/main.c1
-rw-r--r--kernel/power/power.h2
-rw-r--r--kernel/power/process.c17
-rw-r--r--kernel/power/snapshot.c22
-rw-r--r--kernel/power/suspend.c18
-rw-r--r--kernel/power/suspend_test.c4
-rw-r--r--kernel/printk/printk.c117
-rw-r--r--kernel/ptrace.c19
-rw-r--r--kernel/rcu/rcuperf.c7
-rw-r--r--kernel/rcu/rcutorture.c62
-rw-r--r--kernel/rcu/sync.c14
-rw-r--r--kernel/rcu/tiny.c2
-rw-r--r--kernel/rcu/tree.c46
-rw-r--r--kernel/rcu/tree.h1
-rw-r--r--kernel/rcu/tree_exp.h124
-rw-r--r--kernel/rcu/tree_plugin.h1
-rw-r--r--kernel/rcu/tree_trace.c7
-rw-r--r--kernel/rcu/update.c3
-rw-r--r--kernel/relay.c226
-rw-r--r--kernel/sched/auto_group.c36
-rw-r--r--kernel/sched/core.c362
-rw-r--r--kernel/sched/cpudeadline.c153
-rw-r--r--kernel/sched/cpudeadline.h3
-rw-r--r--kernel/sched/cpufreq.c2
-rw-r--r--kernel/sched/cpufreq_schedutil.c122
-rw-r--r--kernel/sched/cputime.c87
-rw-r--r--kernel/sched/deadline.c83
-rw-r--r--kernel/sched/debug.c106
-rw-r--r--kernel/sched/fair.c794
-rw-r--r--kernel/sched/idle.c13
-rw-r--r--kernel/sched/idle_task.c4
-rw-r--r--kernel/sched/rt.c5
-rw-r--r--kernel/sched/sched.h136
-rw-r--r--kernel/sched/stats.h24
-rw-r--r--kernel/sched/wait.c123
-rw-r--r--kernel/signal.c7
-rw-r--r--kernel/smp.c52
-rw-r--r--kernel/smpboot.c5
-rw-r--r--kernel/softirq.c49
-rw-r--r--kernel/stop_machine.c47
-rw-r--r--kernel/sys_ni.c5
-rw-r--r--kernel/sysctl.c23
-rw-r--r--kernel/taskstats.c6
-rw-r--r--kernel/time/alarmtimer.c2
-rw-r--r--kernel/time/clocksource.c15
-rw-r--r--kernel/time/hrtimer.c6
-rw-r--r--kernel/time/tick-sched.c7
-rw-r--r--kernel/time/time.c2
-rw-r--r--kernel/time/timekeeping.c7
-rw-r--r--kernel/time/timekeeping_debug.c2
-rw-r--r--kernel/time/timer.c76
-rw-r--r--kernel/torture.c27
-rw-r--r--kernel/trace/Kconfig40
-rw-r--r--kernel/trace/Makefile5
-rw-r--r--kernel/trace/bpf_trace.c160
-rw-r--r--kernel/trace/ftrace.c30
-rw-r--r--kernel/trace/trace.c61
-rw-r--r--kernel/trace/trace.h5
-rw-r--r--kernel/trace/trace_entries.h27
-rw-r--r--kernel/trace/trace_events_trigger.c1
-rw-r--r--kernel/trace/trace_functions_graph.c73
-rw-r--r--kernel/trace/trace_hwlat.c633
-rw-r--r--kernel/trace/trace_kprobe.c4
-rw-r--r--kernel/trace/trace_output.c66
-rw-r--r--kernel/trace/trace_probe.c30
-rw-r--r--kernel/trace/trace_probe.h11
-rw-r--r--kernel/trace/trace_syscalls.c6
-rw-r--r--kernel/trace/trace_uprobe.c8
-rw-r--r--kernel/ucount.c235
-rw-r--r--kernel/uid16.c4
-rw-r--r--kernel/up.c18
-rw-r--r--kernel/user_namespace.c99
-rw-r--r--kernel/utsname.c40
-rw-r--r--kernel/workqueue.c42
122 files changed, 6346 insertions, 2580 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index e2ec54e2b952..eb26e12c6c2a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,7 +9,7 @@ obj-y = fork.o exec_domain.o panic.o \
9 extable.o params.o \ 9 extable.o params.o \
10 kthread.o sys_ni.o nsproxy.o \ 10 kthread.o sys_ni.o nsproxy.o \
11 notifier.o ksysfs.o cred.o reboot.o \ 11 notifier.o ksysfs.o cred.o reboot.o \
12 async.o range.o smpboot.o 12 async.o range.o smpboot.o ucount.o
13 13
14obj-$(CONFIG_MULTIUSER) += groups.o 14obj-$(CONFIG_MULTIUSER) += groups.o
15 15
diff --git a/kernel/audit.c b/kernel/audit.c
index a8a91bd2b2a9..f1ca11613379 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -877,6 +877,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
877 return err; 877 return err;
878 } 878 }
879 if (s.mask & AUDIT_STATUS_PID) { 879 if (s.mask & AUDIT_STATUS_PID) {
880 /* NOTE: we are using task_tgid_vnr() below because
881 * the s.pid value is relative to the namespace
882 * of the caller; at present this doesn't matter
883 * much since you can really only run auditd
884 * from the initial pid namespace, but something
885 * to keep in mind if this changes */
880 int new_pid = s.pid; 886 int new_pid = s.pid;
881 pid_t requesting_pid = task_tgid_vnr(current); 887 pid_t requesting_pid = task_tgid_vnr(current);
882 888
@@ -1917,7 +1923,7 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
1917 " euid=%u suid=%u fsuid=%u" 1923 " euid=%u suid=%u fsuid=%u"
1918 " egid=%u sgid=%u fsgid=%u tty=%s ses=%u", 1924 " egid=%u sgid=%u fsgid=%u tty=%s ses=%u",
1919 task_ppid_nr(tsk), 1925 task_ppid_nr(tsk),
1920 task_pid_nr(tsk), 1926 task_tgid_nr(tsk),
1921 from_kuid(&init_user_ns, audit_get_loginuid(tsk)), 1927 from_kuid(&init_user_ns, audit_get_loginuid(tsk)),
1922 from_kuid(&init_user_ns, cred->uid), 1928 from_kuid(&init_user_ns, cred->uid),
1923 from_kgid(&init_user_ns, cred->gid), 1929 from_kgid(&init_user_ns, cred->gid),
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 5abf1dc1f91c..2cd5256dbff7 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -457,7 +457,7 @@ static int audit_filter_rules(struct task_struct *tsk,
457 457
458 switch (f->type) { 458 switch (f->type) {
459 case AUDIT_PID: 459 case AUDIT_PID:
460 pid = task_pid_nr(tsk); 460 pid = task_tgid_nr(tsk);
461 result = audit_comparator(pid, f->op, f->val); 461 result = audit_comparator(pid, f->op, f->val);
462 break; 462 break;
463 case AUDIT_PPID: 463 case AUDIT_PPID:
@@ -1993,7 +1993,7 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,
1993 loginuid = from_kuid(&init_user_ns, kloginuid), 1993 loginuid = from_kuid(&init_user_ns, kloginuid),
1994 tty = audit_get_tty(current); 1994 tty = audit_get_tty(current);
1995 1995
1996 audit_log_format(ab, "pid=%d uid=%u", task_pid_nr(current), uid); 1996 audit_log_format(ab, "pid=%d uid=%u", task_tgid_nr(current), uid);
1997 audit_log_task_context(ab); 1997 audit_log_task_context(ab);
1998 audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d", 1998 audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d",
1999 oldloginuid, loginuid, tty ? tty_name(tty) : "(none)", 1999 oldloginuid, loginuid, tty ? tty_name(tty) : "(none)",
@@ -2220,7 +2220,7 @@ void __audit_ptrace(struct task_struct *t)
2220{ 2220{
2221 struct audit_context *context = current->audit_context; 2221 struct audit_context *context = current->audit_context;
2222 2222
2223 context->target_pid = task_pid_nr(t); 2223 context->target_pid = task_tgid_nr(t);
2224 context->target_auid = audit_get_loginuid(t); 2224 context->target_auid = audit_get_loginuid(t);
2225 context->target_uid = task_uid(t); 2225 context->target_uid = task_uid(t);
2226 context->target_sessionid = audit_get_sessionid(t); 2226 context->target_sessionid = audit_get_sessionid(t);
@@ -2245,7 +2245,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
2245 2245
2246 if (audit_pid && t->tgid == audit_pid) { 2246 if (audit_pid && t->tgid == audit_pid) {
2247 if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) { 2247 if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) {
2248 audit_sig_pid = task_pid_nr(tsk); 2248 audit_sig_pid = task_tgid_nr(tsk);
2249 if (uid_valid(tsk->loginuid)) 2249 if (uid_valid(tsk->loginuid))
2250 audit_sig_uid = tsk->loginuid; 2250 audit_sig_uid = tsk->loginuid;
2251 else 2251 else
@@ -2345,7 +2345,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
2345void __audit_log_capset(const struct cred *new, const struct cred *old) 2345void __audit_log_capset(const struct cred *new, const struct cred *old)
2346{ 2346{
2347 struct audit_context *context = current->audit_context; 2347 struct audit_context *context = current->audit_context;
2348 context->capset.pid = task_pid_nr(current); 2348 context->capset.pid = task_tgid_nr(current);
2349 context->capset.cap.effective = new->cap_effective; 2349 context->capset.cap.effective = new->cap_effective;
2350 context->capset.cap.inheritable = new->cap_effective; 2350 context->capset.cap.inheritable = new->cap_effective;
2351 context->capset.cap.permitted = new->cap_permitted; 2351 context->capset.cap.permitted = new->cap_permitted;
@@ -2377,7 +2377,7 @@ static void audit_log_task(struct audit_buffer *ab)
2377 from_kgid(&init_user_ns, gid), 2377 from_kgid(&init_user_ns, gid),
2378 sessionid); 2378 sessionid);
2379 audit_log_task_context(ab); 2379 audit_log_task_context(ab);
2380 audit_log_format(ab, " pid=%d comm=", task_pid_nr(current)); 2380 audit_log_format(ab, " pid=%d comm=", task_tgid_nr(current));
2381 audit_log_untrustedstring(ab, get_task_comm(comm, current)); 2381 audit_log_untrustedstring(ab, get_task_comm(comm, current));
2382 audit_log_d_path_exe(ab, current->mm); 2382 audit_log_d_path_exe(ab, current->mm);
2383} 2383}
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 633a650d7aeb..a2ac051c342f 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -538,7 +538,7 @@ static int __init register_perf_event_array_map(void)
538} 538}
539late_initcall(register_perf_event_array_map); 539late_initcall(register_perf_event_array_map);
540 540
541#ifdef CONFIG_SOCK_CGROUP_DATA 541#ifdef CONFIG_CGROUPS
542static void *cgroup_fd_array_get_ptr(struct bpf_map *map, 542static void *cgroup_fd_array_get_ptr(struct bpf_map *map,
543 struct file *map_file /* not used */, 543 struct file *map_file /* not used */,
544 int fd) 544 int fd)
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 03fd23d4d587..aa6d98154106 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1018,7 +1018,7 @@ void bpf_user_rnd_init_once(void)
1018 prandom_init_once(&bpf_user_rnd_state); 1018 prandom_init_once(&bpf_user_rnd_state);
1019} 1019}
1020 1020
1021u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) 1021BPF_CALL_0(bpf_user_rnd_u32)
1022{ 1022{
1023 /* Should someone ever have the rather unwise idea to use some 1023 /* Should someone ever have the rather unwise idea to use some
1024 * of the registers passed into this function, then note that 1024 * of the registers passed into this function, then note that
@@ -1031,7 +1031,7 @@ u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
1031 1031
1032 state = &get_cpu_var(bpf_user_rnd_state); 1032 state = &get_cpu_var(bpf_user_rnd_state);
1033 res = prandom_u32_state(state); 1033 res = prandom_u32_state(state);
1034 put_cpu_var(state); 1034 put_cpu_var(bpf_user_rnd_state);
1035 1035
1036 return res; 1036 return res;
1037} 1037}
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 570eeca7bdfa..ad1bc67aff1b 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -687,7 +687,8 @@ static void delete_all_elements(struct bpf_htab *htab)
687 687
688 hlist_for_each_entry_safe(l, n, head, hash_node) { 688 hlist_for_each_entry_safe(l, n, head, hash_node) {
689 hlist_del_rcu(&l->hash_node); 689 hlist_del_rcu(&l->hash_node);
690 htab_elem_free(htab, l); 690 if (l->state != HTAB_EXTRA_ELEM_USED)
691 htab_elem_free(htab, l);
691 } 692 }
692 } 693 }
693} 694}
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 1ea3afba1a4f..39918402e6e9 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -16,6 +16,7 @@
16#include <linux/ktime.h> 16#include <linux/ktime.h>
17#include <linux/sched.h> 17#include <linux/sched.h>
18#include <linux/uidgid.h> 18#include <linux/uidgid.h>
19#include <linux/filter.h>
19 20
20/* If kernel subsystem is allowing eBPF programs to call this function, 21/* If kernel subsystem is allowing eBPF programs to call this function,
21 * inside its own verifier_ops->get_func_proto() callback it should return 22 * inside its own verifier_ops->get_func_proto() callback it should return
@@ -26,48 +27,32 @@
26 * if program is allowed to access maps, so check rcu_read_lock_held in 27 * if program is allowed to access maps, so check rcu_read_lock_held in
27 * all three functions. 28 * all three functions.
28 */ 29 */
29static u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) 30BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key)
30{ 31{
31 /* verifier checked that R1 contains a valid pointer to bpf_map
32 * and R2 points to a program stack and map->key_size bytes were
33 * initialized
34 */
35 struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
36 void *key = (void *) (unsigned long) r2;
37 void *value;
38
39 WARN_ON_ONCE(!rcu_read_lock_held()); 32 WARN_ON_ONCE(!rcu_read_lock_held());
40 33 return (unsigned long) map->ops->map_lookup_elem(map, key);
41 value = map->ops->map_lookup_elem(map, key);
42
43 /* lookup() returns either pointer to element value or NULL
44 * which is the meaning of PTR_TO_MAP_VALUE_OR_NULL type
45 */
46 return (unsigned long) value;
47} 34}
48 35
49const struct bpf_func_proto bpf_map_lookup_elem_proto = { 36const struct bpf_func_proto bpf_map_lookup_elem_proto = {
50 .func = bpf_map_lookup_elem, 37 .func = bpf_map_lookup_elem,
51 .gpl_only = false, 38 .gpl_only = false,
39 .pkt_access = true,
52 .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, 40 .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
53 .arg1_type = ARG_CONST_MAP_PTR, 41 .arg1_type = ARG_CONST_MAP_PTR,
54 .arg2_type = ARG_PTR_TO_MAP_KEY, 42 .arg2_type = ARG_PTR_TO_MAP_KEY,
55}; 43};
56 44
57static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) 45BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key,
46 void *, value, u64, flags)
58{ 47{
59 struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
60 void *key = (void *) (unsigned long) r2;
61 void *value = (void *) (unsigned long) r3;
62
63 WARN_ON_ONCE(!rcu_read_lock_held()); 48 WARN_ON_ONCE(!rcu_read_lock_held());
64 49 return map->ops->map_update_elem(map, key, value, flags);
65 return map->ops->map_update_elem(map, key, value, r4);
66} 50}
67 51
68const struct bpf_func_proto bpf_map_update_elem_proto = { 52const struct bpf_func_proto bpf_map_update_elem_proto = {
69 .func = bpf_map_update_elem, 53 .func = bpf_map_update_elem,
70 .gpl_only = false, 54 .gpl_only = false,
55 .pkt_access = true,
71 .ret_type = RET_INTEGER, 56 .ret_type = RET_INTEGER,
72 .arg1_type = ARG_CONST_MAP_PTR, 57 .arg1_type = ARG_CONST_MAP_PTR,
73 .arg2_type = ARG_PTR_TO_MAP_KEY, 58 .arg2_type = ARG_PTR_TO_MAP_KEY,
@@ -75,19 +60,16 @@ const struct bpf_func_proto bpf_map_update_elem_proto = {
75 .arg4_type = ARG_ANYTHING, 60 .arg4_type = ARG_ANYTHING,
76}; 61};
77 62
78static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) 63BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key)
79{ 64{
80 struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
81 void *key = (void *) (unsigned long) r2;
82
83 WARN_ON_ONCE(!rcu_read_lock_held()); 65 WARN_ON_ONCE(!rcu_read_lock_held());
84
85 return map->ops->map_delete_elem(map, key); 66 return map->ops->map_delete_elem(map, key);
86} 67}
87 68
88const struct bpf_func_proto bpf_map_delete_elem_proto = { 69const struct bpf_func_proto bpf_map_delete_elem_proto = {
89 .func = bpf_map_delete_elem, 70 .func = bpf_map_delete_elem,
90 .gpl_only = false, 71 .gpl_only = false,
72 .pkt_access = true,
91 .ret_type = RET_INTEGER, 73 .ret_type = RET_INTEGER,
92 .arg1_type = ARG_CONST_MAP_PTR, 74 .arg1_type = ARG_CONST_MAP_PTR,
93 .arg2_type = ARG_PTR_TO_MAP_KEY, 75 .arg2_type = ARG_PTR_TO_MAP_KEY,
@@ -99,7 +81,7 @@ const struct bpf_func_proto bpf_get_prandom_u32_proto = {
99 .ret_type = RET_INTEGER, 81 .ret_type = RET_INTEGER,
100}; 82};
101 83
102static u64 bpf_get_smp_processor_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) 84BPF_CALL_0(bpf_get_smp_processor_id)
103{ 85{
104 return smp_processor_id(); 86 return smp_processor_id();
105} 87}
@@ -110,7 +92,7 @@ const struct bpf_func_proto bpf_get_smp_processor_id_proto = {
110 .ret_type = RET_INTEGER, 92 .ret_type = RET_INTEGER,
111}; 93};
112 94
113static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) 95BPF_CALL_0(bpf_ktime_get_ns)
114{ 96{
115 /* NMI safe access to clock monotonic */ 97 /* NMI safe access to clock monotonic */
116 return ktime_get_mono_fast_ns(); 98 return ktime_get_mono_fast_ns();
@@ -122,11 +104,11 @@ const struct bpf_func_proto bpf_ktime_get_ns_proto = {
122 .ret_type = RET_INTEGER, 104 .ret_type = RET_INTEGER,
123}; 105};
124 106
125static u64 bpf_get_current_pid_tgid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) 107BPF_CALL_0(bpf_get_current_pid_tgid)
126{ 108{
127 struct task_struct *task = current; 109 struct task_struct *task = current;
128 110
129 if (!task) 111 if (unlikely(!task))
130 return -EINVAL; 112 return -EINVAL;
131 113
132 return (u64) task->tgid << 32 | task->pid; 114 return (u64) task->tgid << 32 | task->pid;
@@ -138,18 +120,18 @@ const struct bpf_func_proto bpf_get_current_pid_tgid_proto = {
138 .ret_type = RET_INTEGER, 120 .ret_type = RET_INTEGER,
139}; 121};
140 122
141static u64 bpf_get_current_uid_gid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) 123BPF_CALL_0(bpf_get_current_uid_gid)
142{ 124{
143 struct task_struct *task = current; 125 struct task_struct *task = current;
144 kuid_t uid; 126 kuid_t uid;
145 kgid_t gid; 127 kgid_t gid;
146 128
147 if (!task) 129 if (unlikely(!task))
148 return -EINVAL; 130 return -EINVAL;
149 131
150 current_uid_gid(&uid, &gid); 132 current_uid_gid(&uid, &gid);
151 return (u64) from_kgid(&init_user_ns, gid) << 32 | 133 return (u64) from_kgid(&init_user_ns, gid) << 32 |
152 from_kuid(&init_user_ns, uid); 134 from_kuid(&init_user_ns, uid);
153} 135}
154 136
155const struct bpf_func_proto bpf_get_current_uid_gid_proto = { 137const struct bpf_func_proto bpf_get_current_uid_gid_proto = {
@@ -158,10 +140,9 @@ const struct bpf_func_proto bpf_get_current_uid_gid_proto = {
158 .ret_type = RET_INTEGER, 140 .ret_type = RET_INTEGER,
159}; 141};
160 142
161static u64 bpf_get_current_comm(u64 r1, u64 size, u64 r3, u64 r4, u64 r5) 143BPF_CALL_2(bpf_get_current_comm, char *, buf, u32, size)
162{ 144{
163 struct task_struct *task = current; 145 struct task_struct *task = current;
164 char *buf = (char *) (long) r1;
165 146
166 if (unlikely(!task)) 147 if (unlikely(!task))
167 goto err_clear; 148 goto err_clear;
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 5967b870a895..1ed8473ec537 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -97,7 +97,7 @@ static struct inode *bpf_get_inode(struct super_block *sb,
97 return ERR_PTR(-ENOSPC); 97 return ERR_PTR(-ENOSPC);
98 98
99 inode->i_ino = get_next_ino(); 99 inode->i_ino = get_next_ino();
100 inode->i_atime = CURRENT_TIME; 100 inode->i_atime = current_time(inode);
101 inode->i_mtime = inode->i_atime; 101 inode->i_mtime = inode->i_atime;
102 inode->i_ctime = inode->i_atime; 102 inode->i_ctime = inode->i_atime;
103 103
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index bf4495fcd25d..732ae16d12b7 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -116,10 +116,9 @@ free_smap:
116 return ERR_PTR(err); 116 return ERR_PTR(err);
117} 117}
118 118
119u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) 119BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
120 u64, flags)
120{ 121{
121 struct pt_regs *regs = (struct pt_regs *) (long) r1;
122 struct bpf_map *map = (struct bpf_map *) (long) r2;
123 struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); 122 struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
124 struct perf_callchain_entry *trace; 123 struct perf_callchain_entry *trace;
125 struct stack_map_bucket *bucket, *new_bucket, *old_bucket; 124 struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 228f962447a5..237f3d6a7ddc 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -194,7 +194,7 @@ static int map_create(union bpf_attr *attr)
194 194
195 err = bpf_map_charge_memlock(map); 195 err = bpf_map_charge_memlock(map);
196 if (err) 196 if (err)
197 goto free_map; 197 goto free_map_nouncharge;
198 198
199 err = bpf_map_new_fd(map); 199 err = bpf_map_new_fd(map);
200 if (err < 0) 200 if (err < 0)
@@ -204,6 +204,8 @@ static int map_create(union bpf_attr *attr)
204 return err; 204 return err;
205 205
206free_map: 206free_map:
207 bpf_map_uncharge_memlock(map);
208free_map_nouncharge:
207 map->ops->map_free(map); 209 map->ops->map_free(map);
208 return err; 210 return err;
209} 211}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index daea765d72e6..6a936159c6e0 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -14,6 +14,7 @@
14#include <linux/types.h> 14#include <linux/types.h>
15#include <linux/slab.h> 15#include <linux/slab.h>
16#include <linux/bpf.h> 16#include <linux/bpf.h>
17#include <linux/bpf_verifier.h>
17#include <linux/filter.h> 18#include <linux/filter.h>
18#include <net/netlink.h> 19#include <net/netlink.h>
19#include <linux/file.h> 20#include <linux/file.h>
@@ -126,76 +127,16 @@
126 * are set to NOT_INIT to indicate that they are no longer readable. 127 * are set to NOT_INIT to indicate that they are no longer readable.
127 */ 128 */
128 129
129struct reg_state {
130 enum bpf_reg_type type;
131 union {
132 /* valid when type == CONST_IMM | PTR_TO_STACK | UNKNOWN_VALUE */
133 s64 imm;
134
135 /* valid when type == PTR_TO_PACKET* */
136 struct {
137 u32 id;
138 u16 off;
139 u16 range;
140 };
141
142 /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE |
143 * PTR_TO_MAP_VALUE_OR_NULL
144 */
145 struct bpf_map *map_ptr;
146 };
147};
148
149enum bpf_stack_slot_type {
150 STACK_INVALID, /* nothing was stored in this stack slot */
151 STACK_SPILL, /* register spilled into stack */
152 STACK_MISC /* BPF program wrote some data into this slot */
153};
154
155#define BPF_REG_SIZE 8 /* size of eBPF register in bytes */
156
157/* state of the program:
158 * type of all registers and stack info
159 */
160struct verifier_state {
161 struct reg_state regs[MAX_BPF_REG];
162 u8 stack_slot_type[MAX_BPF_STACK];
163 struct reg_state spilled_regs[MAX_BPF_STACK / BPF_REG_SIZE];
164};
165
166/* linked list of verifier states used to prune search */
167struct verifier_state_list {
168 struct verifier_state state;
169 struct verifier_state_list *next;
170};
171
172/* verifier_state + insn_idx are pushed to stack when branch is encountered */ 130/* verifier_state + insn_idx are pushed to stack when branch is encountered */
173struct verifier_stack_elem { 131struct bpf_verifier_stack_elem {
174 /* verifer state is 'st' 132 /* verifer state is 'st'
175 * before processing instruction 'insn_idx' 133 * before processing instruction 'insn_idx'
176 * and after processing instruction 'prev_insn_idx' 134 * and after processing instruction 'prev_insn_idx'
177 */ 135 */
178 struct verifier_state st; 136 struct bpf_verifier_state st;
179 int insn_idx; 137 int insn_idx;
180 int prev_insn_idx; 138 int prev_insn_idx;
181 struct verifier_stack_elem *next; 139 struct bpf_verifier_stack_elem *next;
182};
183
184#define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */
185
186/* single container for all structs
187 * one verifier_env per bpf_check() call
188 */
189struct verifier_env {
190 struct bpf_prog *prog; /* eBPF program being verified */
191 struct verifier_stack_elem *head; /* stack of verifier states to be processed */
192 int stack_size; /* number of states to be processed */
193 struct verifier_state cur_state; /* current verifier state */
194 struct verifier_state_list **explored_states; /* search pruning optimization */
195 struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */
196 u32 used_map_cnt; /* number of used maps */
197 u32 id_gen; /* used to generate unique reg IDs */
198 bool allow_ptr_leaks;
199}; 140};
200 141
201#define BPF_COMPLEXITY_LIMIT_INSNS 65536 142#define BPF_COMPLEXITY_LIMIT_INSNS 65536
@@ -204,6 +145,7 @@ struct verifier_env {
204struct bpf_call_arg_meta { 145struct bpf_call_arg_meta {
205 struct bpf_map *map_ptr; 146 struct bpf_map *map_ptr;
206 bool raw_mode; 147 bool raw_mode;
148 bool pkt_access;
207 int regno; 149 int regno;
208 int access_size; 150 int access_size;
209}; 151};
@@ -240,6 +182,7 @@ static const char * const reg_type_str[] = {
240 [CONST_PTR_TO_MAP] = "map_ptr", 182 [CONST_PTR_TO_MAP] = "map_ptr",
241 [PTR_TO_MAP_VALUE] = "map_value", 183 [PTR_TO_MAP_VALUE] = "map_value",
242 [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null", 184 [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null",
185 [PTR_TO_MAP_VALUE_ADJ] = "map_value_adj",
243 [FRAME_PTR] = "fp", 186 [FRAME_PTR] = "fp",
244 [PTR_TO_STACK] = "fp", 187 [PTR_TO_STACK] = "fp",
245 [CONST_IMM] = "imm", 188 [CONST_IMM] = "imm",
@@ -247,9 +190,9 @@ static const char * const reg_type_str[] = {
247 [PTR_TO_PACKET_END] = "pkt_end", 190 [PTR_TO_PACKET_END] = "pkt_end",
248}; 191};
249 192
250static void print_verifier_state(struct verifier_state *state) 193static void print_verifier_state(struct bpf_verifier_state *state)
251{ 194{
252 struct reg_state *reg; 195 struct bpf_reg_state *reg;
253 enum bpf_reg_type t; 196 enum bpf_reg_type t;
254 int i; 197 int i;
255 198
@@ -267,10 +210,17 @@ static void print_verifier_state(struct verifier_state *state)
267 else if (t == UNKNOWN_VALUE && reg->imm) 210 else if (t == UNKNOWN_VALUE && reg->imm)
268 verbose("%lld", reg->imm); 211 verbose("%lld", reg->imm);
269 else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE || 212 else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE ||
270 t == PTR_TO_MAP_VALUE_OR_NULL) 213 t == PTR_TO_MAP_VALUE_OR_NULL ||
214 t == PTR_TO_MAP_VALUE_ADJ)
271 verbose("(ks=%d,vs=%d)", 215 verbose("(ks=%d,vs=%d)",
272 reg->map_ptr->key_size, 216 reg->map_ptr->key_size,
273 reg->map_ptr->value_size); 217 reg->map_ptr->value_size);
218 if (reg->min_value != BPF_REGISTER_MIN_RANGE)
219 verbose(",min_value=%lld",
220 (long long)reg->min_value);
221 if (reg->max_value != BPF_REGISTER_MAX_RANGE)
222 verbose(",max_value=%llu",
223 (unsigned long long)reg->max_value);
274 } 224 }
275 for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { 225 for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
276 if (state->stack_slot_type[i] == STACK_SPILL) 226 if (state->stack_slot_type[i] == STACK_SPILL)
@@ -425,9 +375,9 @@ static void print_bpf_insn(struct bpf_insn *insn)
425 } 375 }
426} 376}
427 377
428static int pop_stack(struct verifier_env *env, int *prev_insn_idx) 378static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx)
429{ 379{
430 struct verifier_stack_elem *elem; 380 struct bpf_verifier_stack_elem *elem;
431 int insn_idx; 381 int insn_idx;
432 382
433 if (env->head == NULL) 383 if (env->head == NULL)
@@ -444,12 +394,12 @@ static int pop_stack(struct verifier_env *env, int *prev_insn_idx)
444 return insn_idx; 394 return insn_idx;
445} 395}
446 396
447static struct verifier_state *push_stack(struct verifier_env *env, int insn_idx, 397static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
448 int prev_insn_idx) 398 int insn_idx, int prev_insn_idx)
449{ 399{
450 struct verifier_stack_elem *elem; 400 struct bpf_verifier_stack_elem *elem;
451 401
452 elem = kmalloc(sizeof(struct verifier_stack_elem), GFP_KERNEL); 402 elem = kmalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL);
453 if (!elem) 403 if (!elem)
454 goto err; 404 goto err;
455 405
@@ -475,13 +425,15 @@ static const int caller_saved[CALLER_SAVED_REGS] = {
475 BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 425 BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5
476}; 426};
477 427
478static void init_reg_state(struct reg_state *regs) 428static void init_reg_state(struct bpf_reg_state *regs)
479{ 429{
480 int i; 430 int i;
481 431
482 for (i = 0; i < MAX_BPF_REG; i++) { 432 for (i = 0; i < MAX_BPF_REG; i++) {
483 regs[i].type = NOT_INIT; 433 regs[i].type = NOT_INIT;
484 regs[i].imm = 0; 434 regs[i].imm = 0;
435 regs[i].min_value = BPF_REGISTER_MIN_RANGE;
436 regs[i].max_value = BPF_REGISTER_MAX_RANGE;
485 } 437 }
486 438
487 /* frame pointer */ 439 /* frame pointer */
@@ -491,20 +443,26 @@ static void init_reg_state(struct reg_state *regs)
491 regs[BPF_REG_1].type = PTR_TO_CTX; 443 regs[BPF_REG_1].type = PTR_TO_CTX;
492} 444}
493 445
494static void mark_reg_unknown_value(struct reg_state *regs, u32 regno) 446static void mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno)
495{ 447{
496 BUG_ON(regno >= MAX_BPF_REG); 448 BUG_ON(regno >= MAX_BPF_REG);
497 regs[regno].type = UNKNOWN_VALUE; 449 regs[regno].type = UNKNOWN_VALUE;
498 regs[regno].imm = 0; 450 regs[regno].imm = 0;
499} 451}
500 452
453static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno)
454{
455 regs[regno].min_value = BPF_REGISTER_MIN_RANGE;
456 regs[regno].max_value = BPF_REGISTER_MAX_RANGE;
457}
458
501enum reg_arg_type { 459enum reg_arg_type {
502 SRC_OP, /* register is used as source operand */ 460 SRC_OP, /* register is used as source operand */
503 DST_OP, /* register is used as destination operand */ 461 DST_OP, /* register is used as destination operand */
504 DST_OP_NO_MARK /* same as above, check only, don't mark */ 462 DST_OP_NO_MARK /* same as above, check only, don't mark */
505}; 463};
506 464
507static int check_reg_arg(struct reg_state *regs, u32 regno, 465static int check_reg_arg(struct bpf_reg_state *regs, u32 regno,
508 enum reg_arg_type t) 466 enum reg_arg_type t)
509{ 467{
510 if (regno >= MAX_BPF_REG) { 468 if (regno >= MAX_BPF_REG) {
@@ -564,8 +522,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
564/* check_stack_read/write functions track spill/fill of registers, 522/* check_stack_read/write functions track spill/fill of registers,
565 * stack boundary and alignment are checked in check_mem_access() 523 * stack boundary and alignment are checked in check_mem_access()
566 */ 524 */
567static int check_stack_write(struct verifier_state *state, int off, int size, 525static int check_stack_write(struct bpf_verifier_state *state, int off,
568 int value_regno) 526 int size, int value_regno)
569{ 527{
570 int i; 528 int i;
571 /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, 529 /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0,
@@ -590,7 +548,7 @@ static int check_stack_write(struct verifier_state *state, int off, int size,
590 } else { 548 } else {
591 /* regular write of data into stack */ 549 /* regular write of data into stack */
592 state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] = 550 state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] =
593 (struct reg_state) {}; 551 (struct bpf_reg_state) {};
594 552
595 for (i = 0; i < size; i++) 553 for (i = 0; i < size; i++)
596 state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC; 554 state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC;
@@ -598,7 +556,7 @@ static int check_stack_write(struct verifier_state *state, int off, int size,
598 return 0; 556 return 0;
599} 557}
600 558
601static int check_stack_read(struct verifier_state *state, int off, int size, 559static int check_stack_read(struct bpf_verifier_state *state, int off, int size,
602 int value_regno) 560 int value_regno)
603{ 561{
604 u8 *slot_type; 562 u8 *slot_type;
@@ -639,7 +597,7 @@ static int check_stack_read(struct verifier_state *state, int off, int size,
639} 597}
640 598
641/* check read/write into map element returned by bpf_map_lookup_elem() */ 599/* check read/write into map element returned by bpf_map_lookup_elem() */
642static int check_map_access(struct verifier_env *env, u32 regno, int off, 600static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
643 int size) 601 int size)
644{ 602{
645 struct bpf_map *map = env->cur_state.regs[regno].map_ptr; 603 struct bpf_map *map = env->cur_state.regs[regno].map_ptr;
@@ -654,24 +612,31 @@ static int check_map_access(struct verifier_env *env, u32 regno, int off,
654 612
655#define MAX_PACKET_OFF 0xffff 613#define MAX_PACKET_OFF 0xffff
656 614
657static bool may_write_pkt_data(enum bpf_prog_type type) 615static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
616 const struct bpf_call_arg_meta *meta)
658{ 617{
659 switch (type) { 618 switch (env->prog->type) {
619 case BPF_PROG_TYPE_SCHED_CLS:
620 case BPF_PROG_TYPE_SCHED_ACT:
660 case BPF_PROG_TYPE_XDP: 621 case BPF_PROG_TYPE_XDP:
622 if (meta)
623 return meta->pkt_access;
624
625 env->seen_direct_write = true;
661 return true; 626 return true;
662 default: 627 default:
663 return false; 628 return false;
664 } 629 }
665} 630}
666 631
667static int check_packet_access(struct verifier_env *env, u32 regno, int off, 632static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
668 int size) 633 int size)
669{ 634{
670 struct reg_state *regs = env->cur_state.regs; 635 struct bpf_reg_state *regs = env->cur_state.regs;
671 struct reg_state *reg = &regs[regno]; 636 struct bpf_reg_state *reg = &regs[regno];
672 637
673 off += reg->off; 638 off += reg->off;
674 if (off < 0 || off + size > reg->range) { 639 if (off < 0 || size <= 0 || off + size > reg->range) {
675 verbose("invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", 640 verbose("invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n",
676 off, size, regno, reg->id, reg->off, reg->range); 641 off, size, regno, reg->id, reg->off, reg->range);
677 return -EACCES; 642 return -EACCES;
@@ -680,9 +645,13 @@ static int check_packet_access(struct verifier_env *env, u32 regno, int off,
680} 645}
681 646
682/* check access to 'struct bpf_context' fields */ 647/* check access to 'struct bpf_context' fields */
683static int check_ctx_access(struct verifier_env *env, int off, int size, 648static int check_ctx_access(struct bpf_verifier_env *env, int off, int size,
684 enum bpf_access_type t, enum bpf_reg_type *reg_type) 649 enum bpf_access_type t, enum bpf_reg_type *reg_type)
685{ 650{
651 /* for analyzer ctx accesses are already validated and converted */
652 if (env->analyzer_ops)
653 return 0;
654
686 if (env->prog->aux->ops->is_valid_access && 655 if (env->prog->aux->ops->is_valid_access &&
687 env->prog->aux->ops->is_valid_access(off, size, t, reg_type)) { 656 env->prog->aux->ops->is_valid_access(off, size, t, reg_type)) {
688 /* remember the offset of last byte accessed in ctx */ 657 /* remember the offset of last byte accessed in ctx */
@@ -695,7 +664,7 @@ static int check_ctx_access(struct verifier_env *env, int off, int size,
695 return -EACCES; 664 return -EACCES;
696} 665}
697 666
698static bool is_pointer_value(struct verifier_env *env, int regno) 667static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
699{ 668{
700 if (env->allow_ptr_leaks) 669 if (env->allow_ptr_leaks)
701 return false; 670 return false;
@@ -709,28 +678,19 @@ static bool is_pointer_value(struct verifier_env *env, int regno)
709 } 678 }
710} 679}
711 680
712static int check_ptr_alignment(struct verifier_env *env, struct reg_state *reg, 681static int check_ptr_alignment(struct bpf_verifier_env *env,
713 int off, int size) 682 struct bpf_reg_state *reg, int off, int size)
714{ 683{
715 if (reg->type != PTR_TO_PACKET) { 684 if (reg->type != PTR_TO_PACKET && reg->type != PTR_TO_MAP_VALUE_ADJ) {
716 if (off % size != 0) { 685 if (off % size != 0) {
717 verbose("misaligned access off %d size %d\n", off, size); 686 verbose("misaligned access off %d size %d\n",
687 off, size);
718 return -EACCES; 688 return -EACCES;
719 } else { 689 } else {
720 return 0; 690 return 0;
721 } 691 }
722 } 692 }
723 693
724 switch (env->prog->type) {
725 case BPF_PROG_TYPE_SCHED_CLS:
726 case BPF_PROG_TYPE_SCHED_ACT:
727 case BPF_PROG_TYPE_XDP:
728 break;
729 default:
730 verbose("verifier is misconfigured\n");
731 return -EACCES;
732 }
733
734 if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) 694 if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
735 /* misaligned access to packet is ok on x86,arm,arm64 */ 695 /* misaligned access to packet is ok on x86,arm,arm64 */
736 return 0; 696 return 0;
@@ -741,7 +701,8 @@ static int check_ptr_alignment(struct verifier_env *env, struct reg_state *reg,
741 } 701 }
742 702
743 /* skb->data is NET_IP_ALIGN-ed */ 703 /* skb->data is NET_IP_ALIGN-ed */
744 if ((NET_IP_ALIGN + reg->off + off) % size != 0) { 704 if (reg->type == PTR_TO_PACKET &&
705 (NET_IP_ALIGN + reg->off + off) % size != 0) {
745 verbose("misaligned packet access off %d+%d+%d size %d\n", 706 verbose("misaligned packet access off %d+%d+%d size %d\n",
746 NET_IP_ALIGN, reg->off, off, size); 707 NET_IP_ALIGN, reg->off, off, size);
747 return -EACCES; 708 return -EACCES;
@@ -755,12 +716,12 @@ static int check_ptr_alignment(struct verifier_env *env, struct reg_state *reg,
755 * if t==write && value_regno==-1, some unknown value is stored into memory 716 * if t==write && value_regno==-1, some unknown value is stored into memory
756 * if t==read && value_regno==-1, don't care what we read from memory 717 * if t==read && value_regno==-1, don't care what we read from memory
757 */ 718 */
758static int check_mem_access(struct verifier_env *env, u32 regno, int off, 719static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
759 int bpf_size, enum bpf_access_type t, 720 int bpf_size, enum bpf_access_type t,
760 int value_regno) 721 int value_regno)
761{ 722{
762 struct verifier_state *state = &env->cur_state; 723 struct bpf_verifier_state *state = &env->cur_state;
763 struct reg_state *reg = &state->regs[regno]; 724 struct bpf_reg_state *reg = &state->regs[regno];
764 int size, err = 0; 725 int size, err = 0;
765 726
766 if (reg->type == PTR_TO_STACK) 727 if (reg->type == PTR_TO_STACK)
@@ -774,12 +735,52 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
774 if (err) 735 if (err)
775 return err; 736 return err;
776 737
777 if (reg->type == PTR_TO_MAP_VALUE) { 738 if (reg->type == PTR_TO_MAP_VALUE ||
739 reg->type == PTR_TO_MAP_VALUE_ADJ) {
778 if (t == BPF_WRITE && value_regno >= 0 && 740 if (t == BPF_WRITE && value_regno >= 0 &&
779 is_pointer_value(env, value_regno)) { 741 is_pointer_value(env, value_regno)) {
780 verbose("R%d leaks addr into map\n", value_regno); 742 verbose("R%d leaks addr into map\n", value_regno);
781 return -EACCES; 743 return -EACCES;
782 } 744 }
745
746 /* If we adjusted the register to this map value at all then we
747 * need to change off and size to min_value and max_value
748 * respectively to make sure our theoretical access will be
749 * safe.
750 */
751 if (reg->type == PTR_TO_MAP_VALUE_ADJ) {
752 if (log_level)
753 print_verifier_state(state);
754 env->varlen_map_value_access = true;
755 /* The minimum value is only important with signed
756 * comparisons where we can't assume the floor of a
757 * value is 0. If we are using signed variables for our
758 * index'es we need to make sure that whatever we use
759 * will have a set floor within our range.
760 */
761 if (reg->min_value < 0) {
762 verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
763 regno);
764 return -EACCES;
765 }
766 err = check_map_access(env, regno, reg->min_value + off,
767 size);
768 if (err) {
769 verbose("R%d min value is outside of the array range\n",
770 regno);
771 return err;
772 }
773
774 /* If we haven't set a max value then we need to bail
775 * since we can't be sure we won't do bad things.
776 */
777 if (reg->max_value == BPF_REGISTER_MAX_RANGE) {
778 verbose("R%d unbounded memory access, make sure to bounds check any array access into a map\n",
779 regno);
780 return -EACCES;
781 }
782 off += reg->max_value;
783 }
783 err = check_map_access(env, regno, off, size); 784 err = check_map_access(env, regno, off, size);
784 if (!err && t == BPF_READ && value_regno >= 0) 785 if (!err && t == BPF_READ && value_regno >= 0)
785 mark_reg_unknown_value(state->regs, value_regno); 786 mark_reg_unknown_value(state->regs, value_regno);
@@ -795,9 +796,8 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
795 err = check_ctx_access(env, off, size, t, &reg_type); 796 err = check_ctx_access(env, off, size, t, &reg_type);
796 if (!err && t == BPF_READ && value_regno >= 0) { 797 if (!err && t == BPF_READ && value_regno >= 0) {
797 mark_reg_unknown_value(state->regs, value_regno); 798 mark_reg_unknown_value(state->regs, value_regno);
798 if (env->allow_ptr_leaks) 799 /* note that reg.[id|off|range] == 0 */
799 /* note that reg.[id|off|range] == 0 */ 800 state->regs[value_regno].type = reg_type;
800 state->regs[value_regno].type = reg_type;
801 } 801 }
802 802
803 } else if (reg->type == FRAME_PTR || reg->type == PTR_TO_STACK) { 803 } else if (reg->type == FRAME_PTR || reg->type == PTR_TO_STACK) {
@@ -817,7 +817,7 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
817 err = check_stack_read(state, off, size, value_regno); 817 err = check_stack_read(state, off, size, value_regno);
818 } 818 }
819 } else if (state->regs[regno].type == PTR_TO_PACKET) { 819 } else if (state->regs[regno].type == PTR_TO_PACKET) {
820 if (t == BPF_WRITE && !may_write_pkt_data(env->prog->type)) { 820 if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL)) {
821 verbose("cannot write into packet\n"); 821 verbose("cannot write into packet\n");
822 return -EACCES; 822 return -EACCES;
823 } 823 }
@@ -846,9 +846,9 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
846 return err; 846 return err;
847} 847}
848 848
849static int check_xadd(struct verifier_env *env, struct bpf_insn *insn) 849static int check_xadd(struct bpf_verifier_env *env, struct bpf_insn *insn)
850{ 850{
851 struct reg_state *regs = env->cur_state.regs; 851 struct bpf_reg_state *regs = env->cur_state.regs;
852 int err; 852 int err;
853 853
854 if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) || 854 if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) ||
@@ -882,12 +882,12 @@ static int check_xadd(struct verifier_env *env, struct bpf_insn *insn)
882 * bytes from that pointer, make sure that it's within stack boundary 882 * bytes from that pointer, make sure that it's within stack boundary
883 * and all elements of stack are initialized 883 * and all elements of stack are initialized
884 */ 884 */
885static int check_stack_boundary(struct verifier_env *env, int regno, 885static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
886 int access_size, bool zero_size_allowed, 886 int access_size, bool zero_size_allowed,
887 struct bpf_call_arg_meta *meta) 887 struct bpf_call_arg_meta *meta)
888{ 888{
889 struct verifier_state *state = &env->cur_state; 889 struct bpf_verifier_state *state = &env->cur_state;
890 struct reg_state *regs = state->regs; 890 struct bpf_reg_state *regs = state->regs;
891 int off, i; 891 int off, i;
892 892
893 if (regs[regno].type != PTR_TO_STACK) { 893 if (regs[regno].type != PTR_TO_STACK) {
@@ -926,18 +926,18 @@ static int check_stack_boundary(struct verifier_env *env, int regno,
926 return 0; 926 return 0;
927} 927}
928 928
929static int check_func_arg(struct verifier_env *env, u32 regno, 929static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
930 enum bpf_arg_type arg_type, 930 enum bpf_arg_type arg_type,
931 struct bpf_call_arg_meta *meta) 931 struct bpf_call_arg_meta *meta)
932{ 932{
933 struct reg_state *reg = env->cur_state.regs + regno; 933 struct bpf_reg_state *regs = env->cur_state.regs, *reg = &regs[regno];
934 enum bpf_reg_type expected_type; 934 enum bpf_reg_type expected_type, type = reg->type;
935 int err = 0; 935 int err = 0;
936 936
937 if (arg_type == ARG_DONTCARE) 937 if (arg_type == ARG_DONTCARE)
938 return 0; 938 return 0;
939 939
940 if (reg->type == NOT_INIT) { 940 if (type == NOT_INIT) {
941 verbose("R%d !read_ok\n", regno); 941 verbose("R%d !read_ok\n", regno);
942 return -EACCES; 942 return -EACCES;
943 } 943 }
@@ -950,16 +950,29 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
950 return 0; 950 return 0;
951 } 951 }
952 952
953 if (type == PTR_TO_PACKET && !may_access_direct_pkt_data(env, meta)) {
954 verbose("helper access to the packet is not allowed\n");
955 return -EACCES;
956 }
957
953 if (arg_type == ARG_PTR_TO_MAP_KEY || 958 if (arg_type == ARG_PTR_TO_MAP_KEY ||
954 arg_type == ARG_PTR_TO_MAP_VALUE) { 959 arg_type == ARG_PTR_TO_MAP_VALUE) {
955 expected_type = PTR_TO_STACK; 960 expected_type = PTR_TO_STACK;
961 if (type != PTR_TO_PACKET && type != expected_type)
962 goto err_type;
956 } else if (arg_type == ARG_CONST_STACK_SIZE || 963 } else if (arg_type == ARG_CONST_STACK_SIZE ||
957 arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) { 964 arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) {
958 expected_type = CONST_IMM; 965 expected_type = CONST_IMM;
966 if (type != expected_type)
967 goto err_type;
959 } else if (arg_type == ARG_CONST_MAP_PTR) { 968 } else if (arg_type == ARG_CONST_MAP_PTR) {
960 expected_type = CONST_PTR_TO_MAP; 969 expected_type = CONST_PTR_TO_MAP;
970 if (type != expected_type)
971 goto err_type;
961 } else if (arg_type == ARG_PTR_TO_CTX) { 972 } else if (arg_type == ARG_PTR_TO_CTX) {
962 expected_type = PTR_TO_CTX; 973 expected_type = PTR_TO_CTX;
974 if (type != expected_type)
975 goto err_type;
963 } else if (arg_type == ARG_PTR_TO_STACK || 976 } else if (arg_type == ARG_PTR_TO_STACK ||
964 arg_type == ARG_PTR_TO_RAW_STACK) { 977 arg_type == ARG_PTR_TO_RAW_STACK) {
965 expected_type = PTR_TO_STACK; 978 expected_type = PTR_TO_STACK;
@@ -967,20 +980,16 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
967 * passed in as argument, it's a CONST_IMM type. Final test 980 * passed in as argument, it's a CONST_IMM type. Final test
968 * happens during stack boundary checking. 981 * happens during stack boundary checking.
969 */ 982 */
970 if (reg->type == CONST_IMM && reg->imm == 0) 983 if (type == CONST_IMM && reg->imm == 0)
971 expected_type = CONST_IMM; 984 /* final test in check_stack_boundary() */;
985 else if (type != PTR_TO_PACKET && type != expected_type)
986 goto err_type;
972 meta->raw_mode = arg_type == ARG_PTR_TO_RAW_STACK; 987 meta->raw_mode = arg_type == ARG_PTR_TO_RAW_STACK;
973 } else { 988 } else {
974 verbose("unsupported arg_type %d\n", arg_type); 989 verbose("unsupported arg_type %d\n", arg_type);
975 return -EFAULT; 990 return -EFAULT;
976 } 991 }
977 992
978 if (reg->type != expected_type) {
979 verbose("R%d type=%s expected=%s\n", regno,
980 reg_type_str[reg->type], reg_type_str[expected_type]);
981 return -EACCES;
982 }
983
984 if (arg_type == ARG_CONST_MAP_PTR) { 993 if (arg_type == ARG_CONST_MAP_PTR) {
985 /* bpf_map_xxx(map_ptr) call: remember that map_ptr */ 994 /* bpf_map_xxx(map_ptr) call: remember that map_ptr */
986 meta->map_ptr = reg->map_ptr; 995 meta->map_ptr = reg->map_ptr;
@@ -998,8 +1007,13 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
998 verbose("invalid map_ptr to access map->key\n"); 1007 verbose("invalid map_ptr to access map->key\n");
999 return -EACCES; 1008 return -EACCES;
1000 } 1009 }
1001 err = check_stack_boundary(env, regno, meta->map_ptr->key_size, 1010 if (type == PTR_TO_PACKET)
1002 false, NULL); 1011 err = check_packet_access(env, regno, 0,
1012 meta->map_ptr->key_size);
1013 else
1014 err = check_stack_boundary(env, regno,
1015 meta->map_ptr->key_size,
1016 false, NULL);
1003 } else if (arg_type == ARG_PTR_TO_MAP_VALUE) { 1017 } else if (arg_type == ARG_PTR_TO_MAP_VALUE) {
1004 /* bpf_map_xxx(..., map_ptr, ..., value) call: 1018 /* bpf_map_xxx(..., map_ptr, ..., value) call:
1005 * check [value, value + map->value_size) validity 1019 * check [value, value + map->value_size) validity
@@ -1009,9 +1023,13 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
1009 verbose("invalid map_ptr to access map->value\n"); 1023 verbose("invalid map_ptr to access map->value\n");
1010 return -EACCES; 1024 return -EACCES;
1011 } 1025 }
1012 err = check_stack_boundary(env, regno, 1026 if (type == PTR_TO_PACKET)
1013 meta->map_ptr->value_size, 1027 err = check_packet_access(env, regno, 0,
1014 false, NULL); 1028 meta->map_ptr->value_size);
1029 else
1030 err = check_stack_boundary(env, regno,
1031 meta->map_ptr->value_size,
1032 false, NULL);
1015 } else if (arg_type == ARG_CONST_STACK_SIZE || 1033 } else if (arg_type == ARG_CONST_STACK_SIZE ||
1016 arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) { 1034 arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) {
1017 bool zero_size_allowed = (arg_type == ARG_CONST_STACK_SIZE_OR_ZERO); 1035 bool zero_size_allowed = (arg_type == ARG_CONST_STACK_SIZE_OR_ZERO);
@@ -1025,11 +1043,18 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
1025 verbose("ARG_CONST_STACK_SIZE cannot be first argument\n"); 1043 verbose("ARG_CONST_STACK_SIZE cannot be first argument\n");
1026 return -EACCES; 1044 return -EACCES;
1027 } 1045 }
1028 err = check_stack_boundary(env, regno - 1, reg->imm, 1046 if (regs[regno - 1].type == PTR_TO_PACKET)
1029 zero_size_allowed, meta); 1047 err = check_packet_access(env, regno - 1, 0, reg->imm);
1048 else
1049 err = check_stack_boundary(env, regno - 1, reg->imm,
1050 zero_size_allowed, meta);
1030 } 1051 }
1031 1052
1032 return err; 1053 return err;
1054err_type:
1055 verbose("R%d type=%s expected=%s\n", regno,
1056 reg_type_str[type], reg_type_str[expected_type]);
1057 return -EACCES;
1033} 1058}
1034 1059
1035static int check_map_func_compatibility(struct bpf_map *map, int func_id) 1060static int check_map_func_compatibility(struct bpf_map *map, int func_id)
@@ -1053,7 +1078,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
1053 goto error; 1078 goto error;
1054 break; 1079 break;
1055 case BPF_MAP_TYPE_CGROUP_ARRAY: 1080 case BPF_MAP_TYPE_CGROUP_ARRAY:
1056 if (func_id != BPF_FUNC_skb_under_cgroup) 1081 if (func_id != BPF_FUNC_skb_under_cgroup &&
1082 func_id != BPF_FUNC_current_task_under_cgroup)
1057 goto error; 1083 goto error;
1058 break; 1084 break;
1059 default: 1085 default:
@@ -1075,6 +1101,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
1075 if (map->map_type != BPF_MAP_TYPE_STACK_TRACE) 1101 if (map->map_type != BPF_MAP_TYPE_STACK_TRACE)
1076 goto error; 1102 goto error;
1077 break; 1103 break;
1104 case BPF_FUNC_current_task_under_cgroup:
1078 case BPF_FUNC_skb_under_cgroup: 1105 case BPF_FUNC_skb_under_cgroup:
1079 if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY) 1106 if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY)
1080 goto error; 1107 goto error;
@@ -1108,10 +1135,10 @@ static int check_raw_mode(const struct bpf_func_proto *fn)
1108 return count > 1 ? -EINVAL : 0; 1135 return count > 1 ? -EINVAL : 0;
1109} 1136}
1110 1137
1111static void clear_all_pkt_pointers(struct verifier_env *env) 1138static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
1112{ 1139{
1113 struct verifier_state *state = &env->cur_state; 1140 struct bpf_verifier_state *state = &env->cur_state;
1114 struct reg_state *regs = state->regs, *reg; 1141 struct bpf_reg_state *regs = state->regs, *reg;
1115 int i; 1142 int i;
1116 1143
1117 for (i = 0; i < MAX_BPF_REG; i++) 1144 for (i = 0; i < MAX_BPF_REG; i++)
@@ -1131,12 +1158,12 @@ static void clear_all_pkt_pointers(struct verifier_env *env)
1131 } 1158 }
1132} 1159}
1133 1160
1134static int check_call(struct verifier_env *env, int func_id) 1161static int check_call(struct bpf_verifier_env *env, int func_id)
1135{ 1162{
1136 struct verifier_state *state = &env->cur_state; 1163 struct bpf_verifier_state *state = &env->cur_state;
1137 const struct bpf_func_proto *fn = NULL; 1164 const struct bpf_func_proto *fn = NULL;
1138 struct reg_state *regs = state->regs; 1165 struct bpf_reg_state *regs = state->regs;
1139 struct reg_state *reg; 1166 struct bpf_reg_state *reg;
1140 struct bpf_call_arg_meta meta; 1167 struct bpf_call_arg_meta meta;
1141 bool changes_data; 1168 bool changes_data;
1142 int i, err; 1169 int i, err;
@@ -1164,6 +1191,7 @@ static int check_call(struct verifier_env *env, int func_id)
1164 changes_data = bpf_helper_changes_skb_data(fn->func); 1191 changes_data = bpf_helper_changes_skb_data(fn->func);
1165 1192
1166 memset(&meta, 0, sizeof(meta)); 1193 memset(&meta, 0, sizeof(meta));
1194 meta.pkt_access = fn->pkt_access;
1167 1195
1168 /* We only support one arg being in raw mode at the moment, which 1196 /* We only support one arg being in raw mode at the moment, which
1169 * is sufficient for the helper functions we have right now. 1197 * is sufficient for the helper functions we have right now.
@@ -1214,6 +1242,7 @@ static int check_call(struct verifier_env *env, int func_id)
1214 regs[BPF_REG_0].type = NOT_INIT; 1242 regs[BPF_REG_0].type = NOT_INIT;
1215 } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) { 1243 } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) {
1216 regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; 1244 regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
1245 regs[BPF_REG_0].max_value = regs[BPF_REG_0].min_value = 0;
1217 /* remember map_ptr, so that check_map_access() 1246 /* remember map_ptr, so that check_map_access()
1218 * can check 'value_size' boundary of memory access 1247 * can check 'value_size' boundary of memory access
1219 * to map element returned from bpf_map_lookup_elem() 1248 * to map element returned from bpf_map_lookup_elem()
@@ -1238,12 +1267,13 @@ static int check_call(struct verifier_env *env, int func_id)
1238 return 0; 1267 return 0;
1239} 1268}
1240 1269
1241static int check_packet_ptr_add(struct verifier_env *env, struct bpf_insn *insn) 1270static int check_packet_ptr_add(struct bpf_verifier_env *env,
1271 struct bpf_insn *insn)
1242{ 1272{
1243 struct reg_state *regs = env->cur_state.regs; 1273 struct bpf_reg_state *regs = env->cur_state.regs;
1244 struct reg_state *dst_reg = &regs[insn->dst_reg]; 1274 struct bpf_reg_state *dst_reg = &regs[insn->dst_reg];
1245 struct reg_state *src_reg = &regs[insn->src_reg]; 1275 struct bpf_reg_state *src_reg = &regs[insn->src_reg];
1246 struct reg_state tmp_reg; 1276 struct bpf_reg_state tmp_reg;
1247 s32 imm; 1277 s32 imm;
1248 1278
1249 if (BPF_SRC(insn->code) == BPF_K) { 1279 if (BPF_SRC(insn->code) == BPF_K) {
@@ -1311,10 +1341,10 @@ add_imm:
1311 return 0; 1341 return 0;
1312} 1342}
1313 1343
1314static int evaluate_reg_alu(struct verifier_env *env, struct bpf_insn *insn) 1344static int evaluate_reg_alu(struct bpf_verifier_env *env, struct bpf_insn *insn)
1315{ 1345{
1316 struct reg_state *regs = env->cur_state.regs; 1346 struct bpf_reg_state *regs = env->cur_state.regs;
1317 struct reg_state *dst_reg = &regs[insn->dst_reg]; 1347 struct bpf_reg_state *dst_reg = &regs[insn->dst_reg];
1318 u8 opcode = BPF_OP(insn->code); 1348 u8 opcode = BPF_OP(insn->code);
1319 s64 imm_log2; 1349 s64 imm_log2;
1320 1350
@@ -1324,7 +1354,7 @@ static int evaluate_reg_alu(struct verifier_env *env, struct bpf_insn *insn)
1324 */ 1354 */
1325 1355
1326 if (BPF_SRC(insn->code) == BPF_X) { 1356 if (BPF_SRC(insn->code) == BPF_X) {
1327 struct reg_state *src_reg = &regs[insn->src_reg]; 1357 struct bpf_reg_state *src_reg = &regs[insn->src_reg];
1328 1358
1329 if (src_reg->type == UNKNOWN_VALUE && src_reg->imm > 0 && 1359 if (src_reg->type == UNKNOWN_VALUE && src_reg->imm > 0 &&
1330 dst_reg->imm && opcode == BPF_ADD) { 1360 dst_reg->imm && opcode == BPF_ADD) {
@@ -1413,11 +1443,12 @@ static int evaluate_reg_alu(struct verifier_env *env, struct bpf_insn *insn)
1413 return 0; 1443 return 0;
1414} 1444}
1415 1445
1416static int evaluate_reg_imm_alu(struct verifier_env *env, struct bpf_insn *insn) 1446static int evaluate_reg_imm_alu(struct bpf_verifier_env *env,
1447 struct bpf_insn *insn)
1417{ 1448{
1418 struct reg_state *regs = env->cur_state.regs; 1449 struct bpf_reg_state *regs = env->cur_state.regs;
1419 struct reg_state *dst_reg = &regs[insn->dst_reg]; 1450 struct bpf_reg_state *dst_reg = &regs[insn->dst_reg];
1420 struct reg_state *src_reg = &regs[insn->src_reg]; 1451 struct bpf_reg_state *src_reg = &regs[insn->src_reg];
1421 u8 opcode = BPF_OP(insn->code); 1452 u8 opcode = BPF_OP(insn->code);
1422 1453
1423 /* dst_reg->type == CONST_IMM here, simulate execution of 'add' insn. 1454 /* dst_reg->type == CONST_IMM here, simulate execution of 'add' insn.
@@ -1433,10 +1464,134 @@ static int evaluate_reg_imm_alu(struct verifier_env *env, struct bpf_insn *insn)
1433 return 0; 1464 return 0;
1434} 1465}
1435 1466
1467static void check_reg_overflow(struct bpf_reg_state *reg)
1468{
1469 if (reg->max_value > BPF_REGISTER_MAX_RANGE)
1470 reg->max_value = BPF_REGISTER_MAX_RANGE;
1471 if (reg->min_value < BPF_REGISTER_MIN_RANGE ||
1472 reg->min_value > BPF_REGISTER_MAX_RANGE)
1473 reg->min_value = BPF_REGISTER_MIN_RANGE;
1474}
1475
1476static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
1477 struct bpf_insn *insn)
1478{
1479 struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg;
1480 s64 min_val = BPF_REGISTER_MIN_RANGE;
1481 u64 max_val = BPF_REGISTER_MAX_RANGE;
1482 bool min_set = false, max_set = false;
1483 u8 opcode = BPF_OP(insn->code);
1484
1485 dst_reg = &regs[insn->dst_reg];
1486 if (BPF_SRC(insn->code) == BPF_X) {
1487 check_reg_overflow(&regs[insn->src_reg]);
1488 min_val = regs[insn->src_reg].min_value;
1489 max_val = regs[insn->src_reg].max_value;
1490
1491 /* If the source register is a random pointer then the
1492 * min_value/max_value values represent the range of the known
1493 * accesses into that value, not the actual min/max value of the
1494 * register itself. In this case we have to reset the reg range
1495 * values so we know it is not safe to look at.
1496 */
1497 if (regs[insn->src_reg].type != CONST_IMM &&
1498 regs[insn->src_reg].type != UNKNOWN_VALUE) {
1499 min_val = BPF_REGISTER_MIN_RANGE;
1500 max_val = BPF_REGISTER_MAX_RANGE;
1501 }
1502 } else if (insn->imm < BPF_REGISTER_MAX_RANGE &&
1503 (s64)insn->imm > BPF_REGISTER_MIN_RANGE) {
1504 min_val = max_val = insn->imm;
1505 min_set = max_set = true;
1506 }
1507
1508 /* We don't know anything about what was done to this register, mark it
1509 * as unknown.
1510 */
1511 if (min_val == BPF_REGISTER_MIN_RANGE &&
1512 max_val == BPF_REGISTER_MAX_RANGE) {
1513 reset_reg_range_values(regs, insn->dst_reg);
1514 return;
1515 }
1516
1517 /* If one of our values was at the end of our ranges then we can't just
1518 * do our normal operations to the register, we need to set the values
1519 * to the min/max since they are undefined.
1520 */
1521 if (min_val == BPF_REGISTER_MIN_RANGE)
1522 dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
1523 if (max_val == BPF_REGISTER_MAX_RANGE)
1524 dst_reg->max_value = BPF_REGISTER_MAX_RANGE;
1525
1526 switch (opcode) {
1527 case BPF_ADD:
1528 if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
1529 dst_reg->min_value += min_val;
1530 if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
1531 dst_reg->max_value += max_val;
1532 break;
1533 case BPF_SUB:
1534 if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
1535 dst_reg->min_value -= min_val;
1536 if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
1537 dst_reg->max_value -= max_val;
1538 break;
1539 case BPF_MUL:
1540 if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
1541 dst_reg->min_value *= min_val;
1542 if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
1543 dst_reg->max_value *= max_val;
1544 break;
1545 case BPF_AND:
1546 /* Disallow AND'ing of negative numbers, ain't nobody got time
1547 * for that. Otherwise the minimum is 0 and the max is the max
1548 * value we could AND against.
1549 */
1550 if (min_val < 0)
1551 dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
1552 else
1553 dst_reg->min_value = 0;
1554 dst_reg->max_value = max_val;
1555 break;
1556 case BPF_LSH:
1557 /* Gotta have special overflow logic here, if we're shifting
1558 * more than MAX_RANGE then just assume we have an invalid
1559 * range.
1560 */
1561 if (min_val > ilog2(BPF_REGISTER_MAX_RANGE))
1562 dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
1563 else if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
1564 dst_reg->min_value <<= min_val;
1565
1566 if (max_val > ilog2(BPF_REGISTER_MAX_RANGE))
1567 dst_reg->max_value = BPF_REGISTER_MAX_RANGE;
1568 else if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
1569 dst_reg->max_value <<= max_val;
1570 break;
1571 case BPF_RSH:
1572 /* RSH by a negative number is undefined, and the BPF_RSH is an
1573 * unsigned shift, so make the appropriate casts.
1574 */
1575 if (min_val < 0 || dst_reg->min_value < 0)
1576 dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
1577 else
1578 dst_reg->min_value =
1579 (u64)(dst_reg->min_value) >> min_val;
1580 if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
1581 dst_reg->max_value >>= max_val;
1582 break;
1583 default:
1584 reset_reg_range_values(regs, insn->dst_reg);
1585 break;
1586 }
1587
1588 check_reg_overflow(dst_reg);
1589}
1590
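
adjust_reg_min_max_vals() above gives every register a conservative [min_value, max_value] interval and saturates the bounds at the BPF_REGISTER_MIN_RANGE/BPF_REGISTER_MAX_RANGE sentinels whenever an operand is unknown. A minimal userspace sketch of the same saturating interval arithmetic; the sentinel constants and the helper name below are illustrative stand-ins, not the kernel definitions:

#include <stdint.h>
#include <stdio.h>

#define RANGE_MIN INT64_MIN     /* stand-in for BPF_REGISTER_MIN_RANGE */
#define RANGE_MAX UINT64_MAX    /* stand-in for BPF_REGISTER_MAX_RANGE */

struct range {
    int64_t  min;
    uint64_t max;
};

/* mirrors the BPF_ADD case: move a bound only if it is not saturated */
static void range_add_const(struct range *r, int64_t imm)
{
    if (r->min != RANGE_MIN)
        r->min += imm;
    if (r->max != RANGE_MAX)
        r->max += imm;
}

int main(void)
{
    struct range r = { .min = 0, .max = 63 };   /* e.g. proven by a branch */

    range_add_const(&r, 8);                     /* BPF_ADD with imm = 8 */
    printf("min=%lld max=%llu\n",
           (long long)r.min, (unsigned long long)r.max);  /* 8 .. 71 */
    return 0;
}

The kernel version additionally throws the whole range away when both incoming bounds are saturated, which is what the reset_reg_range_values() call in the hunk above does.
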
1436/* check validity of 32-bit and 64-bit arithmetic operations */ 1591/* check validity of 32-bit and 64-bit arithmetic operations */
1437static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) 1592static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
1438{ 1593{
1439 struct reg_state *regs = env->cur_state.regs, *dst_reg; 1594 struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg;
1440 u8 opcode = BPF_OP(insn->code); 1595 u8 opcode = BPF_OP(insn->code);
1441 int err; 1596 int err;
1442 1597
@@ -1496,6 +1651,11 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
1496 if (err) 1651 if (err)
1497 return err; 1652 return err;
1498 1653
1654 /* we are setting our register to something new, we need to
1655 * reset its range values.
1656 */
1657 reset_reg_range_values(regs, insn->dst_reg);
1658
1499 if (BPF_SRC(insn->code) == BPF_X) { 1659 if (BPF_SRC(insn->code) == BPF_X) {
1500 if (BPF_CLASS(insn->code) == BPF_ALU64) { 1660 if (BPF_CLASS(insn->code) == BPF_ALU64) {
1501 /* case: R1 = R2 1661 /* case: R1 = R2
@@ -1517,6 +1677,8 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
1517 */ 1677 */
1518 regs[insn->dst_reg].type = CONST_IMM; 1678 regs[insn->dst_reg].type = CONST_IMM;
1519 regs[insn->dst_reg].imm = insn->imm; 1679 regs[insn->dst_reg].imm = insn->imm;
1680 regs[insn->dst_reg].max_value = insn->imm;
1681 regs[insn->dst_reg].min_value = insn->imm;
1520 } 1682 }
1521 1683
1522 } else if (opcode > BPF_END) { 1684 } else if (opcode > BPF_END) {
@@ -1569,6 +1731,9 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
1569 1731
1570 dst_reg = &regs[insn->dst_reg]; 1732 dst_reg = &regs[insn->dst_reg];
1571 1733
1734 /* first we want to adjust our ranges. */
1735 adjust_reg_min_max_vals(env, insn);
1736
1572 /* pattern match 'bpf_add Rx, imm' instruction */ 1737 /* pattern match 'bpf_add Rx, imm' instruction */
1573 if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 && 1738 if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 &&
1574 dst_reg->type == FRAME_PTR && BPF_SRC(insn->code) == BPF_K) { 1739 dst_reg->type == FRAME_PTR && BPF_SRC(insn->code) == BPF_K) {
@@ -1603,28 +1768,58 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
1603 return -EACCES; 1768 return -EACCES;
1604 } 1769 }
1605 1770
1606 /* mark dest operand */ 1771 /* If we did pointer math on a map value then just set it to our
1607 mark_reg_unknown_value(regs, insn->dst_reg); 1772 * PTR_TO_MAP_VALUE_ADJ type so we can deal with any stores or
1773 * loads to this register appropriately, otherwise just mark the
1774 * register as unknown.
1775 */
1776 if (env->allow_ptr_leaks &&
1777 (dst_reg->type == PTR_TO_MAP_VALUE ||
1778 dst_reg->type == PTR_TO_MAP_VALUE_ADJ))
1779 dst_reg->type = PTR_TO_MAP_VALUE_ADJ;
1780 else
1781 mark_reg_unknown_value(regs, insn->dst_reg);
1608 } 1782 }
1609 1783
1610 return 0; 1784 return 0;
1611} 1785}
1612 1786
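
The PTR_TO_MAP_VALUE_ADJ handling above, together with the range tracking, is what lets a program add a bounded variable offset to a map value pointer. A hedged restricted-C illustration; the map definition and helper declarations follow the sample-style bpf_helpers.h conventions, and the program has to be loaded with CAP_SYS_ADMIN since check_alu_op() only produces PTR_TO_MAP_VALUE_ADJ when allow_ptr_leaks is set:

struct bpf_map_def SEC("maps") blob = {
    .type        = BPF_MAP_TYPE_ARRAY,
    .key_size    = sizeof(__u32),
    .value_size  = 64,                 /* 64-byte value */
    .max_entries = 1,
};

SEC("socket")
int var_off(struct __sk_buff *skb)
{
    __u32 key = 0, idx;
    char *value;

    value = bpf_map_lookup_elem(&blob, &key);
    if (!value)
        return 0;

    idx = skb->len & 0xff;     /* unknown value, but AND bounds it to [0, 255] */
    if (idx > 63)              /* false branch narrows it to [0, 63]           */
        return 0;

    value[idx] = 1;            /* variable offset into the map value: the
                                * register becomes PTR_TO_MAP_VALUE_ADJ and the
                                * tracked range proves the store is in bounds */
    return 0;
}

Whether clang preserves exactly this shape depends on the compiler version; the point is only to show the pattern the new state tracking is designed to accept, and why the check_call() hunk above starts the map value pointer off with min_value = max_value = 0.
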
1613static void find_good_pkt_pointers(struct verifier_env *env, 1787static void find_good_pkt_pointers(struct bpf_verifier_state *state,
1614 struct reg_state *dst_reg) 1788 struct bpf_reg_state *dst_reg)
1615{ 1789{
1616 struct verifier_state *state = &env->cur_state; 1790 struct bpf_reg_state *regs = state->regs, *reg;
1617 struct reg_state *regs = state->regs, *reg;
1618 int i; 1791 int i;
1619 /* r2 = r3; 1792
1620 * r2 += 8 1793 /* LLVM can generate two kinds of checks:
1621 * if (r2 > pkt_end) goto somewhere 1794 *
1622 * r2 == dst_reg, pkt_end == src_reg, 1795 * Type 1:
1623 * r2=pkt(id=n,off=8,r=0) 1796 *
1624 * r3=pkt(id=n,off=0,r=0) 1797 * r2 = r3;
1625 * find register r3 and mark its range as r3=pkt(id=n,off=0,r=8) 1798 * r2 += 8;
1626 * so that range of bytes [r3, r3 + 8) is safe to access 1799 * if (r2 > pkt_end) goto <handle exception>
1800 * <access okay>
1801 *
1802 * Where:
1803 * r2 == dst_reg, pkt_end == src_reg
1804 * r2=pkt(id=n,off=8,r=0)
1805 * r3=pkt(id=n,off=0,r=0)
1806 *
1807 * Type 2:
1808 *
1809 * r2 = r3;
1810 * r2 += 8;
1811 * if (pkt_end >= r2) goto <access okay>
1812 * <handle exception>
1813 *
1814 * Where:
1815 * pkt_end == dst_reg, r2 == src_reg
1816 * r2=pkt(id=n,off=8,r=0)
1817 * r3=pkt(id=n,off=0,r=0)
1818 *
1819 * Find register r3 and mark its range as r3=pkt(id=n,off=0,r=8)
1820 * so that range of bytes [r3, r3 + 8) is safe to access.
1627 */ 1821 */
1822
1628 for (i = 0; i < MAX_BPF_REG; i++) 1823 for (i = 0; i < MAX_BPF_REG; i++)
1629 if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id) 1824 if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id)
1630 regs[i].range = dst_reg->off; 1825 regs[i].range = dst_reg->off;
@@ -1638,11 +1833,109 @@ static void find_good_pkt_pointers(struct verifier_env *env,
1638 } 1833 }
1639} 1834}
1640 1835
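
Both shapes in the rewritten comment come from ordinary bounds checks in restricted C. A hedged sketch in the usual tc direct-packet-access style (the __sk_buff data/data_end casts are the standard idiom, not code from this patch):

SEC("classifier")
int pkt_bounds(struct __sk_buff *skb)
{
    void *data     = (void *)(long)skb->data;
    void *data_end = (void *)(long)skb->data_end;

    /* Type 1: branch away on failure, access on the fall-through path */
    if (data + 14 > data_end)
        return 0;                   /* <handle exception> */
    /* <access okay>: bytes [data, data + 14) may be read here */

    /* Type 2: branch to the access when the check succeeds */
    if (data_end >= data + 14) {
        /* <access okay> */
    }
    return 0;                       /* <handle exception> for Type 2 */
}
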
1641static int check_cond_jmp_op(struct verifier_env *env, 1836/* Adjusts the register min/max values in the case that the dst_reg is the
1837 * variable register that we are working on, and src_reg is a constant or we're
1838 * simply doing a BPF_K check.
1839 */
1840static void reg_set_min_max(struct bpf_reg_state *true_reg,
1841 struct bpf_reg_state *false_reg, u64 val,
1842 u8 opcode)
1843{
1844 switch (opcode) {
1845 case BPF_JEQ:
1846 /* If this is false then we know nothing Jon Snow, but if it is
1847 * true then we know for sure.
1848 */
1849 true_reg->max_value = true_reg->min_value = val;
1850 break;
1851 case BPF_JNE:
1852 /* If this is true we know nothing Jon Snow, but if it is false
1853 * we know the value for sure;
1854 */
1855 false_reg->max_value = false_reg->min_value = val;
1856 break;
1857 case BPF_JGT:
1858 /* Unsigned comparison, the minimum value is 0. */
1859 false_reg->min_value = 0;
1860 case BPF_JSGT:
1861 /* If this is false then we know the maximum val is val,
1862 * otherwise we know the min val is val+1.
1863 */
1864 false_reg->max_value = val;
1865 true_reg->min_value = val + 1;
1866 break;
1867 case BPF_JGE:
1868 /* Unsigned comparison, the minimum value is 0. */
1869 false_reg->min_value = 0;
1870 case BPF_JSGE:
1871 /* If this is false then we know the maximum value is val - 1,
1872 * otherwise we know the minimum value is val.
1873 */
1874 false_reg->max_value = val - 1;
1875 true_reg->min_value = val;
1876 break;
1877 default:
1878 break;
1879 }
1880
1881 check_reg_overflow(false_reg);
1882 check_reg_overflow(true_reg);
1883}
1884
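
Worked example for the helper above: a single unsigned test against 63 leaves the same register with different bounds in the two successor states (the missing breaks after BPF_JGT and BPF_JGE are intentional fall-throughs, so the signed cases share the max/min assignments). A sketch using the kernel's field names, values filled in for BPF_JGT; check_cond_jmp_op() passes the jump-taken state as true_reg and the fall-through state as false_reg:

/* for "if (reg > 63)" */
struct bpf_reg_state true_reg = *dst_reg, false_reg = *dst_reg;

false_reg.min_value = 0;    /* unsigned compare: value cannot be negative */
false_reg.max_value = 63;   /* JGT false  =>  reg <= 63 */
true_reg.min_value  = 64;   /* JGT true   =>  reg >= 64 (val + 1) */
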
1885/* Same as above, but for the case that dst_reg is a CONST_IMM reg and src_reg
1886 * is the variable reg.
1887 */
1888static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
1889 struct bpf_reg_state *false_reg, u64 val,
1890 u8 opcode)
1891{
1892 switch (opcode) {
1893 case BPF_JEQ:
1894 /* If this is false then we know nothing Jon Snow, but if it is
1895 * true then we know for sure.
1896 */
1897 true_reg->max_value = true_reg->min_value = val;
1898 break;
1899 case BPF_JNE:
1900 /* If this is true we know nothing Jon Snow, but if it is false
1901 * we know the value for sure;
1902 */
1903 false_reg->max_value = false_reg->min_value = val;
1904 break;
1905 case BPF_JGT:
1906 /* Unsigned comparison, the minimum value is 0. */
1907 true_reg->min_value = 0;
1908 case BPF_JSGT:
1909 /*
1910 * If this is false, then the val is <= the register; if it is
1911 * true, the register is <= the val.
1912 */
1913 false_reg->min_value = val;
1914 true_reg->max_value = val - 1;
1915 break;
1916 case BPF_JGE:
1917 /* Unsigned comparison, the minimum value is 0. */
1918 true_reg->min_value = 0;
1919 case BPF_JSGE:
1920 /* If this is false then constant < register, if it is true then
1921 * the register < constant.
1922 */
1923 false_reg->min_value = val + 1;
1924 true_reg->max_value = val;
1925 break;
1926 default:
1927 break;
1928 }
1929
1930 check_reg_overflow(false_reg);
1931 check_reg_overflow(true_reg);
1932}
1933
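
reg_set_min_max_inv() handles the mirrored case where the constant ended up in dst_reg and the variable in src_reg, so the narrowing is applied to the source register instead. Whether a given compiler ever emits that operand order is its own business; assuming it does for something like the test below, the resulting bounds are:

if (64 > idx) {
    /* condition true:  idx gets max_value = 63 and min_value = 0
     * (unsigned compare) via reg_set_min_max_inv()
     */
} else {
    /* condition false: idx gets min_value = 64 */
}
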
1934static int check_cond_jmp_op(struct bpf_verifier_env *env,
1642 struct bpf_insn *insn, int *insn_idx) 1935 struct bpf_insn *insn, int *insn_idx)
1643{ 1936{
1644 struct reg_state *regs = env->cur_state.regs, *dst_reg; 1937 struct bpf_verifier_state *other_branch, *this_branch = &env->cur_state;
1645 struct verifier_state *other_branch; 1938 struct bpf_reg_state *regs = this_branch->regs, *dst_reg;
1646 u8 opcode = BPF_OP(insn->code); 1939 u8 opcode = BPF_OP(insn->code);
1647 int err; 1940 int err;
1648 1941
@@ -1704,7 +1997,24 @@ static int check_cond_jmp_op(struct verifier_env *env,
1704 if (!other_branch) 1997 if (!other_branch)
1705 return -EFAULT; 1998 return -EFAULT;
1706 1999
1707 /* detect if R == 0 where R is returned value from bpf_map_lookup_elem() */ 2000 /* detect if we are comparing against a constant value so we can adjust
2001 * our min/max values for our dst register.
2002 */
2003 if (BPF_SRC(insn->code) == BPF_X) {
2004 if (regs[insn->src_reg].type == CONST_IMM)
2005 reg_set_min_max(&other_branch->regs[insn->dst_reg],
2006 dst_reg, regs[insn->src_reg].imm,
2007 opcode);
2008 else if (dst_reg->type == CONST_IMM)
2009 reg_set_min_max_inv(&other_branch->regs[insn->src_reg],
2010 &regs[insn->src_reg], dst_reg->imm,
2011 opcode);
2012 } else {
2013 reg_set_min_max(&other_branch->regs[insn->dst_reg],
2014 dst_reg, insn->imm, opcode);
2015 }
2016
2017 /* detect if R == 0 where R is returned from bpf_map_lookup_elem() */
1708 if (BPF_SRC(insn->code) == BPF_K && 2018 if (BPF_SRC(insn->code) == BPF_K &&
1709 insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) && 2019 insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) &&
1710 dst_reg->type == PTR_TO_MAP_VALUE_OR_NULL) { 2020 dst_reg->type == PTR_TO_MAP_VALUE_OR_NULL) {
@@ -1723,13 +2033,17 @@ static int check_cond_jmp_op(struct verifier_env *env,
1723 } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && 2033 } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
1724 dst_reg->type == PTR_TO_PACKET && 2034 dst_reg->type == PTR_TO_PACKET &&
1725 regs[insn->src_reg].type == PTR_TO_PACKET_END) { 2035 regs[insn->src_reg].type == PTR_TO_PACKET_END) {
1726 find_good_pkt_pointers(env, dst_reg); 2036 find_good_pkt_pointers(this_branch, dst_reg);
2037 } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE &&
2038 dst_reg->type == PTR_TO_PACKET_END &&
2039 regs[insn->src_reg].type == PTR_TO_PACKET) {
2040 find_good_pkt_pointers(other_branch, &regs[insn->src_reg]);
1727 } else if (is_pointer_value(env, insn->dst_reg)) { 2041 } else if (is_pointer_value(env, insn->dst_reg)) {
1728 verbose("R%d pointer comparison prohibited\n", insn->dst_reg); 2042 verbose("R%d pointer comparison prohibited\n", insn->dst_reg);
1729 return -EACCES; 2043 return -EACCES;
1730 } 2044 }
1731 if (log_level) 2045 if (log_level)
1732 print_verifier_state(&env->cur_state); 2046 print_verifier_state(this_branch);
1733 return 0; 2047 return 0;
1734} 2048}
1735 2049
@@ -1742,9 +2056,9 @@ static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn)
1742} 2056}
1743 2057
1744/* verify BPF_LD_IMM64 instruction */ 2058/* verify BPF_LD_IMM64 instruction */
1745static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn) 2059static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
1746{ 2060{
1747 struct reg_state *regs = env->cur_state.regs; 2061 struct bpf_reg_state *regs = env->cur_state.regs;
1748 int err; 2062 int err;
1749 2063
1750 if (BPF_SIZE(insn->code) != BPF_DW) { 2064 if (BPF_SIZE(insn->code) != BPF_DW) {
@@ -1760,9 +2074,19 @@ static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn)
1760 if (err) 2074 if (err)
1761 return err; 2075 return err;
1762 2076
1763 if (insn->src_reg == 0) 2077 if (insn->src_reg == 0) {
1764 /* generic move 64-bit immediate into a register */ 2078 /* generic move 64-bit immediate into a register,
2079 * only analyzer needs to collect the ld_imm value.
2080 */
2081 u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
2082
2083 if (!env->analyzer_ops)
2084 return 0;
2085
2086 regs[insn->dst_reg].type = CONST_IMM;
2087 regs[insn->dst_reg].imm = imm;
1765 return 0; 2088 return 0;
2089 }
1766 2090
1767 /* replace_map_fd_with_map_ptr() should have caught bad ld_imm64 */ 2091 /* replace_map_fd_with_map_ptr() should have caught bad ld_imm64 */
1768 BUG_ON(insn->src_reg != BPF_PSEUDO_MAP_FD); 2092 BUG_ON(insn->src_reg != BPF_PSEUDO_MAP_FD);
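
For reference, BPF_LD_IMM64 spreads a 64-bit constant over two instruction slots, and the hunk above simply reassembles it for the analyzer. A standalone illustration of the same expression (the values are arbitrary):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    int32_t  lo = (int32_t)0xdeadbeef;   /* insn[0].imm     */
    int32_t  hi = 0x1;                   /* (insn + 1)->imm */
    uint64_t imm = ((uint64_t)hi << 32) | (uint32_t)lo;

    printf("0x%llx\n", (unsigned long long)imm);   /* 0x1deadbeef */
    return 0;
}
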
@@ -1799,11 +2123,11 @@ static bool may_access_skb(enum bpf_prog_type type)
1799 * Output: 2123 * Output:
1800 * R0 - 8/16/32-bit skb data converted to cpu endianness 2124 * R0 - 8/16/32-bit skb data converted to cpu endianness
1801 */ 2125 */
1802static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn) 2126static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
1803{ 2127{
1804 struct reg_state *regs = env->cur_state.regs; 2128 struct bpf_reg_state *regs = env->cur_state.regs;
1805 u8 mode = BPF_MODE(insn->code); 2129 u8 mode = BPF_MODE(insn->code);
1806 struct reg_state *reg; 2130 struct bpf_reg_state *reg;
1807 int i, err; 2131 int i, err;
1808 2132
1809 if (!may_access_skb(env->prog->type)) { 2133 if (!may_access_skb(env->prog->type)) {
@@ -1889,7 +2213,7 @@ enum {
1889 BRANCH = 2, 2213 BRANCH = 2,
1890}; 2214};
1891 2215
1892#define STATE_LIST_MARK ((struct verifier_state_list *) -1L) 2216#define STATE_LIST_MARK ((struct bpf_verifier_state_list *) -1L)
1893 2217
1894static int *insn_stack; /* stack of insns to process */ 2218static int *insn_stack; /* stack of insns to process */
1895static int cur_stack; /* current stack index */ 2219static int cur_stack; /* current stack index */
@@ -1900,7 +2224,7 @@ static int *insn_state;
1900 * w - next instruction 2224 * w - next instruction
1901 * e - edge 2225 * e - edge
1902 */ 2226 */
1903static int push_insn(int t, int w, int e, struct verifier_env *env) 2227static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
1904{ 2228{
1905 if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH)) 2229 if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH))
1906 return 0; 2230 return 0;
@@ -1941,7 +2265,7 @@ static int push_insn(int t, int w, int e, struct verifier_env *env)
1941/* non-recursive depth-first-search to detect loops in BPF program 2265/* non-recursive depth-first-search to detect loops in BPF program
1942 * loop == back-edge in directed graph 2266 * loop == back-edge in directed graph
1943 */ 2267 */
1944static int check_cfg(struct verifier_env *env) 2268static int check_cfg(struct bpf_verifier_env *env)
1945{ 2269{
1946 struct bpf_insn *insns = env->prog->insnsi; 2270 struct bpf_insn *insns = env->prog->insnsi;
1947 int insn_cnt = env->prog->len; 2271 int insn_cnt = env->prog->len;
@@ -2050,7 +2374,8 @@ err_free:
2050/* the following conditions reduce the number of explored insns 2374/* the following conditions reduce the number of explored insns
2051 * from ~140k to ~80k for ultra large programs that use a lot of ptr_to_packet 2375 * from ~140k to ~80k for ultra large programs that use a lot of ptr_to_packet
2052 */ 2376 */
2053static bool compare_ptrs_to_packet(struct reg_state *old, struct reg_state *cur) 2377static bool compare_ptrs_to_packet(struct bpf_reg_state *old,
2378 struct bpf_reg_state *cur)
2054{ 2379{
2055 if (old->id != cur->id) 2380 if (old->id != cur->id)
2056 return false; 2381 return false;
@@ -2125,9 +2450,11 @@ static bool compare_ptrs_to_packet(struct reg_state *old, struct reg_state *cur)
2125 * whereas register type in current state is meaningful, it means that 2450 * whereas register type in current state is meaningful, it means that
2126 * the current state will reach 'bpf_exit' instruction safely 2451 * the current state will reach 'bpf_exit' instruction safely
2127 */ 2452 */
2128static bool states_equal(struct verifier_state *old, struct verifier_state *cur) 2453static bool states_equal(struct bpf_verifier_env *env,
2454 struct bpf_verifier_state *old,
2455 struct bpf_verifier_state *cur)
2129{ 2456{
2130 struct reg_state *rold, *rcur; 2457 struct bpf_reg_state *rold, *rcur;
2131 int i; 2458 int i;
2132 2459
2133 for (i = 0; i < MAX_BPF_REG; i++) { 2460 for (i = 0; i < MAX_BPF_REG; i++) {
@@ -2137,6 +2464,13 @@ static bool states_equal(struct verifier_state *old, struct verifier_state *cur)
2137 if (memcmp(rold, rcur, sizeof(*rold)) == 0) 2464 if (memcmp(rold, rcur, sizeof(*rold)) == 0)
2138 continue; 2465 continue;
2139 2466
2467 /* If the ranges were not the same, but everything else was and
2468 * we didn't do a variable access into a map then we are a-ok.
2469 */
2470 if (!env->varlen_map_value_access &&
2471 rold->type == rcur->type && rold->imm == rcur->imm)
2472 continue;
2473
2140 if (rold->type == NOT_INIT || 2474 if (rold->type == NOT_INIT ||
2141 (rold->type == UNKNOWN_VALUE && rcur->type != NOT_INIT)) 2475 (rold->type == UNKNOWN_VALUE && rcur->type != NOT_INIT))
2142 continue; 2476 continue;
@@ -2167,9 +2501,9 @@ static bool states_equal(struct verifier_state *old, struct verifier_state *cur)
2167 * the same, check that stored pointers types 2501 * the same, check that stored pointers types
2168 * are the same as well. 2502 * are the same as well.
2169 * Ex: explored safe path could have stored 2503 * Ex: explored safe path could have stored
2170 * (struct reg_state) {.type = PTR_TO_STACK, .imm = -8} 2504 * (bpf_reg_state) {.type = PTR_TO_STACK, .imm = -8}
2171 * but current path has stored: 2505 * but current path has stored:
2172 * (struct reg_state) {.type = PTR_TO_STACK, .imm = -16} 2506 * (bpf_reg_state) {.type = PTR_TO_STACK, .imm = -16}
2173 * such verifier states are not equivalent. 2507 * such verifier states are not equivalent.
2174 * return false to continue verification of this path 2508 * return false to continue verification of this path
2175 */ 2509 */
@@ -2180,10 +2514,10 @@ static bool states_equal(struct verifier_state *old, struct verifier_state *cur)
2180 return true; 2514 return true;
2181} 2515}
2182 2516
2183static int is_state_visited(struct verifier_env *env, int insn_idx) 2517static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
2184{ 2518{
2185 struct verifier_state_list *new_sl; 2519 struct bpf_verifier_state_list *new_sl;
2186 struct verifier_state_list *sl; 2520 struct bpf_verifier_state_list *sl;
2187 2521
2188 sl = env->explored_states[insn_idx]; 2522 sl = env->explored_states[insn_idx];
2189 if (!sl) 2523 if (!sl)
@@ -2193,7 +2527,7 @@ static int is_state_visited(struct verifier_env *env, int insn_idx)
2193 return 0; 2527 return 0;
2194 2528
2195 while (sl != STATE_LIST_MARK) { 2529 while (sl != STATE_LIST_MARK) {
2196 if (states_equal(&sl->state, &env->cur_state)) 2530 if (states_equal(env, &sl->state, &env->cur_state))
2197 /* reached equivalent register/stack state, 2531 /* reached equivalent register/stack state,
2198 * prune the search 2532 * prune the search
2199 */ 2533 */
@@ -2207,7 +2541,7 @@ static int is_state_visited(struct verifier_env *env, int insn_idx)
2207 * it will be rejected. Since there are no loops, we won't be 2541 * it will be rejected. Since there are no loops, we won't be
2208 * seeing this 'insn_idx' instruction again on the way to bpf_exit 2542 * seeing this 'insn_idx' instruction again on the way to bpf_exit
2209 */ 2543 */
2210 new_sl = kmalloc(sizeof(struct verifier_state_list), GFP_USER); 2544 new_sl = kmalloc(sizeof(struct bpf_verifier_state_list), GFP_USER);
2211 if (!new_sl) 2545 if (!new_sl)
2212 return -ENOMEM; 2546 return -ENOMEM;
2213 2547
@@ -2218,11 +2552,20 @@ static int is_state_visited(struct verifier_env *env, int insn_idx)
2218 return 0; 2552 return 0;
2219} 2553}
2220 2554
2221static int do_check(struct verifier_env *env) 2555static int ext_analyzer_insn_hook(struct bpf_verifier_env *env,
2556 int insn_idx, int prev_insn_idx)
2557{
2558 if (!env->analyzer_ops || !env->analyzer_ops->insn_hook)
2559 return 0;
2560
2561 return env->analyzer_ops->insn_hook(env, insn_idx, prev_insn_idx);
2562}
2563
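
ext_analyzer_insn_hook() gives an external analyzer a per-instruction veto: any non-zero return aborts do_check(). A purely illustrative hook under the callback signature used above; the struct name, the limit and the priv bookkeeping are made up for the sketch, and analyzer_priv is the field set up by bpf_analyzer() further down:

struct demo_priv {
    unsigned int insn_count;
};

static int demo_insn_hook(struct bpf_verifier_env *env,
                          int insn_idx, int prev_insn_idx)
{
    struct demo_priv *priv = env->analyzer_priv;

    if (++priv->insn_count > 256)
        return -E2BIG;      /* non-zero return aborts verification */
    return 0;
}
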
2564static int do_check(struct bpf_verifier_env *env)
2222{ 2565{
2223 struct verifier_state *state = &env->cur_state; 2566 struct bpf_verifier_state *state = &env->cur_state;
2224 struct bpf_insn *insns = env->prog->insnsi; 2567 struct bpf_insn *insns = env->prog->insnsi;
2225 struct reg_state *regs = state->regs; 2568 struct bpf_reg_state *regs = state->regs;
2226 int insn_cnt = env->prog->len; 2569 int insn_cnt = env->prog->len;
2227 int insn_idx, prev_insn_idx = 0; 2570 int insn_idx, prev_insn_idx = 0;
2228 int insn_processed = 0; 2571 int insn_processed = 0;
@@ -2230,6 +2573,7 @@ static int do_check(struct verifier_env *env)
2230 2573
2231 init_reg_state(regs); 2574 init_reg_state(regs);
2232 insn_idx = 0; 2575 insn_idx = 0;
2576 env->varlen_map_value_access = false;
2233 for (;;) { 2577 for (;;) {
2234 struct bpf_insn *insn; 2578 struct bpf_insn *insn;
2235 u8 class; 2579 u8 class;
@@ -2276,13 +2620,17 @@ static int do_check(struct verifier_env *env)
2276 print_bpf_insn(insn); 2620 print_bpf_insn(insn);
2277 } 2621 }
2278 2622
2623 err = ext_analyzer_insn_hook(env, insn_idx, prev_insn_idx);
2624 if (err)
2625 return err;
2626
2279 if (class == BPF_ALU || class == BPF_ALU64) { 2627 if (class == BPF_ALU || class == BPF_ALU64) {
2280 err = check_alu_op(env, insn); 2628 err = check_alu_op(env, insn);
2281 if (err) 2629 if (err)
2282 return err; 2630 return err;
2283 2631
2284 } else if (class == BPF_LDX) { 2632 } else if (class == BPF_LDX) {
2285 enum bpf_reg_type src_reg_type; 2633 enum bpf_reg_type *prev_src_type, src_reg_type;
2286 2634
2287 /* check for reserved fields is already done */ 2635 /* check for reserved fields is already done */
2288 2636
@@ -2306,21 +2654,25 @@ static int do_check(struct verifier_env *env)
2306 if (err) 2654 if (err)
2307 return err; 2655 return err;
2308 2656
2309 if (BPF_SIZE(insn->code) != BPF_W) { 2657 reset_reg_range_values(regs, insn->dst_reg);
2658 if (BPF_SIZE(insn->code) != BPF_W &&
2659 BPF_SIZE(insn->code) != BPF_DW) {
2310 insn_idx++; 2660 insn_idx++;
2311 continue; 2661 continue;
2312 } 2662 }
2313 2663
2314 if (insn->imm == 0) { 2664 prev_src_type = &env->insn_aux_data[insn_idx].ptr_type;
2665
2666 if (*prev_src_type == NOT_INIT) {
2315 /* saw a valid insn 2667 /* saw a valid insn
2316 * dst_reg = *(u32 *)(src_reg + off) 2668 * dst_reg = *(u32 *)(src_reg + off)
2317 * use reserved 'imm' field to mark this insn 2669 * save type to validate intersecting paths
2318 */ 2670 */
2319 insn->imm = src_reg_type; 2671 *prev_src_type = src_reg_type;
2320 2672
2321 } else if (src_reg_type != insn->imm && 2673 } else if (src_reg_type != *prev_src_type &&
2322 (src_reg_type == PTR_TO_CTX || 2674 (src_reg_type == PTR_TO_CTX ||
2323 insn->imm == PTR_TO_CTX)) { 2675 *prev_src_type == PTR_TO_CTX)) {
2324 /* ABuser program is trying to use the same insn 2676 /* ABuser program is trying to use the same insn
2325 * dst_reg = *(u32*) (src_reg + off) 2677 * dst_reg = *(u32*) (src_reg + off)
2326 * with different pointer types: 2678 * with different pointer types:
@@ -2333,7 +2685,7 @@ static int do_check(struct verifier_env *env)
2333 } 2685 }
2334 2686
2335 } else if (class == BPF_STX) { 2687 } else if (class == BPF_STX) {
2336 enum bpf_reg_type dst_reg_type; 2688 enum bpf_reg_type *prev_dst_type, dst_reg_type;
2337 2689
2338 if (BPF_MODE(insn->code) == BPF_XADD) { 2690 if (BPF_MODE(insn->code) == BPF_XADD) {
2339 err = check_xadd(env, insn); 2691 err = check_xadd(env, insn);
@@ -2361,11 +2713,13 @@ static int do_check(struct verifier_env *env)
2361 if (err) 2713 if (err)
2362 return err; 2714 return err;
2363 2715
2364 if (insn->imm == 0) { 2716 prev_dst_type = &env->insn_aux_data[insn_idx].ptr_type;
2365 insn->imm = dst_reg_type; 2717
2366 } else if (dst_reg_type != insn->imm && 2718 if (*prev_dst_type == NOT_INIT) {
2719 *prev_dst_type = dst_reg_type;
2720 } else if (dst_reg_type != *prev_dst_type &&
2367 (dst_reg_type == PTR_TO_CTX || 2721 (dst_reg_type == PTR_TO_CTX ||
2368 insn->imm == PTR_TO_CTX)) { 2722 *prev_dst_type == PTR_TO_CTX)) {
2369 verbose("same insn cannot be used with different pointers\n"); 2723 verbose("same insn cannot be used with different pointers\n");
2370 return -EINVAL; 2724 return -EINVAL;
2371 } 2725 }
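
The old trick of stashing the observed pointer type in the instruction's unused imm field forced convert_ctx_accesses() to clear it again later; the per-instruction aux array replaces that. A hedged paraphrase of the bookkeeping (the real definitions live in the verifier header outside kernel/, not in this hunk):

/* one aux slot per instruction, indexed by insn_idx */
struct bpf_insn_aux_data {
    enum bpf_reg_type ptr_type;   /* pointer type seen for this ldx/stx */
};

/* usage, as in the LDX/STX branches above */
prev_dst_type = &env->insn_aux_data[insn_idx].ptr_type;
if (*prev_dst_type == NOT_INIT)
    *prev_dst_type = dst_reg_type;                /* first time through  */
else if (dst_reg_type != *prev_dst_type &&
         (dst_reg_type == PTR_TO_CTX || *prev_dst_type == PTR_TO_CTX))
    return -EINVAL;                               /* mixed pointer types */

Keeping the side table means the instruction stream is never modified during analysis, and convert_ctx_accesses() can consult ptr_type directly instead of the "clear internal mark" dance removed further down.
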
@@ -2471,6 +2825,7 @@ process_bpf_exit:
2471 verbose("invalid BPF_LD mode\n"); 2825 verbose("invalid BPF_LD mode\n");
2472 return -EINVAL; 2826 return -EINVAL;
2473 } 2827 }
2828 reset_reg_range_values(regs, insn->dst_reg);
2474 } else { 2829 } else {
2475 verbose("unknown insn class %d\n", class); 2830 verbose("unknown insn class %d\n", class);
2476 return -EINVAL; 2831 return -EINVAL;
@@ -2483,14 +2838,28 @@ process_bpf_exit:
2483 return 0; 2838 return 0;
2484} 2839}
2485 2840
2841static int check_map_prog_compatibility(struct bpf_map *map,
2842 struct bpf_prog *prog)
2843
2844{
2845 if (prog->type == BPF_PROG_TYPE_PERF_EVENT &&
2846 (map->map_type == BPF_MAP_TYPE_HASH ||
2847 map->map_type == BPF_MAP_TYPE_PERCPU_HASH) &&
2848 (map->map_flags & BPF_F_NO_PREALLOC)) {
2849 verbose("perf_event programs can only use preallocated hash map\n");
2850 return -EINVAL;
2851 }
2852 return 0;
2853}
2854
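
Rationale for the check, hedged from the surrounding context: perf_event programs can fire in NMI-like contexts where taking the hash map's element-allocation path is not safe, so only preallocated hash maps are accepted. A userspace sketch of a map the verifier will now refuse to pair with a BPF_PROG_TYPE_PERF_EVENT program; raw bpf(2) call, field names per the uapi header, error handling omitted:

#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int create_no_prealloc_hash(void)
{
    union bpf_attr attr;

    memset(&attr, 0, sizeof(attr));
    attr.map_type    = BPF_MAP_TYPE_HASH;
    attr.key_size    = sizeof(__u32);
    attr.value_size  = sizeof(__u64);
    attr.max_entries = 1024;
    attr.map_flags   = BPF_F_NO_PREALLOC;   /* the flag the check rejects */

    /* creation itself succeeds; it is the later program load that fails
     * with "perf_event programs can only use preallocated hash map"
     */
    return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
}
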
2486/* look for pseudo eBPF instructions that access map FDs and 2855/* look for pseudo eBPF instructions that access map FDs and
2487 * replace them with actual map pointers 2856 * replace them with actual map pointers
2488 */ 2857 */
2489static int replace_map_fd_with_map_ptr(struct verifier_env *env) 2858static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
2490{ 2859{
2491 struct bpf_insn *insn = env->prog->insnsi; 2860 struct bpf_insn *insn = env->prog->insnsi;
2492 int insn_cnt = env->prog->len; 2861 int insn_cnt = env->prog->len;
2493 int i, j; 2862 int i, j, err;
2494 2863
2495 for (i = 0; i < insn_cnt; i++, insn++) { 2864 for (i = 0; i < insn_cnt; i++, insn++) {
2496 if (BPF_CLASS(insn->code) == BPF_LDX && 2865 if (BPF_CLASS(insn->code) == BPF_LDX &&
@@ -2534,6 +2903,12 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env)
2534 return PTR_ERR(map); 2903 return PTR_ERR(map);
2535 } 2904 }
2536 2905
2906 err = check_map_prog_compatibility(map, env->prog);
2907 if (err) {
2908 fdput(f);
2909 return err;
2910 }
2911
2537 /* store map pointer inside BPF_LD_IMM64 instruction */ 2912 /* store map pointer inside BPF_LD_IMM64 instruction */
2538 insn[0].imm = (u32) (unsigned long) map; 2913 insn[0].imm = (u32) (unsigned long) map;
2539 insn[1].imm = ((u64) (unsigned long) map) >> 32; 2914 insn[1].imm = ((u64) (unsigned long) map) >> 32;
@@ -2577,7 +2952,7 @@ next_insn:
2577} 2952}
2578 2953
2579/* drop refcnt of maps used by the rejected program */ 2954/* drop refcnt of maps used by the rejected program */
2580static void release_maps(struct verifier_env *env) 2955static void release_maps(struct bpf_verifier_env *env)
2581{ 2956{
2582 int i; 2957 int i;
2583 2958
@@ -2586,7 +2961,7 @@ static void release_maps(struct verifier_env *env)
2586} 2961}
2587 2962
2588/* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */ 2963/* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */
2589static void convert_pseudo_ld_imm64(struct verifier_env *env) 2964static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env)
2590{ 2965{
2591 struct bpf_insn *insn = env->prog->insnsi; 2966 struct bpf_insn *insn = env->prog->insnsi;
2592 int insn_cnt = env->prog->len; 2967 int insn_cnt = env->prog->len;
@@ -2600,62 +2975,74 @@ static void convert_pseudo_ld_imm64(struct verifier_env *env)
2600/* convert load instructions that access fields of 'struct __sk_buff' 2975/* convert load instructions that access fields of 'struct __sk_buff'
2601 * into sequence of instructions that access fields of 'struct sk_buff' 2976 * into sequence of instructions that access fields of 'struct sk_buff'
2602 */ 2977 */
2603static int convert_ctx_accesses(struct verifier_env *env) 2978static int convert_ctx_accesses(struct bpf_verifier_env *env)
2604{ 2979{
2605 struct bpf_insn *insn = env->prog->insnsi; 2980 const struct bpf_verifier_ops *ops = env->prog->aux->ops;
2606 int insn_cnt = env->prog->len; 2981 const int insn_cnt = env->prog->len;
2607 struct bpf_insn insn_buf[16]; 2982 struct bpf_insn insn_buf[16], *insn;
2608 struct bpf_prog *new_prog; 2983 struct bpf_prog *new_prog;
2609 enum bpf_access_type type; 2984 enum bpf_access_type type;
2610 int i; 2985 int i, cnt, delta = 0;
2611 2986
2612 if (!env->prog->aux->ops->convert_ctx_access) 2987 if (ops->gen_prologue) {
2988 cnt = ops->gen_prologue(insn_buf, env->seen_direct_write,
2989 env->prog);
2990 if (cnt >= ARRAY_SIZE(insn_buf)) {
2991 verbose("bpf verifier is misconfigured\n");
2992 return -EINVAL;
2993 } else if (cnt) {
2994 new_prog = bpf_patch_insn_single(env->prog, 0,
2995 insn_buf, cnt);
2996 if (!new_prog)
2997 return -ENOMEM;
2998 env->prog = new_prog;
2999 delta += cnt - 1;
3000 }
3001 }
3002
3003 if (!ops->convert_ctx_access)
2613 return 0; 3004 return 0;
2614 3005
2615 for (i = 0; i < insn_cnt; i++, insn++) { 3006 insn = env->prog->insnsi + delta;
2616 u32 insn_delta, cnt;
2617 3007
2618 if (insn->code == (BPF_LDX | BPF_MEM | BPF_W)) 3008 for (i = 0; i < insn_cnt; i++, insn++) {
3009 if (insn->code == (BPF_LDX | BPF_MEM | BPF_W) ||
3010 insn->code == (BPF_LDX | BPF_MEM | BPF_DW))
2619 type = BPF_READ; 3011 type = BPF_READ;
2620 else if (insn->code == (BPF_STX | BPF_MEM | BPF_W)) 3012 else if (insn->code == (BPF_STX | BPF_MEM | BPF_W) ||
3013 insn->code == (BPF_STX | BPF_MEM | BPF_DW))
2621 type = BPF_WRITE; 3014 type = BPF_WRITE;
2622 else 3015 else
2623 continue; 3016 continue;
2624 3017
2625 if (insn->imm != PTR_TO_CTX) { 3018 if (env->insn_aux_data[i].ptr_type != PTR_TO_CTX)
2626 /* clear internal mark */
2627 insn->imm = 0;
2628 continue; 3019 continue;
2629 }
2630 3020
2631 cnt = env->prog->aux->ops-> 3021 cnt = ops->convert_ctx_access(type, insn->dst_reg, insn->src_reg,
2632 convert_ctx_access(type, insn->dst_reg, insn->src_reg, 3022 insn->off, insn_buf, env->prog);
2633 insn->off, insn_buf, env->prog);
2634 if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { 3023 if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
2635 verbose("bpf verifier is misconfigured\n"); 3024 verbose("bpf verifier is misconfigured\n");
2636 return -EINVAL; 3025 return -EINVAL;
2637 } 3026 }
2638 3027
2639 new_prog = bpf_patch_insn_single(env->prog, i, insn_buf, cnt); 3028 new_prog = bpf_patch_insn_single(env->prog, i + delta, insn_buf,
3029 cnt);
2640 if (!new_prog) 3030 if (!new_prog)
2641 return -ENOMEM; 3031 return -ENOMEM;
2642 3032
2643 insn_delta = cnt - 1; 3033 delta += cnt - 1;
2644 3034
2645 /* keep walking new program and skip insns we just inserted */ 3035 /* keep walking new program and skip insns we just inserted */
2646 env->prog = new_prog; 3036 env->prog = new_prog;
2647 insn = new_prog->insnsi + i + insn_delta; 3037 insn = new_prog->insnsi + i + delta;
2648
2649 insn_cnt += insn_delta;
2650 i += insn_delta;
2651 } 3038 }
2652 3039
2653 return 0; 3040 return 0;
2654} 3041}
2655 3042
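
gen_prologue lets a program type prepend instructions ahead of the first real one; seen_direct_write tells it whether the program writes packet data, so the prologue can be skipped when it is not needed. A deliberately generic sketch of such a callback; the instructions emitted here are placeholders built with the BPF_MOV64_* macros from linux/filter.h, not what any in-tree program type actually generates:

static int demo_gen_prologue(struct bpf_insn *insn_buf, bool direct_write,
                             const struct bpf_prog *prog)
{
    struct bpf_insn *insn = insn_buf;

    if (!direct_write)
        return 0;                    /* nothing to prepend */

    /* placeholder: park the ctx pointer (R1) in a callee-saved register,
     * do some hypothetical preparation, then restore R1 for the program
     */
    *insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1);
    *insn++ = BPF_MOV64_IMM(BPF_REG_2, 0);
    *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6);

    return insn - insn_buf;          /* must stay below ARRAY_SIZE(insn_buf) */
}

convert_ctx_accesses() patches the prologue in with bpf_patch_insn_single() at offset 0 and folds its length into delta, so the later per-instruction rewrites still land on the right instructions.
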
2656static void free_states(struct verifier_env *env) 3043static void free_states(struct bpf_verifier_env *env)
2657{ 3044{
2658 struct verifier_state_list *sl, *sln; 3045 struct bpf_verifier_state_list *sl, *sln;
2659 int i; 3046 int i;
2660 3047
2661 if (!env->explored_states) 3048 if (!env->explored_states)
@@ -2678,19 +3065,24 @@ static void free_states(struct verifier_env *env)
2678int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) 3065int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
2679{ 3066{
2680 char __user *log_ubuf = NULL; 3067 char __user *log_ubuf = NULL;
2681 struct verifier_env *env; 3068 struct bpf_verifier_env *env;
2682 int ret = -EINVAL; 3069 int ret = -EINVAL;
2683 3070
2684 if ((*prog)->len <= 0 || (*prog)->len > BPF_MAXINSNS) 3071 if ((*prog)->len <= 0 || (*prog)->len > BPF_MAXINSNS)
2685 return -E2BIG; 3072 return -E2BIG;
2686 3073
2687 /* 'struct verifier_env' can be global, but since it's not small, 3074 /* 'struct bpf_verifier_env' can be global, but since it's not small,
2688 * allocate/free it every time bpf_check() is called 3075 * allocate/free it every time bpf_check() is called
2689 */ 3076 */
2690 env = kzalloc(sizeof(struct verifier_env), GFP_KERNEL); 3077 env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL);
2691 if (!env) 3078 if (!env)
2692 return -ENOMEM; 3079 return -ENOMEM;
2693 3080
3081 env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) *
3082 (*prog)->len);
3083 ret = -ENOMEM;
3084 if (!env->insn_aux_data)
3085 goto err_free_env;
2694 env->prog = *prog; 3086 env->prog = *prog;
2695 3087
2696 /* grab the mutex to protect few globals used by verifier */ 3088 /* grab the mutex to protect few globals used by verifier */
@@ -2709,12 +3101,12 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
2709 /* log_* values have to be sane */ 3101 /* log_* values have to be sane */
2710 if (log_size < 128 || log_size > UINT_MAX >> 8 || 3102 if (log_size < 128 || log_size > UINT_MAX >> 8 ||
2711 log_level == 0 || log_ubuf == NULL) 3103 log_level == 0 || log_ubuf == NULL)
2712 goto free_env; 3104 goto err_unlock;
2713 3105
2714 ret = -ENOMEM; 3106 ret = -ENOMEM;
2715 log_buf = vmalloc(log_size); 3107 log_buf = vmalloc(log_size);
2716 if (!log_buf) 3108 if (!log_buf)
2717 goto free_env; 3109 goto err_unlock;
2718 } else { 3110 } else {
2719 log_level = 0; 3111 log_level = 0;
2720 } 3112 }
@@ -2724,7 +3116,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
2724 goto skip_full_check; 3116 goto skip_full_check;
2725 3117
2726 env->explored_states = kcalloc(env->prog->len, 3118 env->explored_states = kcalloc(env->prog->len,
2727 sizeof(struct verifier_state_list *), 3119 sizeof(struct bpf_verifier_state_list *),
2728 GFP_USER); 3120 GFP_USER);
2729 ret = -ENOMEM; 3121 ret = -ENOMEM;
2730 if (!env->explored_states) 3122 if (!env->explored_states)
@@ -2783,14 +3175,67 @@ skip_full_check:
2783free_log_buf: 3175free_log_buf:
2784 if (log_level) 3176 if (log_level)
2785 vfree(log_buf); 3177 vfree(log_buf);
2786free_env:
2787 if (!env->prog->aux->used_maps) 3178 if (!env->prog->aux->used_maps)
2788 /* if we didn't copy map pointers into bpf_prog_info, release 3179 /* if we didn't copy map pointers into bpf_prog_info, release
2789 * them now. Otherwise free_bpf_prog_info() will release them. 3180 * them now. Otherwise free_bpf_prog_info() will release them.
2790 */ 3181 */
2791 release_maps(env); 3182 release_maps(env);
2792 *prog = env->prog; 3183 *prog = env->prog;
3184err_unlock:
3185 mutex_unlock(&bpf_verifier_lock);
3186 vfree(env->insn_aux_data);
3187err_free_env:
2793 kfree(env); 3188 kfree(env);
3189 return ret;
3190}
3191
3192int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
3193 void *priv)
3194{
3195 struct bpf_verifier_env *env;
3196 int ret;
3197
3198 env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL);
3199 if (!env)
3200 return -ENOMEM;
3201
3202 env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) *
3203 prog->len);
3204 ret = -ENOMEM;
3205 if (!env->insn_aux_data)
3206 goto err_free_env;
3207 env->prog = prog;
3208 env->analyzer_ops = ops;
3209 env->analyzer_priv = priv;
3210
3211 /* grab the mutex to protect few globals used by verifier */
3212 mutex_lock(&bpf_verifier_lock);
3213
3214 log_level = 0;
3215
3216 env->explored_states = kcalloc(env->prog->len,
3217 sizeof(struct bpf_verifier_state_list *),
3218 GFP_KERNEL);
3219 ret = -ENOMEM;
3220 if (!env->explored_states)
3221 goto skip_full_check;
3222
3223 ret = check_cfg(env);
3224 if (ret < 0)
3225 goto skip_full_check;
3226
3227 env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
3228
3229 ret = do_check(env);
3230
3231skip_full_check:
3232 while (pop_stack(env, NULL) >= 0);
3233 free_states(env);
3234
2794 mutex_unlock(&bpf_verifier_lock); 3235 mutex_unlock(&bpf_verifier_lock);
3236 vfree(env->insn_aux_data);
3237err_free_env:
3238 kfree(env);
2795 return ret; 3239 return ret;
2796} 3240}
3241EXPORT_SYMBOL_GPL(bpf_analyzer);
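
bpf_analyzer() runs the same CFG check and do_check() walk on behalf of an external user, for example a driver wanting to pre-validate a program for hardware offload, feeding every instruction to the registered insn_hook. A hedged caller sketch that reuses the demo_insn_hook/demo_priv names from the note after ext_analyzer_insn_hook() above:

static const struct bpf_ext_analyzer_ops demo_analyzer_ops = {
    .insn_hook = demo_insn_hook,
};

static int demo_offload_check(struct bpf_prog *prog)
{
    struct demo_priv priv = { .insn_count = 0 };
    int err;

    err = bpf_analyzer(prog, &demo_analyzer_ops, &priv);
    if (err)
        return err;                 /* program not acceptable */

    pr_debug("analyzed %u insns\n", priv.insn_count);
    return 0;
}
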
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index d1c51b7f5221..85bc9beb046d 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -64,6 +64,9 @@
64#include <linux/file.h> 64#include <linux/file.h>
65#include <net/sock.h> 65#include <net/sock.h>
66 66
67#define CREATE_TRACE_POINTS
68#include <trace/events/cgroup.h>
69
67/* 70/*
68 * pidlists linger the following amount before being destroyed. The goal 71 * pidlists linger the following amount before being destroyed. The goal
69 * is avoiding frequent destruction in the middle of consecutive read calls 72 * is avoiding frequent destruction in the middle of consecutive read calls
@@ -1176,6 +1179,8 @@ static void cgroup_destroy_root(struct cgroup_root *root)
1176 struct cgroup *cgrp = &root->cgrp; 1179 struct cgroup *cgrp = &root->cgrp;
1177 struct cgrp_cset_link *link, *tmp_link; 1180 struct cgrp_cset_link *link, *tmp_link;
1178 1181
1182 trace_cgroup_destroy_root(root);
1183
1179 cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); 1184 cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
1180 1185
1181 BUG_ON(atomic_read(&root->nr_cgrps)); 1186 BUG_ON(atomic_read(&root->nr_cgrps));
@@ -1874,6 +1879,9 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1874 strcpy(root->release_agent_path, opts.release_agent); 1879 strcpy(root->release_agent_path, opts.release_agent);
1875 spin_unlock(&release_agent_path_lock); 1880 spin_unlock(&release_agent_path_lock);
1876 } 1881 }
1882
1883 trace_cgroup_remount(root);
1884
1877 out_unlock: 1885 out_unlock:
1878 kfree(opts.release_agent); 1886 kfree(opts.release_agent);
1879 kfree(opts.name); 1887 kfree(opts.name);
@@ -2031,6 +2039,8 @@ static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
2031 if (ret) 2039 if (ret)
2032 goto destroy_root; 2040 goto destroy_root;
2033 2041
2042 trace_cgroup_setup_root(root);
2043
2034 /* 2044 /*
2035 * There must be no failure case after here, since rebinding takes 2045 * There must be no failure case after here, since rebinding takes
2036 * care of subsystems' refcounts, which are explicitly dropped in 2046 * care of subsystems' refcounts, which are explicitly dropped in
@@ -2315,22 +2325,18 @@ static struct file_system_type cgroup2_fs_type = {
2315 .fs_flags = FS_USERNS_MOUNT, 2325 .fs_flags = FS_USERNS_MOUNT,
2316}; 2326};
2317 2327
2318static char *cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, 2328static int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
2319 struct cgroup_namespace *ns) 2329 struct cgroup_namespace *ns)
2320{ 2330{
2321 struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root); 2331 struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root);
2322 int ret;
2323 2332
2324 ret = kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen); 2333 return kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen);
2325 if (ret < 0 || ret >= buflen)
2326 return NULL;
2327 return buf;
2328} 2334}
2329 2335
2330char *cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, 2336int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
2331 struct cgroup_namespace *ns) 2337 struct cgroup_namespace *ns)
2332{ 2338{
2333 char *ret; 2339 int ret;
2334 2340
2335 mutex_lock(&cgroup_mutex); 2341 mutex_lock(&cgroup_mutex);
2336 spin_lock_irq(&css_set_lock); 2342 spin_lock_irq(&css_set_lock);
@@ -2357,12 +2363,12 @@ EXPORT_SYMBOL_GPL(cgroup_path_ns);
2357 * 2363 *
2358 * Return value is the same as kernfs_path(). 2364 * Return value is the same as kernfs_path().
2359 */ 2365 */
2360char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) 2366int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
2361{ 2367{
2362 struct cgroup_root *root; 2368 struct cgroup_root *root;
2363 struct cgroup *cgrp; 2369 struct cgroup *cgrp;
2364 int hierarchy_id = 1; 2370 int hierarchy_id = 1;
2365 char *path = NULL; 2371 int ret;
2366 2372
2367 mutex_lock(&cgroup_mutex); 2373 mutex_lock(&cgroup_mutex);
2368 spin_lock_irq(&css_set_lock); 2374 spin_lock_irq(&css_set_lock);
@@ -2371,16 +2377,15 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
2371 2377
2372 if (root) { 2378 if (root) {
2373 cgrp = task_cgroup_from_root(task, root); 2379 cgrp = task_cgroup_from_root(task, root);
2374 path = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns); 2380 ret = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns);
2375 } else { 2381 } else {
2376 /* if no hierarchy exists, everyone is in "/" */ 2382 /* if no hierarchy exists, everyone is in "/" */
2377 if (strlcpy(buf, "/", buflen) < buflen) 2383 ret = strlcpy(buf, "/", buflen);
2378 path = buf;
2379 } 2384 }
2380 2385
2381 spin_unlock_irq(&css_set_lock); 2386 spin_unlock_irq(&css_set_lock);
2382 mutex_unlock(&cgroup_mutex); 2387 mutex_unlock(&cgroup_mutex);
2383 return path; 2388 return ret;
2384} 2389}
2385EXPORT_SYMBOL_GPL(task_cgroup_path); 2390EXPORT_SYMBOL_GPL(task_cgroup_path);
2386 2391
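
With cgroup_path_ns() and task_cgroup_path() now returning a length (or -errno) in kernfs_path_from_node() style instead of a char *, callers check for error and truncation themselves, as the cgroup_release_agent() hunk below also does. A minimal caller sketch; the surrounding function and the task pointer are assumed:

char *buf;
int ret;

buf = kmalloc(PATH_MAX, GFP_KERNEL);
if (!buf)
    return -ENOMEM;

ret = task_cgroup_path(task, buf, PATH_MAX);
if (ret < 0 || ret >= PATH_MAX) {       /* error or truncated path */
    kfree(buf);
    return -ENAMETOOLONG;
}
pr_info("task cgroup: %s\n", buf);
kfree(buf);
return 0;
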
@@ -2830,6 +2835,10 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
2830 ret = cgroup_migrate(leader, threadgroup, dst_cgrp->root); 2835 ret = cgroup_migrate(leader, threadgroup, dst_cgrp->root);
2831 2836
2832 cgroup_migrate_finish(&preloaded_csets); 2837 cgroup_migrate_finish(&preloaded_csets);
2838
2839 if (!ret)
2840 trace_cgroup_attach_task(dst_cgrp, leader, threadgroup);
2841
2833 return ret; 2842 return ret;
2834} 2843}
2835 2844
@@ -3446,9 +3455,28 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
3446 * Except for the root, subtree_control must be zero for a cgroup 3455 * Except for the root, subtree_control must be zero for a cgroup
3447 * with tasks so that child cgroups don't compete against tasks. 3456 * with tasks so that child cgroups don't compete against tasks.
3448 */ 3457 */
3449 if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) { 3458 if (enable && cgroup_parent(cgrp)) {
3450 ret = -EBUSY; 3459 struct cgrp_cset_link *link;
3451 goto out_unlock; 3460
3461 /*
3462 * Because namespaces pin csets too, @cgrp->cset_links
3463 * might not be empty even when @cgrp is empty. Walk and
3464 * verify each cset.
3465 */
3466 spin_lock_irq(&css_set_lock);
3467
3468 ret = 0;
3469 list_for_each_entry(link, &cgrp->cset_links, cset_link) {
3470 if (css_set_populated(link->cset)) {
3471 ret = -EBUSY;
3472 break;
3473 }
3474 }
3475
3476 spin_unlock_irq(&css_set_lock);
3477
3478 if (ret)
3479 goto out_unlock;
3452 } 3480 }
3453 3481
3454 /* save and update control masks and prepare csses */ 3482 /* save and update control masks and prepare csses */
@@ -3592,6 +3620,8 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
3592 mutex_lock(&cgroup_mutex); 3620 mutex_lock(&cgroup_mutex);
3593 3621
3594 ret = kernfs_rename(kn, new_parent, new_name_str); 3622 ret = kernfs_rename(kn, new_parent, new_name_str);
3623 if (!ret)
3624 trace_cgroup_rename(cgrp);
3595 3625
3596 mutex_unlock(&cgroup_mutex); 3626 mutex_unlock(&cgroup_mutex);
3597 3627
@@ -3899,7 +3929,9 @@ void cgroup_file_notify(struct cgroup_file *cfile)
3899 * cgroup_task_count - count the number of tasks in a cgroup. 3929 * cgroup_task_count - count the number of tasks in a cgroup.
3900 * @cgrp: the cgroup in question 3930 * @cgrp: the cgroup in question
3901 * 3931 *
3902 * Return the number of tasks in the cgroup. 3932 * Return the number of tasks in the cgroup. The returned number can be
3933 * higher than the actual number of tasks due to css_set references from
3934 * namespace roots and temporary usages.
3903 */ 3935 */
3904static int cgroup_task_count(const struct cgroup *cgrp) 3936static int cgroup_task_count(const struct cgroup *cgrp)
3905{ 3937{
@@ -4360,6 +4392,8 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
4360 4392
4361 if (task) { 4393 if (task) {
4362 ret = cgroup_migrate(task, false, to->root); 4394 ret = cgroup_migrate(task, false, to->root);
4395 if (!ret)
4396 trace_cgroup_transfer_tasks(to, task, false);
4363 put_task_struct(task); 4397 put_task_struct(task);
4364 } 4398 }
4365 } while (task && !ret); 4399 } while (task && !ret);
@@ -5025,6 +5059,8 @@ static void css_release_work_fn(struct work_struct *work)
5025 ss->css_released(css); 5059 ss->css_released(css);
5026 } else { 5060 } else {
5027 /* cgroup release path */ 5061 /* cgroup release path */
5062 trace_cgroup_release(cgrp);
5063
5028 cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); 5064 cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
5029 cgrp->id = -1; 5065 cgrp->id = -1;
5030 5066
@@ -5311,6 +5347,8 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
5311 if (ret) 5347 if (ret)
5312 goto out_destroy; 5348 goto out_destroy;
5313 5349
5350 trace_cgroup_mkdir(cgrp);
5351
5314 /* let's create and online css's */ 5352 /* let's create and online css's */
5315 kernfs_activate(kn); 5353 kernfs_activate(kn);
5316 5354
@@ -5486,6 +5524,9 @@ static int cgroup_rmdir(struct kernfs_node *kn)
5486 5524
5487 ret = cgroup_destroy_locked(cgrp); 5525 ret = cgroup_destroy_locked(cgrp);
5488 5526
5527 if (!ret)
5528 trace_cgroup_rmdir(cgrp);
5529
5489 cgroup_kn_unlock(kn); 5530 cgroup_kn_unlock(kn);
5490 return ret; 5531 return ret;
5491} 5532}
@@ -5606,6 +5647,12 @@ int __init cgroup_init(void)
5606 BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); 5647 BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
5607 BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); 5648 BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
5608 5649
5650 /*
5651 * The latency of the synchronize_sched() is too high for cgroups,
5652 * avoid it at the cost of forcing all readers into the slow path.
5653 */
5654 rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss);
5655
5609 get_user_ns(init_cgroup_ns.user_ns); 5656 get_user_ns(init_cgroup_ns.user_ns);
5610 5657
5611 mutex_lock(&cgroup_mutex); 5658 mutex_lock(&cgroup_mutex);
@@ -5716,7 +5763,7 @@ core_initcall(cgroup_wq_init);
5716int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, 5763int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
5717 struct pid *pid, struct task_struct *tsk) 5764 struct pid *pid, struct task_struct *tsk)
5718{ 5765{
5719 char *buf, *path; 5766 char *buf;
5720 int retval; 5767 int retval;
5721 struct cgroup_root *root; 5768 struct cgroup_root *root;
5722 5769
@@ -5759,18 +5806,18 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
5759 * " (deleted)" is appended to the cgroup path. 5806 * " (deleted)" is appended to the cgroup path.
5760 */ 5807 */
5761 if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) { 5808 if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
5762 path = cgroup_path_ns_locked(cgrp, buf, PATH_MAX, 5809 retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX,
5763 current->nsproxy->cgroup_ns); 5810 current->nsproxy->cgroup_ns);
5764 if (!path) { 5811 if (retval >= PATH_MAX)
5765 retval = -ENAMETOOLONG; 5812 retval = -ENAMETOOLONG;
5813 if (retval < 0)
5766 goto out_unlock; 5814 goto out_unlock;
5767 } 5815
5816 seq_puts(m, buf);
5768 } else { 5817 } else {
5769 path = "/"; 5818 seq_puts(m, "/");
5770 } 5819 }
5771 5820
5772 seq_puts(m, path);
5773
5774 if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp)) 5821 if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp))
5775 seq_puts(m, " (deleted)\n"); 5822 seq_puts(m, " (deleted)\n");
5776 else 5823 else
@@ -6035,8 +6082,9 @@ static void cgroup_release_agent(struct work_struct *work)
6035{ 6082{
6036 struct cgroup *cgrp = 6083 struct cgroup *cgrp =
6037 container_of(work, struct cgroup, release_agent_work); 6084 container_of(work, struct cgroup, release_agent_work);
6038 char *pathbuf = NULL, *agentbuf = NULL, *path; 6085 char *pathbuf = NULL, *agentbuf = NULL;
6039 char *argv[3], *envp[3]; 6086 char *argv[3], *envp[3];
6087 int ret;
6040 6088
6041 mutex_lock(&cgroup_mutex); 6089 mutex_lock(&cgroup_mutex);
6042 6090
@@ -6046,13 +6094,13 @@ static void cgroup_release_agent(struct work_struct *work)
6046 goto out; 6094 goto out;
6047 6095
6048 spin_lock_irq(&css_set_lock); 6096 spin_lock_irq(&css_set_lock);
6049 path = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); 6097 ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
6050 spin_unlock_irq(&css_set_lock); 6098 spin_unlock_irq(&css_set_lock);
6051 if (!path) 6099 if (ret < 0 || ret >= PATH_MAX)
6052 goto out; 6100 goto out;
6053 6101
6054 argv[0] = agentbuf; 6102 argv[0] = agentbuf;
6055 argv[1] = path; 6103 argv[1] = pathbuf;
6056 argv[2] = NULL; 6104 argv[2] = NULL;
6057 6105
6058 /* minimal command environment */ 6106 /* minimal command environment */
@@ -6270,6 +6318,12 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
6270 if (cgroup_sk_alloc_disabled) 6318 if (cgroup_sk_alloc_disabled)
6271 return; 6319 return;
6272 6320
6321 /* Socket clone path */
6322 if (skcd->val) {
6323 cgroup_get(sock_cgroup_ptr(skcd));
6324 return;
6325 }
6326
6273 rcu_read_lock(); 6327 rcu_read_lock();
6274 6328
6275 while (true) { 6329 while (true) {
@@ -6295,6 +6349,16 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd)
6295 6349
6296/* cgroup namespaces */ 6350/* cgroup namespaces */
6297 6351
6352static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns)
6353{
6354 return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES);
6355}
6356
6357static void dec_cgroup_namespaces(struct ucounts *ucounts)
6358{
6359 dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES);
6360}
6361
6298static struct cgroup_namespace *alloc_cgroup_ns(void) 6362static struct cgroup_namespace *alloc_cgroup_ns(void)
6299{ 6363{
6300 struct cgroup_namespace *new_ns; 6364 struct cgroup_namespace *new_ns;
@@ -6316,6 +6380,7 @@ static struct cgroup_namespace *alloc_cgroup_ns(void)
6316void free_cgroup_ns(struct cgroup_namespace *ns) 6380void free_cgroup_ns(struct cgroup_namespace *ns)
6317{ 6381{
6318 put_css_set(ns->root_cset); 6382 put_css_set(ns->root_cset);
6383 dec_cgroup_namespaces(ns->ucounts);
6319 put_user_ns(ns->user_ns); 6384 put_user_ns(ns->user_ns);
6320 ns_free_inum(&ns->ns); 6385 ns_free_inum(&ns->ns);
6321 kfree(ns); 6386 kfree(ns);
@@ -6327,6 +6392,7 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
6327 struct cgroup_namespace *old_ns) 6392 struct cgroup_namespace *old_ns)
6328{ 6393{
6329 struct cgroup_namespace *new_ns; 6394 struct cgroup_namespace *new_ns;
6395 struct ucounts *ucounts;
6330 struct css_set *cset; 6396 struct css_set *cset;
6331 6397
6332 BUG_ON(!old_ns); 6398 BUG_ON(!old_ns);
@@ -6340,6 +6406,10 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
6340 if (!ns_capable(user_ns, CAP_SYS_ADMIN)) 6406 if (!ns_capable(user_ns, CAP_SYS_ADMIN))
6341 return ERR_PTR(-EPERM); 6407 return ERR_PTR(-EPERM);
6342 6408
6409 ucounts = inc_cgroup_namespaces(user_ns);
6410 if (!ucounts)
6411 return ERR_PTR(-ENOSPC);
6412
6343 /* It is not safe to take cgroup_mutex here */ 6413 /* It is not safe to take cgroup_mutex here */
6344 spin_lock_irq(&css_set_lock); 6414 spin_lock_irq(&css_set_lock);
6345 cset = task_css_set(current); 6415 cset = task_css_set(current);
@@ -6349,10 +6419,12 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
6349 new_ns = alloc_cgroup_ns(); 6419 new_ns = alloc_cgroup_ns();
6350 if (IS_ERR(new_ns)) { 6420 if (IS_ERR(new_ns)) {
6351 put_css_set(cset); 6421 put_css_set(cset);
6422 dec_cgroup_namespaces(ucounts);
6352 return new_ns; 6423 return new_ns;
6353 } 6424 }
6354 6425
6355 new_ns->user_ns = get_user_ns(user_ns); 6426 new_ns->user_ns = get_user_ns(user_ns);
6427 new_ns->ucounts = ucounts;
6356 new_ns->root_cset = cset; 6428 new_ns->root_cset = cset;
6357 6429
6358 return new_ns; 6430 return new_ns;
@@ -6403,12 +6475,18 @@ static void cgroupns_put(struct ns_common *ns)
6403 put_cgroup_ns(to_cg_ns(ns)); 6475 put_cgroup_ns(to_cg_ns(ns));
6404} 6476}
6405 6477
6478static struct user_namespace *cgroupns_owner(struct ns_common *ns)
6479{
6480 return to_cg_ns(ns)->user_ns;
6481}
6482
6406const struct proc_ns_operations cgroupns_operations = { 6483const struct proc_ns_operations cgroupns_operations = {
6407 .name = "cgroup", 6484 .name = "cgroup",
6408 .type = CLONE_NEWCGROUP, 6485 .type = CLONE_NEWCGROUP,
6409 .get = cgroupns_get, 6486 .get = cgroupns_get,
6410 .put = cgroupns_put, 6487 .put = cgroupns_put,
6411 .install = cgroupns_install, 6488 .install = cgroupns_install,
6489 .owner = cgroupns_owner,
6412}; 6490};
6413 6491
6414static __init int cgroup_namespaces_init(void) 6492static __init int cgroup_namespaces_init(void)
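
For context on the inc_cgroup_namespaces()/dec_cgroup_namespaces() hunks above: copy_cgroup_ns() now charges every new cgroup namespace against the creator's UCOUNT_CGROUP_NAMESPACES counter and free_cgroup_ns() drops the charge again, so the number of cgroup namespaces is bounded per user within each user namespace. The limit itself belongs to the ucounts infrastructure this series introduces; in the rest of the series it is exposed as a sysctl under /proc/sys/user/ (max_cgroup_namespaces, to the best of my reading). A minimal userspace sketch of the visible failure mode, assuming a libc recent enough to define CLONE_NEWCGROUP; the ENOSPC comes straight from the inc_cgroup_namespaces() error path added above, while EPERM instead means the ns_capable() check failed:

	/* sketch: create a cgroup namespace; fails with ENOSPC once the
	 * per-user ucount limit for cgroup namespaces is exhausted */
	#define _GNU_SOURCE
	#include <sched.h>
	#include <errno.h>
	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		if (unshare(CLONE_NEWCGROUP) == -1) {
			/* ENOSPC maps back to ERR_PTR(-ENOSPC) in copy_cgroup_ns() */
			fprintf(stderr, "unshare(CLONE_NEWCGROUP): %s\n",
				strerror(errno));
			return 1;
		}
		puts("created a new cgroup namespace");
		return 0;
	}
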
diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config
index 9f748ed7bea8..1a8f34f63601 100644
--- a/kernel/configs/android-base.config
+++ b/kernel/configs/android-base.config
@@ -11,7 +11,6 @@ CONFIG_ANDROID_LOW_MEMORY_KILLER=y
11CONFIG_ARMV8_DEPRECATED=y 11CONFIG_ARMV8_DEPRECATED=y
12CONFIG_ASHMEM=y 12CONFIG_ASHMEM=y
13CONFIG_AUDIT=y 13CONFIG_AUDIT=y
14CONFIG_BLK_DEV_DM=y
15CONFIG_BLK_DEV_INITRD=y 14CONFIG_BLK_DEV_INITRD=y
16CONFIG_CGROUPS=y 15CONFIG_CGROUPS=y
17CONFIG_CGROUP_CPUACCT=y 16CONFIG_CGROUP_CPUACCT=y
@@ -19,9 +18,7 @@ CONFIG_CGROUP_DEBUG=y
19CONFIG_CGROUP_FREEZER=y 18CONFIG_CGROUP_FREEZER=y
20CONFIG_CGROUP_SCHED=y 19CONFIG_CGROUP_SCHED=y
21CONFIG_CP15_BARRIER_EMULATION=y 20CONFIG_CP15_BARRIER_EMULATION=y
22CONFIG_DM_CRYPT=y 21CONFIG_DEFAULT_SECURITY_SELINUX=y
23CONFIG_DM_VERITY=y
24CONFIG_DM_VERITY_FEC=y
25CONFIG_EMBEDDED=y 22CONFIG_EMBEDDED=y
26CONFIG_FB=y 23CONFIG_FB=y
27CONFIG_HIGH_RES_TIMERS=y 24CONFIG_HIGH_RES_TIMERS=y
@@ -41,7 +38,6 @@ CONFIG_IPV6=y
41CONFIG_IPV6_MIP6=y 38CONFIG_IPV6_MIP6=y
42CONFIG_IPV6_MULTIPLE_TABLES=y 39CONFIG_IPV6_MULTIPLE_TABLES=y
43CONFIG_IPV6_OPTIMISTIC_DAD=y 40CONFIG_IPV6_OPTIMISTIC_DAD=y
44CONFIG_IPV6_PRIVACY=y
45CONFIG_IPV6_ROUTER_PREF=y 41CONFIG_IPV6_ROUTER_PREF=y
46CONFIG_IPV6_ROUTE_INFO=y 42CONFIG_IPV6_ROUTE_INFO=y
47CONFIG_IP_ADVANCED_ROUTER=y 43CONFIG_IP_ADVANCED_ROUTER=y
@@ -135,6 +131,7 @@ CONFIG_PREEMPT=y
135CONFIG_QUOTA=y 131CONFIG_QUOTA=y
136CONFIG_RTC_CLASS=y 132CONFIG_RTC_CLASS=y
137CONFIG_RT_GROUP_SCHED=y 133CONFIG_RT_GROUP_SCHED=y
134CONFIG_SECCOMP=y
138CONFIG_SECURITY=y 135CONFIG_SECURITY=y
139CONFIG_SECURITY_NETWORK=y 136CONFIG_SECURITY_NETWORK=y
140CONFIG_SECURITY_SELINUX=y 137CONFIG_SECURITY_SELINUX=y
diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config
index e3b953e966d2..297756be369c 100644
--- a/kernel/configs/android-recommended.config
+++ b/kernel/configs/android-recommended.config
@@ -6,12 +6,16 @@
6# CONFIG_PM_WAKELOCKS_GC is not set 6# CONFIG_PM_WAKELOCKS_GC is not set
7# CONFIG_VT is not set 7# CONFIG_VT is not set
8CONFIG_BACKLIGHT_LCD_SUPPORT=y 8CONFIG_BACKLIGHT_LCD_SUPPORT=y
9CONFIG_BLK_DEV_DM=y
9CONFIG_BLK_DEV_LOOP=y 10CONFIG_BLK_DEV_LOOP=y
10CONFIG_BLK_DEV_RAM=y 11CONFIG_BLK_DEV_RAM=y
11CONFIG_BLK_DEV_RAM_SIZE=8192 12CONFIG_BLK_DEV_RAM_SIZE=8192
12CONFIG_COMPACTION=y 13CONFIG_COMPACTION=y
13CONFIG_DEBUG_RODATA=y 14CONFIG_DEBUG_RODATA=y
15CONFIG_DM_CRYPT=y
14CONFIG_DM_UEVENT=y 16CONFIG_DM_UEVENT=y
17CONFIG_DM_VERITY=y
18CONFIG_DM_VERITY_FEC=y
15CONFIG_DRAGONRISE_FF=y 19CONFIG_DRAGONRISE_FF=y
16CONFIG_ENABLE_DEFAULT_TRACERS=y 20CONFIG_ENABLE_DEFAULT_TRACERS=y
17CONFIG_EXT4_FS=y 21CONFIG_EXT4_FS=y
diff --git a/kernel/configs/kvm_guest.config b/kernel/configs/kvm_guest.config
new file mode 100644
index 000000000000..8d9643767142
--- /dev/null
+++ b/kernel/configs/kvm_guest.config
@@ -0,0 +1,32 @@
1CONFIG_NET=y
2CONFIG_NET_CORE=y
3CONFIG_NETDEVICES=y
4CONFIG_BLOCK=y
5CONFIG_BLK_DEV=y
6CONFIG_NETWORK_FILESYSTEMS=y
7CONFIG_INET=y
8CONFIG_TTY=y
9CONFIG_SERIAL_8250=y
10CONFIG_SERIAL_8250_CONSOLE=y
11CONFIG_IP_PNP=y
12CONFIG_IP_PNP_DHCP=y
13CONFIG_BINFMT_ELF=y
14CONFIG_PCI=y
15CONFIG_PCI_MSI=y
16CONFIG_DEBUG_KERNEL=y
17CONFIG_VIRTUALIZATION=y
18CONFIG_HYPERVISOR_GUEST=y
19CONFIG_PARAVIRT=y
20CONFIG_KVM_GUEST=y
21CONFIG_VIRTIO=y
22CONFIG_VIRTIO_PCI=y
23CONFIG_VIRTIO_BLK=y
24CONFIG_VIRTIO_CONSOLE=y
25CONFIG_VIRTIO_NET=y
26CONFIG_9P_FS=y
27CONFIG_NET_9P=y
28CONFIG_NET_9P_VIRTIO=y
29CONFIG_SCSI_LOWLEVEL=y
30CONFIG_SCSI_VIRTIO=y
31CONFIG_VIRTIO_INPUT=y
32CONFIG_DRM_VIRTIO_GPU=y
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 341bf80f80bd..29de1a9352c0 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -23,6 +23,8 @@
23#include <linux/tick.h> 23#include <linux/tick.h>
24#include <linux/irq.h> 24#include <linux/irq.h>
25#include <linux/smpboot.h> 25#include <linux/smpboot.h>
26#include <linux/relay.h>
27#include <linux/slab.h>
26 28
27#include <trace/events/power.h> 29#include <trace/events/power.h>
28#define CREATE_TRACE_POINTS 30#define CREATE_TRACE_POINTS
@@ -37,8 +39,9 @@
37 * @thread: Pointer to the hotplug thread 39 * @thread: Pointer to the hotplug thread
38 * @should_run: Thread should execute 40 * @should_run: Thread should execute
39 * @rollback: Perform a rollback 41 * @rollback: Perform a rollback
40 * @cb_stat: The state for a single callback (install/uninstall) 42 * @single: Single callback invocation
41 * @cb: Single callback function (install/uninstall) 43 * @bringup: Single callback bringup or teardown selector
44 * @cb_state: The state for a single callback (install/uninstall)
42 * @result: Result of the operation 45 * @result: Result of the operation
43 * @done: Signal completion to the issuer of the task 46 * @done: Signal completion to the issuer of the task
44 */ 47 */
@@ -49,8 +52,10 @@ struct cpuhp_cpu_state {
49 struct task_struct *thread; 52 struct task_struct *thread;
50 bool should_run; 53 bool should_run;
51 bool rollback; 54 bool rollback;
55 bool single;
56 bool bringup;
57 struct hlist_node *node;
52 enum cpuhp_state cb_state; 58 enum cpuhp_state cb_state;
53 int (*cb)(unsigned int cpu);
54 int result; 59 int result;
55 struct completion done; 60 struct completion done;
56#endif 61#endif
@@ -68,35 +73,103 @@ static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state);
68 * @cant_stop: Bringup/teardown can't be stopped at this step 73 * @cant_stop: Bringup/teardown can't be stopped at this step
69 */ 74 */
70struct cpuhp_step { 75struct cpuhp_step {
71 const char *name; 76 const char *name;
72 int (*startup)(unsigned int cpu); 77 union {
73 int (*teardown)(unsigned int cpu); 78 int (*single)(unsigned int cpu);
74 bool skip_onerr; 79 int (*multi)(unsigned int cpu,
75 bool cant_stop; 80 struct hlist_node *node);
81 } startup;
82 union {
83 int (*single)(unsigned int cpu);
84 int (*multi)(unsigned int cpu,
85 struct hlist_node *node);
86 } teardown;
87 struct hlist_head list;
88 bool skip_onerr;
89 bool cant_stop;
90 bool multi_instance;
76}; 91};
77 92
78static DEFINE_MUTEX(cpuhp_state_mutex); 93static DEFINE_MUTEX(cpuhp_state_mutex);
79static struct cpuhp_step cpuhp_bp_states[]; 94static struct cpuhp_step cpuhp_bp_states[];
80static struct cpuhp_step cpuhp_ap_states[]; 95static struct cpuhp_step cpuhp_ap_states[];
81 96
97static bool cpuhp_is_ap_state(enum cpuhp_state state)
98{
99 /*
100 * The extra check for CPUHP_TEARDOWN_CPU is only for documentation
101 * purposes as that state is handled explicitly in cpu_down.
102 */
103 return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU;
104}
105
106static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state)
107{
108 struct cpuhp_step *sp;
109
110 sp = cpuhp_is_ap_state(state) ? cpuhp_ap_states : cpuhp_bp_states;
111 return sp + state;
112}
113
82/** 114/**
83 * cpuhp_invoke_callback - Invoke the callbacks for a given state 115 * cpuhp_invoke_callback - Invoke the callbacks for a given state
84 * @cpu: The cpu for which the callback should be invoked 116 * @cpu: The cpu for which the callback should be invoked
85 * @step: The step in the state machine 117 * @step: The step in the state machine
86 * @cb: The callback function to invoke 118 * @bringup: True if the bringup callback should be invoked
87 * 119 *
88 * Called from cpu hotplug and from the state register machinery 120 * Called from cpu hotplug and from the state register machinery.
89 */ 121 */
90static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state step, 122static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
91 int (*cb)(unsigned int)) 123 bool bringup, struct hlist_node *node)
92{ 124{
93 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); 125 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
94 int ret = 0; 126 struct cpuhp_step *step = cpuhp_get_step(state);
95 127 int (*cbm)(unsigned int cpu, struct hlist_node *node);
96 if (cb) { 128 int (*cb)(unsigned int cpu);
97 trace_cpuhp_enter(cpu, st->target, step, cb); 129 int ret, cnt;
130
131 if (!step->multi_instance) {
132 cb = bringup ? step->startup.single : step->teardown.single;
133 if (!cb)
134 return 0;
135 trace_cpuhp_enter(cpu, st->target, state, cb);
98 ret = cb(cpu); 136 ret = cb(cpu);
99 trace_cpuhp_exit(cpu, st->state, step, ret); 137 trace_cpuhp_exit(cpu, st->state, state, ret);
138 return ret;
139 }
140 cbm = bringup ? step->startup.multi : step->teardown.multi;
141 if (!cbm)
142 return 0;
143
144 /* Single invocation for instance add/remove */
145 if (node) {
146 trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
147 ret = cbm(cpu, node);
148 trace_cpuhp_exit(cpu, st->state, state, ret);
149 return ret;
150 }
151
152 /* State transition. Invoke on all instances */
153 cnt = 0;
154 hlist_for_each(node, &step->list) {
155 trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
156 ret = cbm(cpu, node);
157 trace_cpuhp_exit(cpu, st->state, state, ret);
158 if (ret)
159 goto err;
160 cnt++;
161 }
162 return 0;
163err:
164 /* Rollback the instances if one failed */
165 cbm = !bringup ? step->startup.multi : step->teardown.multi;
166 if (!cbm)
167 return ret;
168
169 hlist_for_each(node, &step->list) {
170 if (!cnt--)
171 break;
172 cbm(cpu, node);
100 } 173 }
101 return ret; 174 return ret;
102} 175}
@@ -155,7 +228,7 @@ static struct {
155 .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq), 228 .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq),
156 .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock), 229 .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
157#ifdef CONFIG_DEBUG_LOCK_ALLOC 230#ifdef CONFIG_DEBUG_LOCK_ALLOC
158 .dep_map = {.name = "cpu_hotplug.lock" }, 231 .dep_map = STATIC_LOCKDEP_MAP_INIT("cpu_hotplug.dep_map", &cpu_hotplug.dep_map),
159#endif 232#endif
160}; 233};
161 234
@@ -260,10 +333,17 @@ void cpu_hotplug_disable(void)
260} 333}
261EXPORT_SYMBOL_GPL(cpu_hotplug_disable); 334EXPORT_SYMBOL_GPL(cpu_hotplug_disable);
262 335
336static void __cpu_hotplug_enable(void)
337{
338 if (WARN_ONCE(!cpu_hotplug_disabled, "Unbalanced cpu hotplug enable\n"))
339 return;
340 cpu_hotplug_disabled--;
341}
342
263void cpu_hotplug_enable(void) 343void cpu_hotplug_enable(void)
264{ 344{
265 cpu_maps_update_begin(); 345 cpu_maps_update_begin();
266 WARN_ON(--cpu_hotplug_disabled < 0); 346 __cpu_hotplug_enable();
267 cpu_maps_update_done(); 347 cpu_maps_update_done();
268} 348}
269EXPORT_SYMBOL_GPL(cpu_hotplug_enable); 349EXPORT_SYMBOL_GPL(cpu_hotplug_enable);
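
The new __cpu_hotplug_enable() helper turns an unbalanced enable into a one-time warning instead of driving cpu_hotplug_disabled negative; the caller contract is unchanged in that every cpu_hotplug_disable() must still be paired with exactly one cpu_hotplug_enable(). A hedged usage sketch (all foo_* names are hypothetical):

	#include <linux/cpu.h>

	/* hypothetical per-cpu reprogramming step, stubbed out for the sketch */
	static int foo_reprogram_online_cpus(void)
	{
		return 0;
	}

	static int foo_reconfigure(void)
	{
		int ret;

		cpu_hotplug_disable();		/* holds off cpu_up()/cpu_down() */
		ret = foo_reprogram_online_cpus();
		cpu_hotplug_enable();		/* must balance the disable above */

		return ret;
	}
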
@@ -330,12 +410,6 @@ static int notify_online(unsigned int cpu)
330 return 0; 410 return 0;
331} 411}
332 412
333static int notify_starting(unsigned int cpu)
334{
335 cpu_notify(CPU_STARTING, cpu);
336 return 0;
337}
338
339static int bringup_wait_for_ap(unsigned int cpu) 413static int bringup_wait_for_ap(unsigned int cpu)
340{ 414{
341 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); 415 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
@@ -349,8 +423,16 @@ static int bringup_cpu(unsigned int cpu)
349 struct task_struct *idle = idle_thread_get(cpu); 423 struct task_struct *idle = idle_thread_get(cpu);
350 int ret; 424 int ret;
351 425
426 /*
427 * Some architectures have to walk the irq descriptors to
428 * setup the vector space for the cpu which comes online.
429 * Prevent irq alloc/free across the bringup.
430 */
431 irq_lock_sparse();
432
352 /* Arch-specific enabling code. */ 433 /* Arch-specific enabling code. */
353 ret = __cpu_up(cpu, idle); 434 ret = __cpu_up(cpu, idle);
435 irq_unlock_sparse();
354 if (ret) { 436 if (ret) {
355 cpu_notify(CPU_UP_CANCELED, cpu); 437 cpu_notify(CPU_UP_CANCELED, cpu);
356 return ret; 438 return ret;
@@ -363,62 +445,55 @@ static int bringup_cpu(unsigned int cpu)
363/* 445/*
364 * Hotplug state machine related functions 446 * Hotplug state machine related functions
365 */ 447 */
366static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st, 448static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st)
367 struct cpuhp_step *steps)
368{ 449{
369 for (st->state++; st->state < st->target; st->state++) { 450 for (st->state++; st->state < st->target; st->state++) {
370 struct cpuhp_step *step = steps + st->state; 451 struct cpuhp_step *step = cpuhp_get_step(st->state);
371 452
372 if (!step->skip_onerr) 453 if (!step->skip_onerr)
373 cpuhp_invoke_callback(cpu, st->state, step->startup); 454 cpuhp_invoke_callback(cpu, st->state, true, NULL);
374 } 455 }
375} 456}
376 457
377static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, 458static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
378 struct cpuhp_step *steps, enum cpuhp_state target) 459 enum cpuhp_state target)
379{ 460{
380 enum cpuhp_state prev_state = st->state; 461 enum cpuhp_state prev_state = st->state;
381 int ret = 0; 462 int ret = 0;
382 463
383 for (; st->state > target; st->state--) { 464 for (; st->state > target; st->state--) {
384 struct cpuhp_step *step = steps + st->state; 465 ret = cpuhp_invoke_callback(cpu, st->state, false, NULL);
385
386 ret = cpuhp_invoke_callback(cpu, st->state, step->teardown);
387 if (ret) { 466 if (ret) {
388 st->target = prev_state; 467 st->target = prev_state;
389 undo_cpu_down(cpu, st, steps); 468 undo_cpu_down(cpu, st);
390 break; 469 break;
391 } 470 }
392 } 471 }
393 return ret; 472 return ret;
394} 473}
395 474
396static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st, 475static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st)
397 struct cpuhp_step *steps)
398{ 476{
399 for (st->state--; st->state > st->target; st->state--) { 477 for (st->state--; st->state > st->target; st->state--) {
400 struct cpuhp_step *step = steps + st->state; 478 struct cpuhp_step *step = cpuhp_get_step(st->state);
401 479
402 if (!step->skip_onerr) 480 if (!step->skip_onerr)
403 cpuhp_invoke_callback(cpu, st->state, step->teardown); 481 cpuhp_invoke_callback(cpu, st->state, false, NULL);
404 } 482 }
405} 483}
406 484
407static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, 485static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
408 struct cpuhp_step *steps, enum cpuhp_state target) 486 enum cpuhp_state target)
409{ 487{
410 enum cpuhp_state prev_state = st->state; 488 enum cpuhp_state prev_state = st->state;
411 int ret = 0; 489 int ret = 0;
412 490
413 while (st->state < target) { 491 while (st->state < target) {
414 struct cpuhp_step *step;
415
416 st->state++; 492 st->state++;
417 step = steps + st->state; 493 ret = cpuhp_invoke_callback(cpu, st->state, true, NULL);
418 ret = cpuhp_invoke_callback(cpu, st->state, step->startup);
419 if (ret) { 494 if (ret) {
420 st->target = prev_state; 495 st->target = prev_state;
421 undo_cpu_up(cpu, st, steps); 496 undo_cpu_up(cpu, st);
422 break; 497 break;
423 } 498 }
424 } 499 }
@@ -447,13 +522,13 @@ static int cpuhp_ap_offline(unsigned int cpu, struct cpuhp_cpu_state *st)
447{ 522{
448 enum cpuhp_state target = max((int)st->target, CPUHP_TEARDOWN_CPU); 523 enum cpuhp_state target = max((int)st->target, CPUHP_TEARDOWN_CPU);
449 524
450 return cpuhp_down_callbacks(cpu, st, cpuhp_ap_states, target); 525 return cpuhp_down_callbacks(cpu, st, target);
451} 526}
452 527
453/* Execute the online startup callbacks. Used to be CPU_ONLINE */ 528/* Execute the online startup callbacks. Used to be CPU_ONLINE */
454static int cpuhp_ap_online(unsigned int cpu, struct cpuhp_cpu_state *st) 529static int cpuhp_ap_online(unsigned int cpu, struct cpuhp_cpu_state *st)
455{ 530{
456 return cpuhp_up_callbacks(cpu, st, cpuhp_ap_states, st->target); 531 return cpuhp_up_callbacks(cpu, st, st->target);
457} 532}
458 533
459/* 534/*
@@ -476,18 +551,20 @@ static void cpuhp_thread_fun(unsigned int cpu)
476 st->should_run = false; 551 st->should_run = false;
477 552
478 /* Single callback invocation for [un]install ? */ 553 /* Single callback invocation for [un]install ? */
479 if (st->cb) { 554 if (st->single) {
480 if (st->cb_state < CPUHP_AP_ONLINE) { 555 if (st->cb_state < CPUHP_AP_ONLINE) {
481 local_irq_disable(); 556 local_irq_disable();
482 ret = cpuhp_invoke_callback(cpu, st->cb_state, st->cb); 557 ret = cpuhp_invoke_callback(cpu, st->cb_state,
558 st->bringup, st->node);
483 local_irq_enable(); 559 local_irq_enable();
484 } else { 560 } else {
485 ret = cpuhp_invoke_callback(cpu, st->cb_state, st->cb); 561 ret = cpuhp_invoke_callback(cpu, st->cb_state,
562 st->bringup, st->node);
486 } 563 }
487 } else if (st->rollback) { 564 } else if (st->rollback) {
488 BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE); 565 BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE);
489 566
490 undo_cpu_down(cpu, st, cpuhp_ap_states); 567 undo_cpu_down(cpu, st);
491 /* 568 /*
492 * This is a momentary workaround to keep the notifier users 569 * This is a momentary workaround to keep the notifier users
493 * happy. Will go away once we get rid of the notifiers. 570
@@ -509,8 +586,9 @@ static void cpuhp_thread_fun(unsigned int cpu)
509} 586}
510 587
511/* Invoke a single callback on a remote cpu */ 588/* Invoke a single callback on a remote cpu */
512static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, 589static int
513 int (*cb)(unsigned int)) 590cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup,
591 struct hlist_node *node)
514{ 592{
515 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); 593 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
516 594
@@ -522,10 +600,13 @@ static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state,
522 * we invoke the thread function directly. 600 * we invoke the thread function directly.
523 */ 601 */
524 if (!st->thread) 602 if (!st->thread)
525 return cpuhp_invoke_callback(cpu, state, cb); 603 return cpuhp_invoke_callback(cpu, state, bringup, node);
526 604
527 st->cb_state = state; 605 st->cb_state = state;
528 st->cb = cb; 606 st->single = true;
607 st->bringup = bringup;
608 st->node = node;
609
529 /* 610 /*
530 * Make sure the above stores are visible before should_run becomes 611 * Make sure the above stores are visible before should_run becomes
531 * true. Paired with the mb() above in cpuhp_thread_fun() 612 * true. Paired with the mb() above in cpuhp_thread_fun()
@@ -541,7 +622,7 @@ static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state,
541static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st) 622static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st)
542{ 623{
543 st->result = 0; 624 st->result = 0;
544 st->cb = NULL; 625 st->single = false;
545 /* 626 /*
546 * Make sure the above stores are visible before should_run becomes 627 * Make sure the above stores are visible before should_run becomes
547 * true. Paired with the mb() above in cpuhp_thread_fun() 628 * true. Paired with the mb() above in cpuhp_thread_fun()
@@ -674,12 +755,6 @@ static int notify_down_prepare(unsigned int cpu)
674 return err; 755 return err;
675} 756}
676 757
677static int notify_dying(unsigned int cpu)
678{
679 cpu_notify(CPU_DYING, cpu);
680 return 0;
681}
682
683/* Take this CPU down. */ 758/* Take this CPU down. */
684static int take_cpu_down(void *_param) 759static int take_cpu_down(void *_param)
685{ 760{
@@ -692,12 +767,16 @@ static int take_cpu_down(void *_param)
692 if (err < 0) 767 if (err < 0)
693 return err; 768 return err;
694 769
770 /*
771 * We get here while we are in CPUHP_TEARDOWN_CPU state and we must not
772 * do this step again.
773 */
774 WARN_ON(st->state != CPUHP_TEARDOWN_CPU);
775 st->state--;
695 /* Invoke the former CPU_DYING callbacks */ 776 /* Invoke the former CPU_DYING callbacks */
696 for (; st->state > target; st->state--) { 777 for (; st->state > target; st->state--)
697 struct cpuhp_step *step = cpuhp_ap_states + st->state; 778 cpuhp_invoke_callback(cpu, st->state, false, NULL);
698 779
699 cpuhp_invoke_callback(cpu, st->state, step->teardown);
700 }
701 /* Give up timekeeping duties */ 780 /* Give up timekeeping duties */
702 tick_handover_do_timer(); 781 tick_handover_do_timer();
703 /* Park the stopper thread */ 782 /* Park the stopper thread */
@@ -734,7 +813,7 @@ static int takedown_cpu(unsigned int cpu)
734 BUG_ON(cpu_online(cpu)); 813 BUG_ON(cpu_online(cpu));
735 814
736 /* 815 /*
737 * The migration_call() CPU_DYING callback will have removed all 816 * The CPUHP_AP_SCHED_MIGRATE_DYING callback will have removed all
738 * runnable tasks from the cpu, there's only the idle task left now 817 * runnable tasks from the cpu, there's only the idle task left now
739 * that the migration thread is done doing the stop_machine thing. 818 * that the migration thread is done doing the stop_machine thing.
740 * 819 *
@@ -787,7 +866,6 @@ void cpuhp_report_idle_dead(void)
787#define notify_down_prepare NULL 866#define notify_down_prepare NULL
788#define takedown_cpu NULL 867#define takedown_cpu NULL
789#define notify_dead NULL 868#define notify_dead NULL
790#define notify_dying NULL
791#endif 869#endif
792 870
793#ifdef CONFIG_HOTPLUG_CPU 871#ifdef CONFIG_HOTPLUG_CPU
@@ -836,7 +914,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
836 * The AP brought itself down to CPUHP_TEARDOWN_CPU. So we need 914 * The AP brought itself down to CPUHP_TEARDOWN_CPU. So we need
837 * to do the further cleanups. 915 * to do the further cleanups.
838 */ 916 */
839 ret = cpuhp_down_callbacks(cpu, st, cpuhp_bp_states, target); 917 ret = cpuhp_down_callbacks(cpu, st, target);
840 if (ret && st->state > CPUHP_TEARDOWN_CPU && st->state < prev_state) { 918 if (ret && st->state > CPUHP_TEARDOWN_CPU && st->state < prev_state) {
841 st->target = prev_state; 919 st->target = prev_state;
842 st->rollback = true; 920 st->rollback = true;
@@ -877,10 +955,9 @@ EXPORT_SYMBOL(cpu_down);
877#endif /*CONFIG_HOTPLUG_CPU*/ 955#endif /*CONFIG_HOTPLUG_CPU*/
878 956
879/** 957/**
880 * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers 958 * notify_cpu_starting(cpu) - Invoke the callbacks on the starting CPU
881 * @cpu: cpu that just started 959 * @cpu: cpu that just started
882 * 960 *
883 * This function calls the cpu_chain notifiers with CPU_STARTING.
884 * It must be called by the arch code on the new cpu, before the new cpu 961 * It must be called by the arch code on the new cpu, before the new cpu
885 * enables interrupts and before the "boot" cpu returns from __cpu_up(). 962 * enables interrupts and before the "boot" cpu returns from __cpu_up().
886 */ 963 */
@@ -889,12 +966,10 @@ void notify_cpu_starting(unsigned int cpu)
889 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); 966 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
890 enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE); 967 enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE);
891 968
969 rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. */
892 while (st->state < target) { 970 while (st->state < target) {
893 struct cpuhp_step *step;
894
895 st->state++; 971 st->state++;
896 step = cpuhp_ap_states + st->state; 972 cpuhp_invoke_callback(cpu, st->state, true, NULL);
897 cpuhp_invoke_callback(cpu, st->state, step->startup);
898 } 973 }
899} 974}
900 975
@@ -979,7 +1054,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
979 * responsible for bringing it up to the target state. 1054 * responsible for bringing it up to the target state.
980 */ 1055 */
981 target = min((int)target, CPUHP_BRINGUP_CPU); 1056 target = min((int)target, CPUHP_BRINGUP_CPU);
982 ret = cpuhp_up_callbacks(cpu, st, cpuhp_bp_states, target); 1057 ret = cpuhp_up_callbacks(cpu, st, target);
983out: 1058out:
984 cpu_hotplug_done(); 1059 cpu_hotplug_done();
985 return ret; 1060 return ret;
@@ -1024,12 +1099,13 @@ EXPORT_SYMBOL_GPL(cpu_up);
1024#ifdef CONFIG_PM_SLEEP_SMP 1099#ifdef CONFIG_PM_SLEEP_SMP
1025static cpumask_var_t frozen_cpus; 1100static cpumask_var_t frozen_cpus;
1026 1101
1027int disable_nonboot_cpus(void) 1102int freeze_secondary_cpus(int primary)
1028{ 1103{
1029 int cpu, first_cpu, error = 0; 1104 int cpu, error = 0;
1030 1105
1031 cpu_maps_update_begin(); 1106 cpu_maps_update_begin();
1032 first_cpu = cpumask_first(cpu_online_mask); 1107 if (!cpu_online(primary))
1108 primary = cpumask_first(cpu_online_mask);
1033 /* 1109 /*
1034 * We take down all of the non-boot CPUs in one shot to avoid races 1110 * We take down all of the non-boot CPUs in one shot to avoid races
1035 * with the userspace trying to use the CPU hotplug at the same time 1111 * with the userspace trying to use the CPU hotplug at the same time
@@ -1038,7 +1114,7 @@ int disable_nonboot_cpus(void)
1038 1114
1039 pr_info("Disabling non-boot CPUs ...\n"); 1115 pr_info("Disabling non-boot CPUs ...\n");
1040 for_each_online_cpu(cpu) { 1116 for_each_online_cpu(cpu) {
1041 if (cpu == first_cpu) 1117 if (cpu == primary)
1042 continue; 1118 continue;
1043 trace_suspend_resume(TPS("CPU_OFF"), cpu, true); 1119 trace_suspend_resume(TPS("CPU_OFF"), cpu, true);
1044 error = _cpu_down(cpu, 1, CPUHP_OFFLINE); 1120 error = _cpu_down(cpu, 1, CPUHP_OFFLINE);
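
disable_nonboot_cpus() becomes freeze_secondary_cpus(primary), which lets the caller nominate the CPU that stays online during the freeze instead of always keeping the first online CPU. The old name presumably survives as a thin wrapper in <linux/cpu.h>; a hedged sketch of what such a wrapper looks like:

	/* header-side compatibility sketch: keep the previous behaviour of
	 * treating CPU 0 as the surviving primary */
	static inline int disable_nonboot_cpus(void)
	{
		return freeze_secondary_cpus(0);
	}
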
@@ -1081,7 +1157,7 @@ void enable_nonboot_cpus(void)
1081 1157
1082 /* Allow everyone to use the CPU hotplug again */ 1158 /* Allow everyone to use the CPU hotplug again */
1083 cpu_maps_update_begin(); 1159 cpu_maps_update_begin();
1084 WARN_ON(--cpu_hotplug_disabled < 0); 1160 __cpu_hotplug_enable();
1085 if (cpumask_empty(frozen_cpus)) 1161 if (cpumask_empty(frozen_cpus))
1086 goto out; 1162 goto out;
1087 1163
@@ -1170,40 +1246,50 @@ core_initcall(cpu_hotplug_pm_sync_init);
1170static struct cpuhp_step cpuhp_bp_states[] = { 1246static struct cpuhp_step cpuhp_bp_states[] = {
1171 [CPUHP_OFFLINE] = { 1247 [CPUHP_OFFLINE] = {
1172 .name = "offline", 1248 .name = "offline",
1173 .startup = NULL, 1249 .startup.single = NULL,
1174 .teardown = NULL, 1250 .teardown.single = NULL,
1175 }, 1251 },
1176#ifdef CONFIG_SMP 1252#ifdef CONFIG_SMP
1177 [CPUHP_CREATE_THREADS]= { 1253 [CPUHP_CREATE_THREADS]= {
1178 .name = "threads:create", 1254 .name = "threads:prepare",
1179 .startup = smpboot_create_threads, 1255 .startup.single = smpboot_create_threads,
1180 .teardown = NULL, 1256 .teardown.single = NULL,
1181 .cant_stop = true, 1257 .cant_stop = true,
1182 }, 1258 },
1183 [CPUHP_PERF_PREPARE] = { 1259 [CPUHP_PERF_PREPARE] = {
1184 .name = "perf prepare", 1260 .name = "perf:prepare",
1185 .startup = perf_event_init_cpu, 1261 .startup.single = perf_event_init_cpu,
1186 .teardown = perf_event_exit_cpu, 1262 .teardown.single = perf_event_exit_cpu,
1187 }, 1263 },
1188 [CPUHP_WORKQUEUE_PREP] = { 1264 [CPUHP_WORKQUEUE_PREP] = {
1189 .name = "workqueue prepare", 1265 .name = "workqueue:prepare",
1190 .startup = workqueue_prepare_cpu, 1266 .startup.single = workqueue_prepare_cpu,
1191 .teardown = NULL, 1267 .teardown.single = NULL,
1192 }, 1268 },
1193 [CPUHP_HRTIMERS_PREPARE] = { 1269 [CPUHP_HRTIMERS_PREPARE] = {
1194 .name = "hrtimers prepare", 1270 .name = "hrtimers:prepare",
1195 .startup = hrtimers_prepare_cpu, 1271 .startup.single = hrtimers_prepare_cpu,
1196 .teardown = hrtimers_dead_cpu, 1272 .teardown.single = hrtimers_dead_cpu,
1197 }, 1273 },
1198 [CPUHP_SMPCFD_PREPARE] = { 1274 [CPUHP_SMPCFD_PREPARE] = {
1199 .name = "SMPCFD prepare", 1275 .name = "smpcfd:prepare",
1200 .startup = smpcfd_prepare_cpu, 1276 .startup.single = smpcfd_prepare_cpu,
1201 .teardown = smpcfd_dead_cpu, 1277 .teardown.single = smpcfd_dead_cpu,
1278 },
1279 [CPUHP_RELAY_PREPARE] = {
1280 .name = "relay:prepare",
1281 .startup.single = relay_prepare_cpu,
1282 .teardown.single = NULL,
1283 },
1284 [CPUHP_SLAB_PREPARE] = {
1285 .name = "slab:prepare",
1286 .startup.single = slab_prepare_cpu,
1287 .teardown.single = slab_dead_cpu,
1202 }, 1288 },
1203 [CPUHP_RCUTREE_PREP] = { 1289 [CPUHP_RCUTREE_PREP] = {
1204 .name = "RCU-tree prepare", 1290 .name = "RCU/tree:prepare",
1205 .startup = rcutree_prepare_cpu, 1291 .startup.single = rcutree_prepare_cpu,
1206 .teardown = rcutree_dead_cpu, 1292 .teardown.single = rcutree_dead_cpu,
1207 }, 1293 },
1208 /* 1294 /*
1209 * Preparatory and dead notifiers. Will be replaced once the notifiers 1295 * Preparatory and dead notifiers. Will be replaced once the notifiers
@@ -1211,8 +1297,8 @@ static struct cpuhp_step cpuhp_bp_states[] = {
1211 */ 1297 */
1212 [CPUHP_NOTIFY_PREPARE] = { 1298 [CPUHP_NOTIFY_PREPARE] = {
1213 .name = "notify:prepare", 1299 .name = "notify:prepare",
1214 .startup = notify_prepare, 1300 .startup.single = notify_prepare,
1215 .teardown = notify_dead, 1301 .teardown.single = notify_dead,
1216 .skip_onerr = true, 1302 .skip_onerr = true,
1217 .cant_stop = true, 1303 .cant_stop = true,
1218 }, 1304 },
@@ -1222,20 +1308,21 @@ static struct cpuhp_step cpuhp_bp_states[] = {
1222 * otherwise a RCU stall occurs. 1308 * otherwise a RCU stall occurs.
1223 */ 1309 */
1224 [CPUHP_TIMERS_DEAD] = { 1310 [CPUHP_TIMERS_DEAD] = {
1225 .name = "timers dead", 1311 .name = "timers:dead",
1226 .startup = NULL, 1312 .startup.single = NULL,
1227 .teardown = timers_dead_cpu, 1313 .teardown.single = timers_dead_cpu,
1228 }, 1314 },
1229 /* Kicks the plugged cpu into life */ 1315 /* Kicks the plugged cpu into life */
1230 [CPUHP_BRINGUP_CPU] = { 1316 [CPUHP_BRINGUP_CPU] = {
1231 .name = "cpu:bringup", 1317 .name = "cpu:bringup",
1232 .startup = bringup_cpu, 1318 .startup.single = bringup_cpu,
1233 .teardown = NULL, 1319 .teardown.single = NULL,
1234 .cant_stop = true, 1320 .cant_stop = true,
1235 }, 1321 },
1236 [CPUHP_AP_SMPCFD_DYING] = { 1322 [CPUHP_AP_SMPCFD_DYING] = {
1237 .startup = NULL, 1323 .name = "smpcfd:dying",
1238 .teardown = smpcfd_dying_cpu, 1324 .startup.single = NULL,
1325 .teardown.single = smpcfd_dying_cpu,
1239 }, 1326 },
1240 /* 1327 /*
1241 * Handled on control processor until the plugged processor manages 1328
@@ -1243,8 +1330,8 @@ static struct cpuhp_step cpuhp_bp_states[] = {
1243 */ 1330 */
1244 [CPUHP_TEARDOWN_CPU] = { 1331 [CPUHP_TEARDOWN_CPU] = {
1245 .name = "cpu:teardown", 1332 .name = "cpu:teardown",
1246 .startup = NULL, 1333 .startup.single = NULL,
1247 .teardown = takedown_cpu, 1334 .teardown.single = takedown_cpu,
1248 .cant_stop = true, 1335 .cant_stop = true,
1249 }, 1336 },
1250#else 1337#else
@@ -1270,24 +1357,13 @@ static struct cpuhp_step cpuhp_ap_states[] = {
1270 /* First state is scheduler control. Interrupts are disabled */ 1357 /* First state is scheduler control. Interrupts are disabled */
1271 [CPUHP_AP_SCHED_STARTING] = { 1358 [CPUHP_AP_SCHED_STARTING] = {
1272 .name = "sched:starting", 1359 .name = "sched:starting",
1273 .startup = sched_cpu_starting, 1360 .startup.single = sched_cpu_starting,
1274 .teardown = sched_cpu_dying, 1361 .teardown.single = sched_cpu_dying,
1275 }, 1362 },
1276 [CPUHP_AP_RCUTREE_DYING] = { 1363 [CPUHP_AP_RCUTREE_DYING] = {
1277 .startup = NULL, 1364 .name = "RCU/tree:dying",
1278 .teardown = rcutree_dying_cpu, 1365 .startup.single = NULL,
1279 }, 1366 .teardown.single = rcutree_dying_cpu,
1280 /*
1281 * Low level startup/teardown notifiers. Run with interrupts
1282 * disabled. Will be removed once the notifiers are converted to
1283 * states.
1284 */
1285 [CPUHP_AP_NOTIFY_STARTING] = {
1286 .name = "notify:starting",
1287 .startup = notify_starting,
1288 .teardown = notify_dying,
1289 .skip_onerr = true,
1290 .cant_stop = true,
1291 }, 1367 },
1292 /* Entry state on starting. Interrupts enabled from here on. Transient 1368 /* Entry state on starting. Interrupts enabled from here on. Transient
1293 * state for synchronization */ 1369
@@ -1296,24 +1372,24 @@ static struct cpuhp_step cpuhp_ap_states[] = {
1296 }, 1372 },
1297 /* Handle smpboot threads park/unpark */ 1373 /* Handle smpboot threads park/unpark */
1298 [CPUHP_AP_SMPBOOT_THREADS] = { 1374 [CPUHP_AP_SMPBOOT_THREADS] = {
1299 .name = "smpboot:threads", 1375 .name = "smpboot/threads:online",
1300 .startup = smpboot_unpark_threads, 1376 .startup.single = smpboot_unpark_threads,
1301 .teardown = NULL, 1377 .teardown.single = NULL,
1302 }, 1378 },
1303 [CPUHP_AP_PERF_ONLINE] = { 1379 [CPUHP_AP_PERF_ONLINE] = {
1304 .name = "perf online", 1380 .name = "perf:online",
1305 .startup = perf_event_init_cpu, 1381 .startup.single = perf_event_init_cpu,
1306 .teardown = perf_event_exit_cpu, 1382 .teardown.single = perf_event_exit_cpu,
1307 }, 1383 },
1308 [CPUHP_AP_WORKQUEUE_ONLINE] = { 1384 [CPUHP_AP_WORKQUEUE_ONLINE] = {
1309 .name = "workqueue online", 1385 .name = "workqueue:online",
1310 .startup = workqueue_online_cpu, 1386 .startup.single = workqueue_online_cpu,
1311 .teardown = workqueue_offline_cpu, 1387 .teardown.single = workqueue_offline_cpu,
1312 }, 1388 },
1313 [CPUHP_AP_RCUTREE_ONLINE] = { 1389 [CPUHP_AP_RCUTREE_ONLINE] = {
1314 .name = "RCU-tree online", 1390 .name = "RCU/tree:online",
1315 .startup = rcutree_online_cpu, 1391 .startup.single = rcutree_online_cpu,
1316 .teardown = rcutree_offline_cpu, 1392 .teardown.single = rcutree_offline_cpu,
1317 }, 1393 },
1318 1394
1319 /* 1395 /*
@@ -1322,8 +1398,8 @@ static struct cpuhp_step cpuhp_ap_states[] = {
1322 */ 1398 */
1323 [CPUHP_AP_NOTIFY_ONLINE] = { 1399 [CPUHP_AP_NOTIFY_ONLINE] = {
1324 .name = "notify:online", 1400 .name = "notify:online",
1325 .startup = notify_online, 1401 .startup.single = notify_online,
1326 .teardown = notify_down_prepare, 1402 .teardown.single = notify_down_prepare,
1327 .skip_onerr = true, 1403 .skip_onerr = true,
1328 }, 1404 },
1329#endif 1405#endif
@@ -1335,16 +1411,16 @@ static struct cpuhp_step cpuhp_ap_states[] = {
1335 /* Last state is scheduler control setting the cpu active */ 1411 /* Last state is scheduler control setting the cpu active */
1336 [CPUHP_AP_ACTIVE] = { 1412 [CPUHP_AP_ACTIVE] = {
1337 .name = "sched:active", 1413 .name = "sched:active",
1338 .startup = sched_cpu_activate, 1414 .startup.single = sched_cpu_activate,
1339 .teardown = sched_cpu_deactivate, 1415 .teardown.single = sched_cpu_deactivate,
1340 }, 1416 },
1341#endif 1417#endif
1342 1418
1343 /* CPU is fully up and running. */ 1419 /* CPU is fully up and running. */
1344 [CPUHP_ONLINE] = { 1420 [CPUHP_ONLINE] = {
1345 .name = "online", 1421 .name = "online",
1346 .startup = NULL, 1422 .startup.single = NULL,
1347 .teardown = NULL, 1423 .teardown.single = NULL,
1348 }, 1424 },
1349}; 1425};
1350 1426
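
These tables are the static, compile-time half of the state machine; with cpuhp_get_step() every invocation now goes through cpuhp_invoke_callback() instead of passing the array around, and the names follow a uniform "subsystem:phase" scheme. Subsystems that are not part of the static tables hook in through the setup helpers; a hedged sketch using the cpuhp_setup_state() wrapper from <linux/cpuhotplug.h> with a dynamically allocated online state (foo_* is hypothetical):

	#include <linux/cpuhotplug.h>
	#include <linux/init.h>

	static int foo_cpu_online(unsigned int cpu)
	{
		/* per-cpu bring-up for the hypothetical foo subsystem */
		return 0;
	}

	static int foo_cpu_offline(unsigned int cpu)
	{
		/* undo foo_cpu_online(); teardown must not fail */
		return 0;
	}

	static int __init foo_init(void)
	{
		int ret;

		/* CPUHP_AP_ONLINE_DYN picks a free slot in the online section */
		ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "foo:online",
					foo_cpu_online, foo_cpu_offline);
		return ret < 0 ? ret : 0;
	}
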
@@ -1356,54 +1432,42 @@ static int cpuhp_cb_check(enum cpuhp_state state)
1356 return 0; 1432 return 0;
1357} 1433}
1358 1434
1359static bool cpuhp_is_ap_state(enum cpuhp_state state)
1360{
1361 /*
1362 * The extra check for CPUHP_TEARDOWN_CPU is only for documentation
1363 * purposes as that state is handled explicitely in cpu_down.
1364 */
1365 return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU;
1366}
1367
1368static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state)
1369{
1370 struct cpuhp_step *sp;
1371
1372 sp = cpuhp_is_ap_state(state) ? cpuhp_ap_states : cpuhp_bp_states;
1373 return sp + state;
1374}
1375
1376static void cpuhp_store_callbacks(enum cpuhp_state state, 1435static void cpuhp_store_callbacks(enum cpuhp_state state,
1377 const char *name, 1436 const char *name,
1378 int (*startup)(unsigned int cpu), 1437 int (*startup)(unsigned int cpu),
1379 int (*teardown)(unsigned int cpu)) 1438 int (*teardown)(unsigned int cpu),
1439 bool multi_instance)
1380{ 1440{
1381 /* (Un)Install the callbacks for further cpu hotplug operations */ 1441 /* (Un)Install the callbacks for further cpu hotplug operations */
1382 struct cpuhp_step *sp; 1442 struct cpuhp_step *sp;
1383 1443
1384 mutex_lock(&cpuhp_state_mutex); 1444 mutex_lock(&cpuhp_state_mutex);
1385 sp = cpuhp_get_step(state); 1445 sp = cpuhp_get_step(state);
1386 sp->startup = startup; 1446 sp->startup.single = startup;
1387 sp->teardown = teardown; 1447 sp->teardown.single = teardown;
1388 sp->name = name; 1448 sp->name = name;
1449 sp->multi_instance = multi_instance;
1450 INIT_HLIST_HEAD(&sp->list);
1389 mutex_unlock(&cpuhp_state_mutex); 1451 mutex_unlock(&cpuhp_state_mutex);
1390} 1452}
1391 1453
1392static void *cpuhp_get_teardown_cb(enum cpuhp_state state) 1454static void *cpuhp_get_teardown_cb(enum cpuhp_state state)
1393{ 1455{
1394 return cpuhp_get_step(state)->teardown; 1456 return cpuhp_get_step(state)->teardown.single;
1395} 1457}
1396 1458
1397/* 1459/*
1398 * Call the startup/teardown function for a step either on the AP or 1460 * Call the startup/teardown function for a step either on the AP or
1399 * on the current CPU. 1461 * on the current CPU.
1400 */ 1462 */
1401static int cpuhp_issue_call(int cpu, enum cpuhp_state state, 1463static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup,
1402 int (*cb)(unsigned int), bool bringup) 1464 struct hlist_node *node)
1403{ 1465{
1466 struct cpuhp_step *sp = cpuhp_get_step(state);
1404 int ret; 1467 int ret;
1405 1468
1406 if (!cb) 1469 if ((bringup && !sp->startup.single) ||
1470 (!bringup && !sp->teardown.single))
1407 return 0; 1471 return 0;
1408 /* 1472 /*
1409 * The non AP bound callbacks can fail on bringup. On teardown 1473 * The non AP bound callbacks can fail on bringup. On teardown
@@ -1411,11 +1475,11 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state,
1411 */ 1475 */
1412#ifdef CONFIG_SMP 1476#ifdef CONFIG_SMP
1413 if (cpuhp_is_ap_state(state)) 1477 if (cpuhp_is_ap_state(state))
1414 ret = cpuhp_invoke_ap_callback(cpu, state, cb); 1478 ret = cpuhp_invoke_ap_callback(cpu, state, bringup, node);
1415 else 1479 else
1416 ret = cpuhp_invoke_callback(cpu, state, cb); 1480 ret = cpuhp_invoke_callback(cpu, state, bringup, node);
1417#else 1481#else
1418 ret = cpuhp_invoke_callback(cpu, state, cb); 1482 ret = cpuhp_invoke_callback(cpu, state, bringup, node);
1419#endif 1483#endif
1420 BUG_ON(ret && !bringup); 1484 BUG_ON(ret && !bringup);
1421 return ret; 1485 return ret;
@@ -1427,13 +1491,10 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state,
1427 * Note: The teardown callbacks for rollback are not allowed to fail! 1491 * Note: The teardown callbacks for rollback are not allowed to fail!
1428 */ 1492 */
1429static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state, 1493static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state,
1430 int (*teardown)(unsigned int cpu)) 1494 struct hlist_node *node)
1431{ 1495{
1432 int cpu; 1496 int cpu;
1433 1497
1434 if (!teardown)
1435 return;
1436
1437 /* Roll back the already executed steps on the other cpus */ 1498 /* Roll back the already executed steps on the other cpus */
1438 for_each_present_cpu(cpu) { 1499 for_each_present_cpu(cpu) {
1439 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); 1500 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
@@ -1444,7 +1505,7 @@ static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state,
1444 1505
1445 /* Did we invoke the startup call on that cpu ? */ 1506 /* Did we invoke the startup call on that cpu ? */
1446 if (cpustate >= state) 1507 if (cpustate >= state)
1447 cpuhp_issue_call(cpu, state, teardown, false); 1508 cpuhp_issue_call(cpu, state, false, node);
1448 } 1509 }
1449} 1510}
1450 1511
@@ -1471,6 +1532,52 @@ static int cpuhp_reserve_state(enum cpuhp_state state)
1471 return -ENOSPC; 1532 return -ENOSPC;
1472} 1533}
1473 1534
1535int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node,
1536 bool invoke)
1537{
1538 struct cpuhp_step *sp;
1539 int cpu;
1540 int ret;
1541
1542 sp = cpuhp_get_step(state);
1543 if (sp->multi_instance == false)
1544 return -EINVAL;
1545
1546 get_online_cpus();
1547
1548 if (!invoke || !sp->startup.multi)
1549 goto add_node;
1550
1551 /*
1552 * Try to call the startup callback for each present cpu
1553 * depending on the hotplug state of the cpu.
1554 */
1555 for_each_present_cpu(cpu) {
1556 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
1557 int cpustate = st->state;
1558
1559 if (cpustate < state)
1560 continue;
1561
1562 ret = cpuhp_issue_call(cpu, state, true, node);
1563 if (ret) {
1564 if (sp->teardown.multi)
1565 cpuhp_rollback_install(cpu, state, node);
1566 goto err;
1567 }
1568 }
1569add_node:
1570 ret = 0;
1571 mutex_lock(&cpuhp_state_mutex);
1572 hlist_add_head(node, &sp->list);
1573 mutex_unlock(&cpuhp_state_mutex);
1574
1575err:
1576 put_online_cpus();
1577 return ret;
1578}
1579EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance);
1580
1474/** 1581/**
1475 * __cpuhp_setup_state - Setup the callbacks for an hotplug machine state 1582 * __cpuhp_setup_state - Setup the callbacks for an hotplug machine state
1476 * @state: The state to setup 1583 * @state: The state to setup
@@ -1484,7 +1591,8 @@ static int cpuhp_reserve_state(enum cpuhp_state state)
1484int __cpuhp_setup_state(enum cpuhp_state state, 1591int __cpuhp_setup_state(enum cpuhp_state state,
1485 const char *name, bool invoke, 1592 const char *name, bool invoke,
1486 int (*startup)(unsigned int cpu), 1593 int (*startup)(unsigned int cpu),
1487 int (*teardown)(unsigned int cpu)) 1594 int (*teardown)(unsigned int cpu),
1595 bool multi_instance)
1488{ 1596{
1489 int cpu, ret = 0; 1597 int cpu, ret = 0;
1490 int dyn_state = 0; 1598 int dyn_state = 0;
@@ -1503,7 +1611,7 @@ int __cpuhp_setup_state(enum cpuhp_state state,
1503 state = ret; 1611 state = ret;
1504 } 1612 }
1505 1613
1506 cpuhp_store_callbacks(state, name, startup, teardown); 1614 cpuhp_store_callbacks(state, name, startup, teardown, multi_instance);
1507 1615
1508 if (!invoke || !startup) 1616 if (!invoke || !startup)
1509 goto out; 1617 goto out;
@@ -1519,10 +1627,11 @@ int __cpuhp_setup_state(enum cpuhp_state state,
1519 if (cpustate < state) 1627 if (cpustate < state)
1520 continue; 1628 continue;
1521 1629
1522 ret = cpuhp_issue_call(cpu, state, startup, true); 1630 ret = cpuhp_issue_call(cpu, state, true, NULL);
1523 if (ret) { 1631 if (ret) {
1524 cpuhp_rollback_install(cpu, state, teardown); 1632 if (teardown)
1525 cpuhp_store_callbacks(state, NULL, NULL, NULL); 1633 cpuhp_rollback_install(cpu, state, NULL);
1634 cpuhp_store_callbacks(state, NULL, NULL, NULL, false);
1526 goto out; 1635 goto out;
1527 } 1636 }
1528 } 1637 }
@@ -1534,6 +1643,42 @@ out:
1534} 1643}
1535EXPORT_SYMBOL(__cpuhp_setup_state); 1644EXPORT_SYMBOL(__cpuhp_setup_state);
1536 1645
1646int __cpuhp_state_remove_instance(enum cpuhp_state state,
1647 struct hlist_node *node, bool invoke)
1648{
1649 struct cpuhp_step *sp = cpuhp_get_step(state);
1650 int cpu;
1651
1652 BUG_ON(cpuhp_cb_check(state));
1653
1654 if (!sp->multi_instance)
1655 return -EINVAL;
1656
1657 get_online_cpus();
1658 if (!invoke || !cpuhp_get_teardown_cb(state))
1659 goto remove;
1660 /*
1661 * Call the teardown callback for each present cpu depending
1662 * on the hotplug state of the cpu. This function is not
1663 * allowed to fail currently!
1664 */
1665 for_each_present_cpu(cpu) {
1666 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
1667 int cpustate = st->state;
1668
1669 if (cpustate >= state)
1670 cpuhp_issue_call(cpu, state, false, node);
1671 }
1672
1673remove:
1674 mutex_lock(&cpuhp_state_mutex);
1675 hlist_del(node);
1676 mutex_unlock(&cpuhp_state_mutex);
1677 put_online_cpus();
1678
1679 return 0;
1680}
1681EXPORT_SYMBOL_GPL(__cpuhp_state_remove_instance);
1537/** 1682/**
1538 * __cpuhp_remove_state - Remove the callbacks for an hotplug machine state 1683 * __cpuhp_remove_state - Remove the callbacks for an hotplug machine state
1539 * @state: The state to remove 1684 * @state: The state to remove
@@ -1545,14 +1690,21 @@ EXPORT_SYMBOL(__cpuhp_setup_state);
1545 */ 1690 */
1546void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) 1691void __cpuhp_remove_state(enum cpuhp_state state, bool invoke)
1547{ 1692{
1548 int (*teardown)(unsigned int cpu) = cpuhp_get_teardown_cb(state); 1693 struct cpuhp_step *sp = cpuhp_get_step(state);
1549 int cpu; 1694 int cpu;
1550 1695
1551 BUG_ON(cpuhp_cb_check(state)); 1696 BUG_ON(cpuhp_cb_check(state));
1552 1697
1553 get_online_cpus(); 1698 get_online_cpus();
1554 1699
1555 if (!invoke || !teardown) 1700 if (sp->multi_instance) {
1701 WARN(!hlist_empty(&sp->list),
1702 "Error: Removing state %d which has instances left.\n",
1703 state);
1704 goto remove;
1705 }
1706
1707 if (!invoke || !cpuhp_get_teardown_cb(state))
1556 goto remove; 1708 goto remove;
1557 1709
1558 /* 1710 /*
@@ -1565,10 +1717,10 @@ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke)
1565 int cpustate = st->state; 1717 int cpustate = st->state;
1566 1718
1567 if (cpustate >= state) 1719 if (cpustate >= state)
1568 cpuhp_issue_call(cpu, state, teardown, false); 1720 cpuhp_issue_call(cpu, state, false, NULL);
1569 } 1721 }
1570remove: 1722remove:
1571 cpuhp_store_callbacks(state, NULL, NULL, NULL); 1723 cpuhp_store_callbacks(state, NULL, NULL, NULL, false);
1572 put_online_cpus(); 1724 put_online_cpus();
1573} 1725}
1574EXPORT_SYMBOL(__cpuhp_remove_state); 1726EXPORT_SYMBOL(__cpuhp_remove_state);
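
The two new exports are what make the multi_instance flag above usable: a state is registered once with per-instance callbacks, and each device instance is then linked in through its hlist_node, with the startup callback replayed on every CPU that is already past that state. A hedged sketch of the intended driver-side usage, assuming the cpuhp_setup_state_multi()/cpuhp_state_add_instance() wrappers that accompany these exports in <linux/cpuhotplug.h> (foo_* is hypothetical):

	#include <linux/cpuhotplug.h>
	#include <linux/init.h>
	#include <linux/list.h>
	#include <linux/printk.h>

	struct foo_device {
		struct hlist_node node;	/* links this instance into step->list */
		/* ... per-device state ... */
	};

	static enum cpuhp_state foo_online_state;

	static int foo_cpu_online(unsigned int cpu, struct hlist_node *node)
	{
		struct foo_device *foo = hlist_entry_safe(node, struct foo_device, node);

		pr_debug("foo %p: setting up cpu %u\n", foo, cpu);
		return 0;
	}

	static int foo_cpu_dead(unsigned int cpu, struct hlist_node *node)
	{
		struct foo_device *foo = hlist_entry_safe(node, struct foo_device, node);

		pr_debug("foo %p: cpu %u went down\n", foo, cpu);
		return 0;
	}

	static int __init foo_init(void)
	{
		int ret;

		ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "foo:online",
					      foo_cpu_online, foo_cpu_dead);
		if (ret < 0)
			return ret;
		foo_online_state = ret;	/* dynamic setup returns the allocated state */
		return 0;
	}

	/* per probed device: replays foo_cpu_online() on all CPUs already online */
	static int foo_probe_one(struct foo_device *foo)
	{
		return cpuhp_state_add_instance(foo_online_state, &foo->node);
	}
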
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index c27e53326bef..29f815d2ef7e 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -325,8 +325,7 @@ static struct file_system_type cpuset_fs_type = {
325/* 325/*
326 * Return in pmask the portion of a cpusets's cpus_allowed that 326 * Return in pmask the portion of a cpusets's cpus_allowed that
327 * are online. If none are online, walk up the cpuset hierarchy 327 * are online. If none are online, walk up the cpuset hierarchy
328 * until we find one that does have some online cpus. The top 328 * until we find one that does have some online cpus.
329 * cpuset always has some cpus online.
330 * 329 *
331 * One way or another, we guarantee to return some non-empty subset 330 * One way or another, we guarantee to return some non-empty subset
332 * of cpu_online_mask. 331 * of cpu_online_mask.
@@ -335,8 +334,20 @@ static struct file_system_type cpuset_fs_type = {
335 */ 334 */
336static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) 335static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
337{ 336{
338 while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) 337 while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) {
339 cs = parent_cs(cs); 338 cs = parent_cs(cs);
339 if (unlikely(!cs)) {
340 /*
341 * The top cpuset doesn't have any online cpu as a
342 * consequence of a race between cpuset_hotplug_work
343 * and cpu hotplug notifier. But we know the top
344 * cpuset's effective_cpus is on its way to becoming
345 * identical to cpu_online_mask.
346 */
347 cpumask_copy(pmask, cpu_online_mask);
348 return;
349 }
350 }
340 cpumask_and(pmask, cs->effective_cpus, cpu_online_mask); 351 cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
341} 352}
342 353
@@ -2074,7 +2085,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
2074 * which could have been changed by cpuset just after it inherits the 2085 * which could have been changed by cpuset just after it inherits the
2075 * state from the parent and before it sits on the cgroup's task list. 2086 * state from the parent and before it sits on the cgroup's task list.
2076 */ 2087 */
2077void cpuset_fork(struct task_struct *task) 2088static void cpuset_fork(struct task_struct *task)
2078{ 2089{
2079 if (task_css_is_root(task, cpuset_cgrp_id)) 2090 if (task_css_is_root(task, cpuset_cgrp_id))
2080 return; 2091 return;
@@ -2704,7 +2715,7 @@ void __cpuset_memory_pressure_bump(void)
2704int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, 2715int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
2705 struct pid *pid, struct task_struct *tsk) 2716 struct pid *pid, struct task_struct *tsk)
2706{ 2717{
2707 char *buf, *p; 2718 char *buf;
2708 struct cgroup_subsys_state *css; 2719 struct cgroup_subsys_state *css;
2709 int retval; 2720 int retval;
2710 2721
@@ -2713,14 +2724,15 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
2713 if (!buf) 2724 if (!buf)
2714 goto out; 2725 goto out;
2715 2726
2716 retval = -ENAMETOOLONG;
2717 css = task_get_css(tsk, cpuset_cgrp_id); 2727 css = task_get_css(tsk, cpuset_cgrp_id);
2718 p = cgroup_path_ns(css->cgroup, buf, PATH_MAX, 2728 retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX,
2719 current->nsproxy->cgroup_ns); 2729 current->nsproxy->cgroup_ns);
2720 css_put(css); 2730 css_put(css);
2721 if (!p) 2731 if (retval >= PATH_MAX)
2732 retval = -ENAMETOOLONG;
2733 if (retval < 0)
2722 goto out_free; 2734 goto out_free;
2723 seq_puts(m, p); 2735 seq_puts(m, buf);
2724 seq_putc(m, '\n'); 2736 seq_putc(m, '\n');
2725 retval = 0; 2737 retval = 0;
2726out_free: 2738out_free:
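
Both cpuset hunks follow the same conversion as the cgroup.c hunk earlier in the patch: cgroup_path_ns() and its _locked variant now return an int with strlcpy-like semantics rather than a pointer or NULL, so callers check for a negative errno and for truncation (a return value of buflen or more). A hedged sketch of the resulting caller pattern (foo_* is hypothetical):

	#include <linux/cgroup.h>
	#include <linux/errno.h>
	#include <linux/limits.h>
	#include <linux/nsproxy.h>
	#include <linux/sched.h>
	#include <linux/seq_file.h>
	#include <linux/slab.h>

	static int foo_show_cgroup_path(struct seq_file *m, struct cgroup *cgrp)
	{
		char *buf;
		int len;

		buf = kmalloc(PATH_MAX, GFP_KERNEL);
		if (!buf)
			return -ENOMEM;

		len = cgroup_path_ns(cgrp, buf, PATH_MAX,
				     current->nsproxy->cgroup_ns);
		if (len >= PATH_MAX)		/* did not fit: truncated */
			len = -ENAMETOOLONG;
		if (len < 0)
			goto out_free;

		seq_puts(m, buf);
		seq_putc(m, '\n');
		len = 0;
	out_free:
		kfree(buf);
		return len;
	}
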
diff --git a/kernel/events/core.c b/kernel/events/core.c
index a54f2c2cdb20..6ee1febdf6ff 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -902,6 +902,17 @@ list_update_cgroup_event(struct perf_event *event,
902 * this will always be called from the right CPU. 902 * this will always be called from the right CPU.
903 */ 903 */
904 cpuctx = __get_cpu_context(ctx); 904 cpuctx = __get_cpu_context(ctx);
905
906 /* Only set/clear cpuctx->cgrp if current task uses event->cgrp. */
907 if (perf_cgroup_from_task(current, ctx) != event->cgrp) {
908 /*
909 * We are removing the last cpu event in this context.
910 * If that event is not active in this cpu, cpuctx->cgrp
911 * should've been cleared by perf_cgroup_switch.
912 */
913 WARN_ON_ONCE(!add && cpuctx->cgrp);
914 return;
915 }
905 cpuctx->cgrp = add ? event->cgrp : NULL; 916 cpuctx->cgrp = add ? event->cgrp : NULL;
906} 917}
907 918
@@ -1475,8 +1486,7 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
1475 if (event->group_leader == event) { 1486 if (event->group_leader == event) {
1476 struct list_head *list; 1487 struct list_head *list;
1477 1488
1478 if (is_software_event(event)) 1489 event->group_caps = event->event_caps;
1479 event->group_flags |= PERF_GROUP_SOFTWARE;
1480 1490
1481 list = ctx_group_list(event, ctx); 1491 list = ctx_group_list(event, ctx);
1482 list_add_tail(&event->group_entry, list); 1492 list_add_tail(&event->group_entry, list);
@@ -1630,9 +1640,7 @@ static void perf_group_attach(struct perf_event *event)
1630 1640
1631 WARN_ON_ONCE(group_leader->ctx != event->ctx); 1641 WARN_ON_ONCE(group_leader->ctx != event->ctx);
1632 1642
1633 if (group_leader->group_flags & PERF_GROUP_SOFTWARE && 1643 group_leader->group_caps &= event->event_caps;
1634 !is_software_event(event))
1635 group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
1636 1644
1637 list_add_tail(&event->group_entry, &group_leader->sibling_list); 1645 list_add_tail(&event->group_entry, &group_leader->sibling_list);
1638 group_leader->nr_siblings++; 1646 group_leader->nr_siblings++;
@@ -1723,7 +1731,7 @@ static void perf_group_detach(struct perf_event *event)
1723 sibling->group_leader = sibling; 1731 sibling->group_leader = sibling;
1724 1732
1725 /* Inherit group flags from the previous leader */ 1733 /* Inherit group flags from the previous leader */
1726 sibling->group_flags = event->group_flags; 1734 sibling->group_caps = event->group_caps;
1727 1735
1728 WARN_ON_ONCE(sibling->ctx != event->ctx); 1736 WARN_ON_ONCE(sibling->ctx != event->ctx);
1729 } 1737 }
@@ -1832,6 +1840,8 @@ group_sched_out(struct perf_event *group_event,
1832 struct perf_event *event; 1840 struct perf_event *event;
1833 int state = group_event->state; 1841 int state = group_event->state;
1834 1842
1843 perf_pmu_disable(ctx->pmu);
1844
1835 event_sched_out(group_event, cpuctx, ctx); 1845 event_sched_out(group_event, cpuctx, ctx);
1836 1846
1837 /* 1847 /*
@@ -1840,6 +1850,8 @@ group_sched_out(struct perf_event *group_event,
1840 list_for_each_entry(event, &group_event->sibling_list, group_entry) 1850 list_for_each_entry(event, &group_event->sibling_list, group_entry)
1841 event_sched_out(event, cpuctx, ctx); 1851 event_sched_out(event, cpuctx, ctx);
1842 1852
1853 perf_pmu_enable(ctx->pmu);
1854
1843 if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive) 1855 if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
1844 cpuctx->exclusive = 0; 1856 cpuctx->exclusive = 0;
1845} 1857}
@@ -1959,6 +1971,12 @@ void perf_event_disable(struct perf_event *event)
1959} 1971}
1960EXPORT_SYMBOL_GPL(perf_event_disable); 1972EXPORT_SYMBOL_GPL(perf_event_disable);
1961 1973
1974void perf_event_disable_inatomic(struct perf_event *event)
1975{
1976 event->pending_disable = 1;
1977 irq_work_queue(&event->pending);
1978}
1979
1962static void perf_set_shadow_time(struct perf_event *event, 1980static void perf_set_shadow_time(struct perf_event *event,
1963 struct perf_event_context *ctx, 1981 struct perf_event_context *ctx,
1964 u64 tstamp) 1982 u64 tstamp)
@@ -2145,7 +2163,7 @@ static int group_can_go_on(struct perf_event *event,
2145 /* 2163 /*
2146 * Groups consisting entirely of software events can always go on. 2164 * Groups consisting entirely of software events can always go on.
2147 */ 2165 */
2148 if (event->group_flags & PERF_GROUP_SOFTWARE) 2166 if (event->group_caps & PERF_EV_CAP_SOFTWARE)
2149 return 1; 2167 return 1;
2150 /* 2168 /*
2151 * If an exclusive group is already on, no other hardware 2169 * If an exclusive group is already on, no other hardware
@@ -2491,7 +2509,7 @@ static int __perf_event_stop(void *info)
2491 * while restarting. 2509 * while restarting.
2492 */ 2510 */
2493 if (sd->restart) 2511 if (sd->restart)
2494 event->pmu->start(event, PERF_EF_START); 2512 event->pmu->start(event, 0);
2495 2513
2496 return 0; 2514 return 0;
2497} 2515}
@@ -2837,19 +2855,36 @@ unlock:
2837 } 2855 }
2838} 2856}
2839 2857
2858static DEFINE_PER_CPU(struct list_head, sched_cb_list);
2859
2840void perf_sched_cb_dec(struct pmu *pmu) 2860void perf_sched_cb_dec(struct pmu *pmu)
2841{ 2861{
2862 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2863
2842 this_cpu_dec(perf_sched_cb_usages); 2864 this_cpu_dec(perf_sched_cb_usages);
2865
2866 if (!--cpuctx->sched_cb_usage)
2867 list_del(&cpuctx->sched_cb_entry);
2843} 2868}
2844 2869
2870
2845void perf_sched_cb_inc(struct pmu *pmu) 2871void perf_sched_cb_inc(struct pmu *pmu)
2846{ 2872{
2873 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2874
2875 if (!cpuctx->sched_cb_usage++)
2876 list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
2877
2847 this_cpu_inc(perf_sched_cb_usages); 2878 this_cpu_inc(perf_sched_cb_usages);
2848} 2879}
2849 2880
2850/* 2881/*
2851 * This function provides the context switch callback to the lower code 2882 * This function provides the context switch callback to the lower code
2852 * layer. It is invoked ONLY when the context switch callback is enabled. 2883 * layer. It is invoked ONLY when the context switch callback is enabled.
2884 *
2885 * This callback is relevant even to per-cpu events; for example multi event
2886 * PEBS requires this to provide PID/TID information. This requires we flush
2887 * all queued PEBS records before we context switch to a new task.
2853 */ 2888 */
2854static void perf_pmu_sched_task(struct task_struct *prev, 2889static void perf_pmu_sched_task(struct task_struct *prev,
2855 struct task_struct *next, 2890 struct task_struct *next,
@@ -2857,34 +2892,24 @@ static void perf_pmu_sched_task(struct task_struct *prev,
2857{ 2892{
2858 struct perf_cpu_context *cpuctx; 2893 struct perf_cpu_context *cpuctx;
2859 struct pmu *pmu; 2894 struct pmu *pmu;
2860 unsigned long flags;
2861 2895
2862 if (prev == next) 2896 if (prev == next)
2863 return; 2897 return;
2864 2898
2865 local_irq_save(flags); 2899 list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
2866 2900 pmu = cpuctx->unique_pmu; /* software PMUs will not have sched_task */
2867 rcu_read_lock();
2868 2901
2869 list_for_each_entry_rcu(pmu, &pmus, entry) { 2902 if (WARN_ON_ONCE(!pmu->sched_task))
2870 if (pmu->sched_task) { 2903 continue;
2871 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2872
2873 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2874
2875 perf_pmu_disable(pmu);
2876 2904
2877 pmu->sched_task(cpuctx->task_ctx, sched_in); 2905 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2906 perf_pmu_disable(pmu);
2878 2907
2879 perf_pmu_enable(pmu); 2908 pmu->sched_task(cpuctx->task_ctx, sched_in);
2880 2909
2881 perf_ctx_unlock(cpuctx, cpuctx->task_ctx); 2910 perf_pmu_enable(pmu);
2882 } 2911 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2883 } 2912 }
2884
2885 rcu_read_unlock();
2886
2887 local_irq_restore(flags);
2888} 2913}
2889 2914
2890static void perf_event_switch(struct task_struct *task, 2915static void perf_event_switch(struct task_struct *task,
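Aside: instead of walking every registered PMU on each context switch, perf_pmu_sched_task() now walks a per-CPU sched_cb_list that only contains the cpu contexts whose PMUs asked for callbacks through perf_sched_cb_inc(). The following is a minimal userspace model of that register-on-first-use, unlink-on-last-use pattern; it is plain C with invented names, not kernel code, and the real list is per CPU and serialized by the scheduler.

#include <stdio.h>

/* Only contexts that asked for switch callbacks are linked, so the
 * hot path walks a usually tiny list instead of every PMU. */
struct ctx {
    const char *name;
    int cb_usage;          /* how many users want callbacks */
    struct ctx *next;      /* linked only while cb_usage > 0 */
};

static struct ctx *cb_list;    /* one list per "CPU" in the real code */

static void sched_cb_inc(struct ctx *c)
{
    if (!c->cb_usage++) {      /* first user: link into the list */
        c->next = cb_list;
        cb_list = c;
    }
}

static void sched_cb_dec(struct ctx *c)
{
    if (!--c->cb_usage) {      /* last user: unlink */
        struct ctx **p = &cb_list;
        while (*p != c)
            p = &(*p)->next;
        *p = c->next;
    }
}

static void sched_task(void)   /* the context-switch hook */
{
    for (struct ctx *c = cb_list; c; c = c->next)
        printf("sched_task callback for %s\n", c->name);
}

int main(void)
{
    struct ctx a = { .name = "pmu-A" }, b = { .name = "pmu-B" };

    sched_cb_inc(&a);
    sched_cb_inc(&b);
    sched_task();              /* visits B and A */
    sched_cb_dec(&b);
    sched_task();              /* visits only A */
    sched_cb_dec(&a);
    return 0;
}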
@@ -3416,6 +3441,22 @@ struct perf_read_data {
3416 int ret; 3441 int ret;
3417}; 3442};
3418 3443
3444static int find_cpu_to_read(struct perf_event *event, int local_cpu)
3445{
3446 int event_cpu = event->oncpu;
3447 u16 local_pkg, event_pkg;
3448
3449 if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
3450 event_pkg = topology_physical_package_id(event_cpu);
3451 local_pkg = topology_physical_package_id(local_cpu);
3452
3453 if (event_pkg == local_pkg)
3454 return local_cpu;
3455 }
3456
3457 return event_cpu;
3458}
3459
3419/* 3460/*
3420 * Cross CPU call to read the hardware event 3461 * Cross CPU call to read the hardware event
3421 */ 3462 */
@@ -3537,7 +3578,7 @@ u64 perf_event_read_local(struct perf_event *event)
3537 3578
3538static int perf_event_read(struct perf_event *event, bool group) 3579static int perf_event_read(struct perf_event *event, bool group)
3539{ 3580{
3540 int ret = 0; 3581 int ret = 0, cpu_to_read, local_cpu;
3541 3582
3542 /* 3583 /*
3543 * If event is enabled and currently active on a CPU, update the 3584 * If event is enabled and currently active on a CPU, update the
@@ -3549,6 +3590,11 @@ static int perf_event_read(struct perf_event *event, bool group)
3549 .group = group, 3590 .group = group,
3550 .ret = 0, 3591 .ret = 0,
3551 }; 3592 };
3593
3594 local_cpu = get_cpu();
3595 cpu_to_read = find_cpu_to_read(event, local_cpu);
3596 put_cpu();
3597
3552 /* 3598 /*
3553 * Purposely ignore the smp_call_function_single() return 3599 * Purposely ignore the smp_call_function_single() return
3554 * value. 3600 * value.
@@ -3559,7 +3605,7 @@ static int perf_event_read(struct perf_event *event, bool group)
3559 * Therefore, either way, we'll have an up-to-date event count 3605 * Therefore, either way, we'll have an up-to-date event count
3560 * after this. 3606 * after this.
3561 */ 3607 */
3562 (void)smp_call_function_single(event->oncpu, __perf_event_read, &data, 1); 3608 (void)smp_call_function_single(cpu_to_read, __perf_event_read, &data, 1);
3563 ret = data.ret; 3609 ret = data.ret;
3564 } else if (event->state == PERF_EVENT_STATE_INACTIVE) { 3610 } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
3565 struct perf_event_context *ctx = event->ctx; 3611 struct perf_event_context *ctx = event->ctx;
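Aside: find_cpu_to_read() lets an event whose group carries PERF_EV_CAP_READ_ACTIVE_PKG be read from any CPU in the same physical package, so perf_event_read() can satisfy the read locally instead of sending the cross-call to a CPU on another package. A small standalone model of that decision follows; the per-CPU package table is made up for illustration.

#include <stdio.h>

/* Hypothetical topology: package id per CPU (two packages of two CPUs). */
static const int cpu_pkg[] = { 0, 0, 1, 1 };

/* Prefer the local CPU if it shares a package with the CPU the event is
 * running on, otherwise target the event's CPU. */
static int find_cpu_to_read(int event_cpu, int local_cpu, int read_any_in_pkg)
{
    if (read_any_in_pkg && cpu_pkg[event_cpu] == cpu_pkg[local_cpu])
        return local_cpu;
    return event_cpu;
}

int main(void)
{
    /* Event active on CPU 1; caller runs on CPU 0 (same package). */
    printf("target: CPU %d\n", find_cpu_to_read(1, 0, 1));   /* -> 0 */
    /* Caller on CPU 3 (other package): must target the event's CPU. */
    printf("target: CPU %d\n", find_cpu_to_read(1, 3, 1));   /* -> 1 */
    return 0;
}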
@@ -3929,7 +3975,7 @@ static void exclusive_event_destroy(struct perf_event *event)
3929 3975
3930static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2) 3976static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
3931{ 3977{
3932 if ((e1->pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && 3978 if ((e1->pmu == e2->pmu) &&
3933 (e1->cpu == e2->cpu || 3979 (e1->cpu == e2->cpu ||
3934 e1->cpu == -1 || 3980 e1->cpu == -1 ||
3935 e2->cpu == -1)) 3981 e2->cpu == -1))
@@ -5350,9 +5396,10 @@ perf_output_sample_regs(struct perf_output_handle *handle,
5350 struct pt_regs *regs, u64 mask) 5396 struct pt_regs *regs, u64 mask)
5351{ 5397{
5352 int bit; 5398 int bit;
5399 DECLARE_BITMAP(_mask, 64);
5353 5400
5354 for_each_set_bit(bit, (const unsigned long *) &mask, 5401 bitmap_from_u64(_mask, mask);
5355 sizeof(mask) * BITS_PER_BYTE) { 5402 for_each_set_bit(bit, _mask, sizeof(mask) * BITS_PER_BYTE) {
5356 u64 val; 5403 u64 val;
5357 5404
5358 val = perf_reg_value(regs, bit); 5405 val = perf_reg_value(regs, bit);
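Aside: the replaced code aliased a u64 mask as an array of unsigned long, which misorders the two words on 32-bit big-endian; bitmap_from_u64() copies the value into a properly laid-out bitmap first. A portable userspace equivalent of walking the set bits of a 64-bit mask, for comparison:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t mask = (1ULL << 3) | (1ULL << 34);

    /* Test bits on the u64 directly instead of casting it to
     * unsigned long *, which breaks on 32-bit big-endian. */
    for (int bit = 0; bit < 64; bit++)
        if (mask & (1ULL << bit))
            printf("bit %d is set\n", bit);   /* prints 3 and 34 */
    return 0;
}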
@@ -7045,11 +7092,11 @@ static int __perf_event_overflow(struct perf_event *event,
7045 if (events && atomic_dec_and_test(&event->event_limit)) { 7092 if (events && atomic_dec_and_test(&event->event_limit)) {
7046 ret = 1; 7093 ret = 1;
7047 event->pending_kill = POLL_HUP; 7094 event->pending_kill = POLL_HUP;
7048 event->pending_disable = 1; 7095
7049 irq_work_queue(&event->pending); 7096 perf_event_disable_inatomic(event);
7050 } 7097 }
7051 7098
7052 event->overflow_handler(event, data, regs); 7099 READ_ONCE(event->overflow_handler)(event, data, regs);
7053 7100
7054 if (*perf_event_fasync(event) && event->pending_kill) { 7101 if (*perf_event_fasync(event) && event->pending_kill) {
7055 event->pending_wakeup = 1; 7102 event->pending_wakeup = 1;
@@ -7664,11 +7711,83 @@ static void perf_event_free_filter(struct perf_event *event)
7664 ftrace_profile_free_filter(event); 7711 ftrace_profile_free_filter(event);
7665} 7712}
7666 7713
7714#ifdef CONFIG_BPF_SYSCALL
7715static void bpf_overflow_handler(struct perf_event *event,
7716 struct perf_sample_data *data,
7717 struct pt_regs *regs)
7718{
7719 struct bpf_perf_event_data_kern ctx = {
7720 .data = data,
7721 .regs = regs,
7722 };
7723 int ret = 0;
7724
7725 preempt_disable();
7726 if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
7727 goto out;
7728 rcu_read_lock();
7729 ret = BPF_PROG_RUN(event->prog, (void *)&ctx);
7730 rcu_read_unlock();
7731out:
7732 __this_cpu_dec(bpf_prog_active);
7733 preempt_enable();
7734 if (!ret)
7735 return;
7736
7737 event->orig_overflow_handler(event, data, regs);
7738}
7739
7740static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
7741{
7742 struct bpf_prog *prog;
7743
7744 if (event->overflow_handler_context)
7745 /* hw breakpoint or kernel counter */
7746 return -EINVAL;
7747
7748 if (event->prog)
7749 return -EEXIST;
7750
7751 prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT);
7752 if (IS_ERR(prog))
7753 return PTR_ERR(prog);
7754
7755 event->prog = prog;
7756 event->orig_overflow_handler = READ_ONCE(event->overflow_handler);
7757 WRITE_ONCE(event->overflow_handler, bpf_overflow_handler);
7758 return 0;
7759}
7760
7761static void perf_event_free_bpf_handler(struct perf_event *event)
7762{
7763 struct bpf_prog *prog = event->prog;
7764
7765 if (!prog)
7766 return;
7767
7768 WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler);
7769 event->prog = NULL;
7770 bpf_prog_put(prog);
7771}
7772#else
7773static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
7774{
7775 return -EOPNOTSUPP;
7776}
7777static void perf_event_free_bpf_handler(struct perf_event *event)
7778{
7779}
7780#endif
7781
7667static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) 7782static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
7668{ 7783{
7669 bool is_kprobe, is_tracepoint; 7784 bool is_kprobe, is_tracepoint;
7670 struct bpf_prog *prog; 7785 struct bpf_prog *prog;
7671 7786
7787 if (event->attr.type == PERF_TYPE_HARDWARE ||
7788 event->attr.type == PERF_TYPE_SOFTWARE)
7789 return perf_event_set_bpf_handler(event, prog_fd);
7790
7672 if (event->attr.type != PERF_TYPE_TRACEPOINT) 7791 if (event->attr.type != PERF_TYPE_TRACEPOINT)
7673 return -EINVAL; 7792 return -EINVAL;
7674 7793
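Aside: with this change PERF_EVENT_IOC_SET_BPF is accepted on hardware and software events, not only tracepoints, and the attached program runs as the overflow handler; when it returns 0 the original handler is skipped. A rough userspace sketch of attaching one is shown below. It assumes the program fd comes from a separate bpf(BPF_PROG_LOAD) step with type BPF_PROG_TYPE_PERF_EVENT, which is omitted here, so the ioctl in this form will simply report the error.

#include <linux/perf_event.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>

int main(void)
{
    struct perf_event_attr attr;
    int fd, bpf_prog_fd = -1;   /* assumed: loaded elsewhere as
                                   BPF_PROG_TYPE_PERF_EVENT */

    memset(&attr, 0, sizeof(attr));
    attr.size = sizeof(attr);
    attr.type = PERF_TYPE_HARDWARE;         /* now allowed for SET_BPF */
    attr.config = PERF_COUNT_HW_CPU_CYCLES;
    attr.sample_period = 100000;            /* overflows drive the program */

    fd = syscall(__NR_perf_event_open, &attr, 0 /* this process */,
                 -1 /* any cpu */, -1 /* no group */, 0);
    if (fd < 0) {
        perror("perf_event_open");
        return 1;
    }

    /* Route each overflow through the BPF program; if it returns 0,
     * the default overflow handler is not called. */
    if (ioctl(fd, PERF_EVENT_IOC_SET_BPF, bpf_prog_fd) < 0)
        perror("PERF_EVENT_IOC_SET_BPF");

    close(fd);
    return 0;
}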
@@ -7709,6 +7828,8 @@ static void perf_event_free_bpf_prog(struct perf_event *event)
7709{ 7828{
7710 struct bpf_prog *prog; 7829 struct bpf_prog *prog;
7711 7830
7831 perf_event_free_bpf_handler(event);
7832
7712 if (!event->tp_event) 7833 if (!event->tp_event)
7713 return; 7834 return;
7714 7835
@@ -7908,6 +8029,7 @@ restart:
7908 * if <size> is not specified, the range is treated as a single address. 8029 * if <size> is not specified, the range is treated as a single address.
7909 */ 8030 */
7910enum { 8031enum {
8032 IF_ACT_NONE = -1,
7911 IF_ACT_FILTER, 8033 IF_ACT_FILTER,
7912 IF_ACT_START, 8034 IF_ACT_START,
7913 IF_ACT_STOP, 8035 IF_ACT_STOP,
@@ -7931,6 +8053,7 @@ static const match_table_t if_tokens = {
7931 { IF_SRC_KERNEL, "%u/%u" }, 8053 { IF_SRC_KERNEL, "%u/%u" },
7932 { IF_SRC_FILEADDR, "%u@%s" }, 8054 { IF_SRC_FILEADDR, "%u@%s" },
7933 { IF_SRC_KERNELADDR, "%u" }, 8055 { IF_SRC_KERNELADDR, "%u" },
8056 { IF_ACT_NONE, NULL },
7934}; 8057};
7935 8058
7936/* 8059/*
@@ -8751,7 +8874,10 @@ EXPORT_SYMBOL_GPL(perf_pmu_register);
8751 8874
8752void perf_pmu_unregister(struct pmu *pmu) 8875void perf_pmu_unregister(struct pmu *pmu)
8753{ 8876{
8877 int remove_device;
8878
8754 mutex_lock(&pmus_lock); 8879 mutex_lock(&pmus_lock);
8880 remove_device = pmu_bus_running;
8755 list_del_rcu(&pmu->entry); 8881 list_del_rcu(&pmu->entry);
8756 mutex_unlock(&pmus_lock); 8882 mutex_unlock(&pmus_lock);
8757 8883
@@ -8765,10 +8891,12 @@ void perf_pmu_unregister(struct pmu *pmu)
8765 free_percpu(pmu->pmu_disable_count); 8891 free_percpu(pmu->pmu_disable_count);
8766 if (pmu->type >= PERF_TYPE_MAX) 8892 if (pmu->type >= PERF_TYPE_MAX)
8767 idr_remove(&pmu_idr, pmu->type); 8893 idr_remove(&pmu_idr, pmu->type);
8768 if (pmu->nr_addr_filters) 8894 if (remove_device) {
8769 device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); 8895 if (pmu->nr_addr_filters)
8770 device_del(pmu->dev); 8896 device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
8771 put_device(pmu->dev); 8897 device_del(pmu->dev);
8898 put_device(pmu->dev);
8899 }
8772 free_pmu_context(pmu); 8900 free_pmu_context(pmu);
8773} 8901}
8774EXPORT_SYMBOL_GPL(perf_pmu_unregister); 8902EXPORT_SYMBOL_GPL(perf_pmu_unregister);
@@ -9025,6 +9153,19 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
9025 if (!overflow_handler && parent_event) { 9153 if (!overflow_handler && parent_event) {
9026 overflow_handler = parent_event->overflow_handler; 9154 overflow_handler = parent_event->overflow_handler;
9027 context = parent_event->overflow_handler_context; 9155 context = parent_event->overflow_handler_context;
9156#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING)
9157 if (overflow_handler == bpf_overflow_handler) {
9158 struct bpf_prog *prog = bpf_prog_inc(parent_event->prog);
9159
9160 if (IS_ERR(prog)) {
9161 err = PTR_ERR(prog);
9162 goto err_ns;
9163 }
9164 event->prog = prog;
9165 event->orig_overflow_handler =
9166 parent_event->orig_overflow_handler;
9167 }
9168#endif
9028 } 9169 }
9029 9170
9030 if (overflow_handler) { 9171 if (overflow_handler) {
@@ -9505,6 +9646,9 @@ SYSCALL_DEFINE5(perf_event_open,
9505 goto err_alloc; 9646 goto err_alloc;
9506 } 9647 }
9507 9648
9649 if (pmu->task_ctx_nr == perf_sw_context)
9650 event->event_caps |= PERF_EV_CAP_SOFTWARE;
9651
9508 if (group_leader && 9652 if (group_leader &&
9509 (is_software_event(event) != is_software_event(group_leader))) { 9653 (is_software_event(event) != is_software_event(group_leader))) {
9510 if (is_software_event(event)) { 9654 if (is_software_event(event)) {
@@ -9518,7 +9662,7 @@ SYSCALL_DEFINE5(perf_event_open,
9518 */ 9662 */
9519 pmu = group_leader->pmu; 9663 pmu = group_leader->pmu;
9520 } else if (is_software_event(group_leader) && 9664 } else if (is_software_event(group_leader) &&
9521 (group_leader->group_flags & PERF_GROUP_SOFTWARE)) { 9665 (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
9522 /* 9666 /*
9523 * In case the group is a pure software group, and we 9667 * In case the group is a pure software group, and we
9524 * try to add a hardware event, move the whole group to 9668 * try to add a hardware event, move the whole group to
@@ -10453,6 +10597,8 @@ static void __init perf_event_init_all_cpus(void)
10453 10597
10454 INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu)); 10598 INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
10455 raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu)); 10599 raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
10600
10601 INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
10456 } 10602 }
10457} 10603}
10458 10604
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 8c50276b60d1..f9ec9add2164 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -150,7 +150,7 @@ static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr)
150 * Returns 0 on success, -EFAULT on failure. 150 * Returns 0 on success, -EFAULT on failure.
151 */ 151 */
152static int __replace_page(struct vm_area_struct *vma, unsigned long addr, 152static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
153 struct page *page, struct page *kpage) 153 struct page *old_page, struct page *new_page)
154{ 154{
155 struct mm_struct *mm = vma->vm_mm; 155 struct mm_struct *mm = vma->vm_mm;
156 spinlock_t *ptl; 156 spinlock_t *ptl;
@@ -161,49 +161,49 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
161 const unsigned long mmun_end = addr + PAGE_SIZE; 161 const unsigned long mmun_end = addr + PAGE_SIZE;
162 struct mem_cgroup *memcg; 162 struct mem_cgroup *memcg;
163 163
164 err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg, 164 err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, &memcg,
165 false); 165 false);
166 if (err) 166 if (err)
167 return err; 167 return err;
168 168
169 /* For try_to_free_swap() and munlock_vma_page() below */ 169 /* For try_to_free_swap() and munlock_vma_page() below */
170 lock_page(page); 170 lock_page(old_page);
171 171
172 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 172 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
173 err = -EAGAIN; 173 err = -EAGAIN;
174 ptep = page_check_address(page, mm, addr, &ptl, 0); 174 ptep = page_check_address(old_page, mm, addr, &ptl, 0);
175 if (!ptep) { 175 if (!ptep) {
176 mem_cgroup_cancel_charge(kpage, memcg, false); 176 mem_cgroup_cancel_charge(new_page, memcg, false);
177 goto unlock; 177 goto unlock;
178 } 178 }
179 179
180 get_page(kpage); 180 get_page(new_page);
181 page_add_new_anon_rmap(kpage, vma, addr, false); 181 page_add_new_anon_rmap(new_page, vma, addr, false);
182 mem_cgroup_commit_charge(kpage, memcg, false, false); 182 mem_cgroup_commit_charge(new_page, memcg, false, false);
183 lru_cache_add_active_or_unevictable(kpage, vma); 183 lru_cache_add_active_or_unevictable(new_page, vma);
184 184
185 if (!PageAnon(page)) { 185 if (!PageAnon(old_page)) {
186 dec_mm_counter(mm, mm_counter_file(page)); 186 dec_mm_counter(mm, mm_counter_file(old_page));
187 inc_mm_counter(mm, MM_ANONPAGES); 187 inc_mm_counter(mm, MM_ANONPAGES);
188 } 188 }
189 189
190 flush_cache_page(vma, addr, pte_pfn(*ptep)); 190 flush_cache_page(vma, addr, pte_pfn(*ptep));
191 ptep_clear_flush_notify(vma, addr, ptep); 191 ptep_clear_flush_notify(vma, addr, ptep);
192 set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); 192 set_pte_at_notify(mm, addr, ptep, mk_pte(new_page, vma->vm_page_prot));
193 193
194 page_remove_rmap(page, false); 194 page_remove_rmap(old_page, false);
195 if (!page_mapped(page)) 195 if (!page_mapped(old_page))
196 try_to_free_swap(page); 196 try_to_free_swap(old_page);
197 pte_unmap_unlock(ptep, ptl); 197 pte_unmap_unlock(ptep, ptl);
198 198
199 if (vma->vm_flags & VM_LOCKED) 199 if (vma->vm_flags & VM_LOCKED)
200 munlock_vma_page(page); 200 munlock_vma_page(old_page);
201 put_page(page); 201 put_page(old_page);
202 202
203 err = 0; 203 err = 0;
204 unlock: 204 unlock:
205 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 205 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
206 unlock_page(page); 206 unlock_page(old_page);
207 return err; 207 return err;
208} 208}
209 209
@@ -300,7 +300,8 @@ int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr,
300 300
301retry: 301retry:
302 /* Read the page with vaddr into memory */ 302 /* Read the page with vaddr into memory */
303 ret = get_user_pages_remote(NULL, mm, vaddr, 1, 0, 1, &old_page, &vma); 303 ret = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &old_page,
304 &vma);
304 if (ret <= 0) 305 if (ret <= 0)
305 return ret; 306 return ret;
306 307
@@ -1710,7 +1711,8 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
1710 * but we treat this as a 'remote' access since it is 1711 * but we treat this as a 'remote' access since it is
1711 * essentially a kernel access to the memory. 1712 * essentially a kernel access to the memory.
1712 */ 1713 */
1713 result = get_user_pages_remote(NULL, mm, vaddr, 1, 0, 1, &page, NULL); 1714 result = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &page,
1715 NULL);
1714 if (result < 0) 1716 if (result < 0)
1715 return result; 1717 return result;
1716 1718
diff --git a/kernel/exit.c b/kernel/exit.c
index 091a78be3b09..3076f3089919 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -511,7 +511,7 @@ static void exit_mm(struct task_struct *tsk)
511 mm_update_next_owner(mm); 511 mm_update_next_owner(mm);
512 mmput(mm); 512 mmput(mm);
513 if (test_thread_flag(TIF_MEMDIE)) 513 if (test_thread_flag(TIF_MEMDIE))
514 exit_oom_victim(tsk); 514 exit_oom_victim();
515} 515}
516 516
517static struct task_struct *find_alive_thread(struct task_struct *p) 517static struct task_struct *find_alive_thread(struct task_struct *p)
@@ -725,7 +725,7 @@ static void check_stack_usage(void)
725static inline void check_stack_usage(void) {} 725static inline void check_stack_usage(void) {}
726#endif 726#endif
727 727
728void do_exit(long code) 728void __noreturn do_exit(long code)
729{ 729{
730 struct task_struct *tsk = current; 730 struct task_struct *tsk = current;
731 int group_dead; 731 int group_dead;
@@ -836,6 +836,7 @@ void do_exit(long code)
836 */ 836 */
837 perf_event_exit_task(tsk); 837 perf_event_exit_task(tsk);
838 838
839 sched_autogroup_exit_task(tsk);
839 cgroup_exit(tsk); 840 cgroup_exit(tsk);
840 841
841 /* 842 /*
@@ -882,29 +883,7 @@ void do_exit(long code)
882 exit_rcu(); 883 exit_rcu();
883 TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i)); 884 TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i));
884 885
885 /* 886 do_task_dead();
886 * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
887 * when the following two conditions become true.
888 * - There is race condition of mmap_sem (It is acquired by
889 * exit_mm()), and
890 * - SMI occurs before setting TASK_RUNINNG.
891 * (or hypervisor of virtual machine switches to other guest)
892 * As a result, we may become TASK_RUNNING after becoming TASK_DEAD
893 *
894 * To avoid it, we have to wait for releasing tsk->pi_lock which
895 * is held by try_to_wake_up()
896 */
897 smp_mb();
898 raw_spin_unlock_wait(&tsk->pi_lock);
899
900 /* causes final put_task_struct in finish_task_switch(). */
901 tsk->state = TASK_DEAD;
902 tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */
903 schedule();
904 BUG();
905 /* Avoid "noreturn function does return". */
906 for (;;)
907 cpu_relax(); /* For when BUG is null */
908} 887}
909EXPORT_SYMBOL_GPL(do_exit); 888EXPORT_SYMBOL_GPL(do_exit);
910 889
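Aside: the open-coded tail of do_exit() moves into a scheduler-side helper, do_task_dead(), so the TASK_DEAD transition and the pi_lock wait live next to the code that depends on them. Roughly, the helper encapsulates the sequence the deleted lines show; this is a paraphrase based on those lines, not the exact upstream body.

/* Sketch of what the removed block boils down to, now behind
 * do_task_dead() in the scheduler core (paraphrased). */
void __noreturn do_task_dead(void)
{
    /* Wait out any in-flight try_to_wake_up() holding ->pi_lock,
     * otherwise its delayed TASK_RUNNING store could undo TASK_DEAD. */
    smp_mb();
    raw_spin_unlock_wait(&current->pi_lock);

    /* Causes the final put_task_struct() in finish_task_switch(). */
    __set_current_state(TASK_DEAD);
    current->flags |= PF_NOFREEZE;  /* tell the freezer to ignore us */

    __schedule(false);              /* never returns */
    BUG();
    for (;;)
        cpu_relax();                /* for when BUG() is a no-op */
}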
diff --git a/kernel/fork.c b/kernel/fork.c
index beb31725f7e2..997ac1d584f7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -158,19 +158,83 @@ void __weak arch_release_thread_stack(unsigned long *stack)
158 * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a 158 * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
159 * kmemcache based allocator. 159 * kmemcache based allocator.
160 */ 160 */
161# if THREAD_SIZE >= PAGE_SIZE 161# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)
162static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, 162
163 int node) 163#ifdef CONFIG_VMAP_STACK
164/*
165 * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB
166 * flush. Try to minimize the number of calls by caching stacks.
167 */
168#define NR_CACHED_STACKS 2
169static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]);
170#endif
171
172static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
164{ 173{
174#ifdef CONFIG_VMAP_STACK
175 void *stack;
176 int i;
177
178 local_irq_disable();
179 for (i = 0; i < NR_CACHED_STACKS; i++) {
180 struct vm_struct *s = this_cpu_read(cached_stacks[i]);
181
182 if (!s)
183 continue;
184 this_cpu_write(cached_stacks[i], NULL);
185
186 tsk->stack_vm_area = s;
187 local_irq_enable();
188 return s->addr;
189 }
190 local_irq_enable();
191
192 stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE,
193 VMALLOC_START, VMALLOC_END,
194 THREADINFO_GFP | __GFP_HIGHMEM,
195 PAGE_KERNEL,
196 0, node, __builtin_return_address(0));
197
198 /*
199 * We can't call find_vm_area() in interrupt context, and
200 * free_thread_stack() can be called in interrupt context,
201 * so cache the vm_struct.
202 */
203 if (stack)
204 tsk->stack_vm_area = find_vm_area(stack);
205 return stack;
206#else
165 struct page *page = alloc_pages_node(node, THREADINFO_GFP, 207 struct page *page = alloc_pages_node(node, THREADINFO_GFP,
166 THREAD_SIZE_ORDER); 208 THREAD_SIZE_ORDER);
167 209
168 return page ? page_address(page) : NULL; 210 return page ? page_address(page) : NULL;
211#endif
169} 212}
170 213
171static inline void free_thread_stack(unsigned long *stack) 214static inline void free_thread_stack(struct task_struct *tsk)
172{ 215{
173 __free_pages(virt_to_page(stack), THREAD_SIZE_ORDER); 216#ifdef CONFIG_VMAP_STACK
217 if (task_stack_vm_area(tsk)) {
218 unsigned long flags;
219 int i;
220
221 local_irq_save(flags);
222 for (i = 0; i < NR_CACHED_STACKS; i++) {
223 if (this_cpu_read(cached_stacks[i]))
224 continue;
225
226 this_cpu_write(cached_stacks[i], tsk->stack_vm_area);
227 local_irq_restore(flags);
228 return;
229 }
230 local_irq_restore(flags);
231
232 vfree(tsk->stack);
233 return;
234 }
235#endif
236
237 __free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER);
174} 238}
175# else 239# else
176static struct kmem_cache *thread_stack_cache; 240static struct kmem_cache *thread_stack_cache;
@@ -181,9 +245,9 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
181 return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node); 245 return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
182} 246}
183 247
184static void free_thread_stack(unsigned long *stack) 248static void free_thread_stack(struct task_struct *tsk)
185{ 249{
186 kmem_cache_free(thread_stack_cache, stack); 250 kmem_cache_free(thread_stack_cache, tsk->stack);
187} 251}
188 252
189void thread_stack_cache_init(void) 253void thread_stack_cache_init(void)
@@ -213,24 +277,79 @@ struct kmem_cache *vm_area_cachep;
213/* SLAB cache for mm_struct structures (tsk->mm) */ 277/* SLAB cache for mm_struct structures (tsk->mm) */
214static struct kmem_cache *mm_cachep; 278static struct kmem_cache *mm_cachep;
215 279
216static void account_kernel_stack(unsigned long *stack, int account) 280static void account_kernel_stack(struct task_struct *tsk, int account)
281{
282 void *stack = task_stack_page(tsk);
283 struct vm_struct *vm = task_stack_vm_area(tsk);
284
285 BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
286
287 if (vm) {
288 int i;
289
290 BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
291
292 for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
293 mod_zone_page_state(page_zone(vm->pages[i]),
294 NR_KERNEL_STACK_KB,
295 PAGE_SIZE / 1024 * account);
296 }
297
298 /* All stack pages belong to the same memcg. */
299 memcg_kmem_update_page_stat(vm->pages[0], MEMCG_KERNEL_STACK_KB,
300 account * (THREAD_SIZE / 1024));
301 } else {
302 /*
303 * All stack pages are in the same zone and belong to the
304 * same memcg.
305 */
306 struct page *first_page = virt_to_page(stack);
307
308 mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
309 THREAD_SIZE / 1024 * account);
310
311 memcg_kmem_update_page_stat(first_page, MEMCG_KERNEL_STACK_KB,
312 account * (THREAD_SIZE / 1024));
313 }
314}
315
316static void release_task_stack(struct task_struct *tsk)
217{ 317{
218 /* All stack pages are in the same zone and belong to the same memcg. */ 318 if (WARN_ON(tsk->state != TASK_DEAD))
219 struct page *first_page = virt_to_page(stack); 319 return; /* Better to leak the stack than to free prematurely */
220 320
221 mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB, 321 account_kernel_stack(tsk, -1);
222 THREAD_SIZE / 1024 * account); 322 arch_release_thread_stack(tsk->stack);
323 free_thread_stack(tsk);
324 tsk->stack = NULL;
325#ifdef CONFIG_VMAP_STACK
326 tsk->stack_vm_area = NULL;
327#endif
328}
223 329
224 memcg_kmem_update_page_stat( 330#ifdef CONFIG_THREAD_INFO_IN_TASK
225 first_page, MEMCG_KERNEL_STACK_KB, 331void put_task_stack(struct task_struct *tsk)
226 account * (THREAD_SIZE / 1024)); 332{
333 if (atomic_dec_and_test(&tsk->stack_refcount))
334 release_task_stack(tsk);
227} 335}
336#endif
228 337
229void free_task(struct task_struct *tsk) 338void free_task(struct task_struct *tsk)
230{ 339{
231 account_kernel_stack(tsk->stack, -1); 340#ifndef CONFIG_THREAD_INFO_IN_TASK
232 arch_release_thread_stack(tsk->stack); 341 /*
233 free_thread_stack(tsk->stack); 342 * The task is finally done with both the stack and thread_info,
343 * so free both.
344 */
345 release_task_stack(tsk);
346#else
347 /*
348 * If the task had a separate stack allocation, it should be gone
349 * by now.
350 */
351 WARN_ON_ONCE(atomic_read(&tsk->stack_refcount) != 0);
352#endif
234 rt_mutex_debug_task_free(tsk); 353 rt_mutex_debug_task_free(tsk);
235 ftrace_graph_exit_task(tsk); 354 ftrace_graph_exit_task(tsk);
236 put_seccomp_filter(tsk); 355 put_seccomp_filter(tsk);
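Aside: with CONFIG_THREAD_INFO_IN_TASK the stack gets its own reference count, so code that may still inspect a dead task's stack (a dumper, for example) can pin it past task teardown; the last put_task_stack() performs the actual release. A compact userspace model of that pattern using C11 atomics, with illustrative names:

#include <stdatomic.h>
#include <stdlib.h>
#include <stdio.h>

struct task {
    void *stack;
    atomic_int stack_refcount;
};

static void release_task_stack(struct task *t)
{
    free(t->stack);
    t->stack = NULL;
    puts("stack released");
}

/* Drop one reference; whoever drops the last one frees the stack. */
static void put_task_stack(struct task *t)
{
    if (atomic_fetch_sub(&t->stack_refcount, 1) == 1)
        release_task_stack(t);
}

int main(void)
{
    struct task t = { .stack = malloc(4096) };

    atomic_init(&t.stack_refcount, 1);      /* the task's own reference */
    atomic_fetch_add(&t.stack_refcount, 1); /* e.g. a stack dumper pins it */

    put_task_stack(&t);     /* task exits: stack survives for the dumper */
    put_task_stack(&t);     /* dumper done: now it is freed */
    return 0;
}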
@@ -243,6 +362,12 @@ static inline void free_signal_struct(struct signal_struct *sig)
243{ 362{
244 taskstats_tgid_free(sig); 363 taskstats_tgid_free(sig);
245 sched_autogroup_exit(sig); 364 sched_autogroup_exit(sig);
365 /*
366 * __mmdrop is not safe to call from softirq context on x86 due to
367 * pgd_dtor so postpone it to the async context
368 */
369 if (sig->oom_mm)
370 mmdrop_async(sig->oom_mm);
246 kmem_cache_free(signal_cachep, sig); 371 kmem_cache_free(signal_cachep, sig);
247} 372}
248 373
@@ -302,6 +427,7 @@ int arch_task_struct_size __read_mostly;
302 427
303void __init fork_init(void) 428void __init fork_init(void)
304{ 429{
430 int i;
305#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR 431#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
306#ifndef ARCH_MIN_TASKALIGN 432#ifndef ARCH_MIN_TASKALIGN
307#define ARCH_MIN_TASKALIGN L1_CACHE_BYTES 433#define ARCH_MIN_TASKALIGN L1_CACHE_BYTES
@@ -321,6 +447,10 @@ void __init fork_init(void)
321 init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2; 447 init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
322 init_task.signal->rlim[RLIMIT_SIGPENDING] = 448 init_task.signal->rlim[RLIMIT_SIGPENDING] =
323 init_task.signal->rlim[RLIMIT_NPROC]; 449 init_task.signal->rlim[RLIMIT_NPROC];
450
451 for (i = 0; i < UCOUNT_COUNTS; i++) {
452 init_user_ns.ucount_max[i] = max_threads/2;
453 }
324} 454}
325 455
326int __weak arch_dup_task_struct(struct task_struct *dst, 456int __weak arch_dup_task_struct(struct task_struct *dst,
@@ -342,6 +472,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
342{ 472{
343 struct task_struct *tsk; 473 struct task_struct *tsk;
344 unsigned long *stack; 474 unsigned long *stack;
475 struct vm_struct *stack_vm_area;
345 int err; 476 int err;
346 477
347 if (node == NUMA_NO_NODE) 478 if (node == NUMA_NO_NODE)
@@ -354,11 +485,26 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
354 if (!stack) 485 if (!stack)
355 goto free_tsk; 486 goto free_tsk;
356 487
488 stack_vm_area = task_stack_vm_area(tsk);
489
357 err = arch_dup_task_struct(tsk, orig); 490 err = arch_dup_task_struct(tsk, orig);
491
492 /*
493 * arch_dup_task_struct() clobbers the stack-related fields. Make
494 * sure they're properly initialized before using any stack-related
495 * functions again.
496 */
497 tsk->stack = stack;
498#ifdef CONFIG_VMAP_STACK
499 tsk->stack_vm_area = stack_vm_area;
500#endif
501#ifdef CONFIG_THREAD_INFO_IN_TASK
502 atomic_set(&tsk->stack_refcount, 1);
503#endif
504
358 if (err) 505 if (err)
359 goto free_stack; 506 goto free_stack;
360 507
361 tsk->stack = stack;
362#ifdef CONFIG_SECCOMP 508#ifdef CONFIG_SECCOMP
363 /* 509 /*
364 * We must handle setting up seccomp filters once we're under 510 * We must handle setting up seccomp filters once we're under
@@ -390,21 +536,22 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
390 tsk->task_frag.page = NULL; 536 tsk->task_frag.page = NULL;
391 tsk->wake_q.next = NULL; 537 tsk->wake_q.next = NULL;
392 538
393 account_kernel_stack(stack, 1); 539 account_kernel_stack(tsk, 1);
394 540
395 kcov_task_init(tsk); 541 kcov_task_init(tsk);
396 542
397 return tsk; 543 return tsk;
398 544
399free_stack: 545free_stack:
400 free_thread_stack(stack); 546 free_thread_stack(tsk);
401free_tsk: 547free_tsk:
402 free_task_struct(tsk); 548 free_task_struct(tsk);
403 return NULL; 549 return NULL;
404} 550}
405 551
406#ifdef CONFIG_MMU 552#ifdef CONFIG_MMU
407static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) 553static __latent_entropy int dup_mmap(struct mm_struct *mm,
554 struct mm_struct *oldmm)
408{ 555{
409 struct vm_area_struct *mpnt, *tmp, *prev, **pprev; 556 struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
410 struct rb_node **rb_link, *rb_parent; 557 struct rb_node **rb_link, *rb_parent;
@@ -711,6 +858,7 @@ static inline void __mmput(struct mm_struct *mm)
711 ksm_exit(mm); 858 ksm_exit(mm);
712 khugepaged_exit(mm); /* must run before exit_mmap */ 859 khugepaged_exit(mm); /* must run before exit_mmap */
713 exit_mmap(mm); 860 exit_mmap(mm);
861 mm_put_huge_zero_page(mm);
714 set_mm_exe_file(mm, NULL); 862 set_mm_exe_file(mm, NULL);
715 if (!list_empty(&mm->mmlist)) { 863 if (!list_empty(&mm->mmlist)) {
716 spin_lock(&mmlist_lock); 864 spin_lock(&mmlist_lock);
@@ -719,6 +867,7 @@ static inline void __mmput(struct mm_struct *mm)
719 } 867 }
720 if (mm->binfmt) 868 if (mm->binfmt)
721 module_put(mm->binfmt->module); 869 module_put(mm->binfmt->module);
870 set_bit(MMF_OOM_SKIP, &mm->flags);
722 mmdrop(mm); 871 mmdrop(mm);
723} 872}
724 873
@@ -1296,7 +1445,8 @@ init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid)
1296 * parts of the process environment (as per the clone 1445 * parts of the process environment (as per the clone
1297 * flags). The actual kick-off is left to the caller. 1446 * flags). The actual kick-off is left to the caller.
1298 */ 1447 */
1299static struct task_struct *copy_process(unsigned long clone_flags, 1448static __latent_entropy struct task_struct *copy_process(
1449 unsigned long clone_flags,
1300 unsigned long stack_start, 1450 unsigned long stack_start,
1301 unsigned long stack_size, 1451 unsigned long stack_size,
1302 int __user *child_tidptr, 1452 int __user *child_tidptr,
@@ -1715,6 +1865,8 @@ bad_fork_cleanup_count:
1715 atomic_dec(&p->cred->user->processes); 1865 atomic_dec(&p->cred->user->processes);
1716 exit_creds(p); 1866 exit_creds(p);
1717bad_fork_free: 1867bad_fork_free:
1868 p->state = TASK_DEAD;
1869 put_task_stack(p);
1718 free_task(p); 1870 free_task(p);
1719fork_out: 1871fork_out:
1720 return ERR_PTR(retval); 1872 return ERR_PTR(retval);
@@ -1780,6 +1932,7 @@ long _do_fork(unsigned long clone_flags,
1780 1932
1781 p = copy_process(clone_flags, stack_start, stack_size, 1933 p = copy_process(clone_flags, stack_start, stack_size,
1782 child_tidptr, NULL, trace, tls, NUMA_NO_NODE); 1934 child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
1935 add_latent_entropy();
1783 /* 1936 /*
1784 * Do this prior waking up the new thread - the thread pointer 1937 * Do this prior waking up the new thread - the thread pointer
1785 * might get invalid after that point, if the thread exits quickly. 1938 * might get invalid after that point, if the thread exits quickly.
diff --git a/kernel/futex.c b/kernel/futex.c
index 46cb3a301bc1..2c4be467fecd 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -381,8 +381,12 @@ static inline int hb_waiters_pending(struct futex_hash_bucket *hb)
381#endif 381#endif
382} 382}
383 383
384/* 384/**
385 * We hash on the keys returned from get_futex_key (see below). 385 * hash_futex - Return the hash bucket in the global hash
386 * @key: Pointer to the futex key for which the hash is calculated
387 *
388 * We hash on the keys returned from get_futex_key (see below) and return the
389 * corresponding hash bucket in the global hash.
386 */ 390 */
387static struct futex_hash_bucket *hash_futex(union futex_key *key) 391static struct futex_hash_bucket *hash_futex(union futex_key *key)
388{ 392{
@@ -392,7 +396,12 @@ static struct futex_hash_bucket *hash_futex(union futex_key *key)
392 return &futex_queues[hash & (futex_hashsize - 1)]; 396 return &futex_queues[hash & (futex_hashsize - 1)];
393} 397}
394 398
395/* 399
400/**
401 * match_futex - Check whether two futex keys are equal
402 * @key1: Pointer to key1
403 * @key2: Pointer to key2
404 *
396 * Return 1 if two futex_keys are equal, 0 otherwise. 405 * Return 1 if two futex_keys are equal, 0 otherwise.
397 */ 406 */
398static inline int match_futex(union futex_key *key1, union futex_key *key2) 407static inline int match_futex(union futex_key *key1, union futex_key *key2)
diff --git a/kernel/groups.c b/kernel/groups.c
index 74d431d25251..2fcadd66a8fd 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -7,55 +7,31 @@
7#include <linux/security.h> 7#include <linux/security.h>
8#include <linux/syscalls.h> 8#include <linux/syscalls.h>
9#include <linux/user_namespace.h> 9#include <linux/user_namespace.h>
10#include <linux/vmalloc.h>
10#include <asm/uaccess.h> 11#include <asm/uaccess.h>
11 12
12struct group_info *groups_alloc(int gidsetsize) 13struct group_info *groups_alloc(int gidsetsize)
13{ 14{
14 struct group_info *group_info; 15 struct group_info *gi;
15 int nblocks; 16 unsigned int len;
16 int i; 17
17 18 len = sizeof(struct group_info) + sizeof(kgid_t) * gidsetsize;
18 nblocks = (gidsetsize + NGROUPS_PER_BLOCK - 1) / NGROUPS_PER_BLOCK; 19 gi = kmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_NOWARN|__GFP_NORETRY);
19 /* Make sure we always allocate at least one indirect block pointer */ 20 if (!gi)
20 nblocks = nblocks ? : 1; 21 gi = __vmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_HIGHMEM, PAGE_KERNEL);
21 group_info = kmalloc(sizeof(*group_info) + nblocks*sizeof(gid_t *), GFP_USER); 22 if (!gi)
22 if (!group_info)
23 return NULL; 23 return NULL;
24 group_info->ngroups = gidsetsize;
25 group_info->nblocks = nblocks;
26 atomic_set(&group_info->usage, 1);
27
28 if (gidsetsize <= NGROUPS_SMALL)
29 group_info->blocks[0] = group_info->small_block;
30 else {
31 for (i = 0; i < nblocks; i++) {
32 kgid_t *b;
33 b = (void *)__get_free_page(GFP_USER);
34 if (!b)
35 goto out_undo_partial_alloc;
36 group_info->blocks[i] = b;
37 }
38 }
39 return group_info;
40 24
41out_undo_partial_alloc: 25 atomic_set(&gi->usage, 1);
42 while (--i >= 0) { 26 gi->ngroups = gidsetsize;
43 free_page((unsigned long)group_info->blocks[i]); 27 return gi;
44 }
45 kfree(group_info);
46 return NULL;
47} 28}
48 29
49EXPORT_SYMBOL(groups_alloc); 30EXPORT_SYMBOL(groups_alloc);
50 31
51void groups_free(struct group_info *group_info) 32void groups_free(struct group_info *group_info)
52{ 33{
53 if (group_info->blocks[0] != group_info->small_block) { 34 kvfree(group_info);
54 int i;
55 for (i = 0; i < group_info->nblocks; i++)
56 free_page((unsigned long)group_info->blocks[i]);
57 }
58 kfree(group_info);
59} 35}
60 36
61EXPORT_SYMBOL(groups_free); 37EXPORT_SYMBOL(groups_free);
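Aside: groups_alloc() now asks kmalloc for one flat gid array with __GFP_NOWARN|__GFP_NORETRY so a large or fragmented request fails fast and quietly, falls back to __vmalloc(), and groups_free() collapses to kvfree() because that helper frees either kind of allocation. The block below is only a userspace analogy of the try-cheap-then-fallback idea with a single free helper; malloc() stands in for kmalloc(), mmap() for vmalloc(), and the header tag plays the role kvfree() gets from the kernel's address-range check.

#include <stdlib.h>
#include <stddef.h>
#include <sys/mman.h>
#include <stdio.h>

/* A small header records which allocator produced the buffer so one
 * free helper (like kvfree()) can undo either path. */
struct kvbuf {
    size_t size;
    int mmapped;
    unsigned char data[];
};

static void *kv_alloc(size_t size)
{
    struct kvbuf *b = malloc(sizeof(*b) + size);    /* "kmalloc" attempt */

    if (!b) {                                       /* fall back: "vmalloc" */
        b = mmap(NULL, sizeof(*b) + size, PROT_READ | PROT_WRITE,
                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (b == MAP_FAILED)
            return NULL;
        b->mmapped = 1;
    } else {
        b->mmapped = 0;
    }
    b->size = size;
    return b->data;
}

static void kv_free(void *p)                        /* the "kvfree()" */
{
    struct kvbuf *b = (struct kvbuf *)((unsigned char *)p -
                                       offsetof(struct kvbuf, data));

    if (b->mmapped)
        munmap(b, sizeof(*b) + b->size);
    else
        free(b);
}

int main(void)
{
    int *gids = kv_alloc(1000 * sizeof(*gids));     /* flat array, like gid[] */

    gids[0] = 42;
    printf("gid[0] = %d\n", gids[0]);
    kv_free(gids);
    return 0;
}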
@@ -70,7 +46,7 @@ static int groups_to_user(gid_t __user *grouplist,
70 46
71 for (i = 0; i < count; i++) { 47 for (i = 0; i < count; i++) {
72 gid_t gid; 48 gid_t gid;
73 gid = from_kgid_munged(user_ns, GROUP_AT(group_info, i)); 49 gid = from_kgid_munged(user_ns, group_info->gid[i]);
74 if (put_user(gid, grouplist+i)) 50 if (put_user(gid, grouplist+i))
75 return -EFAULT; 51 return -EFAULT;
76 } 52 }
@@ -95,7 +71,7 @@ static int groups_from_user(struct group_info *group_info,
95 if (!gid_valid(kgid)) 71 if (!gid_valid(kgid))
96 return -EINVAL; 72 return -EINVAL;
97 73
98 GROUP_AT(group_info, i) = kgid; 74 group_info->gid[i] = kgid;
99 } 75 }
100 return 0; 76 return 0;
101} 77}
@@ -115,15 +91,14 @@ static void groups_sort(struct group_info *group_info)
115 for (base = 0; base < max; base++) { 91 for (base = 0; base < max; base++) {
116 int left = base; 92 int left = base;
117 int right = left + stride; 93 int right = left + stride;
118 kgid_t tmp = GROUP_AT(group_info, right); 94 kgid_t tmp = group_info->gid[right];
119 95
120 while (left >= 0 && gid_gt(GROUP_AT(group_info, left), tmp)) { 96 while (left >= 0 && gid_gt(group_info->gid[left], tmp)) {
121 GROUP_AT(group_info, right) = 97 group_info->gid[right] = group_info->gid[left];
122 GROUP_AT(group_info, left);
123 right = left; 98 right = left;
124 left -= stride; 99 left -= stride;
125 } 100 }
126 GROUP_AT(group_info, right) = tmp; 101 group_info->gid[right] = tmp;
127 } 102 }
128 stride /= 3; 103 stride /= 3;
129 } 104 }
@@ -141,9 +116,9 @@ int groups_search(const struct group_info *group_info, kgid_t grp)
141 right = group_info->ngroups; 116 right = group_info->ngroups;
142 while (left < right) { 117 while (left < right) {
143 unsigned int mid = (left+right)/2; 118 unsigned int mid = (left+right)/2;
144 if (gid_gt(grp, GROUP_AT(group_info, mid))) 119 if (gid_gt(grp, group_info->gid[mid]))
145 left = mid + 1; 120 left = mid + 1;
146 else if (gid_lt(grp, GROUP_AT(group_info, mid))) 121 else if (gid_lt(grp, group_info->gid[mid]))
147 right = mid; 122 right = mid;
148 else 123 else
149 return 1; 124 return 1;
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index d234022805dc..2b59c82cc3e1 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -98,26 +98,26 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
98 98
99 trace_sched_process_hang(t); 99 trace_sched_process_hang(t);
100 100
101 if (!sysctl_hung_task_warnings) 101 if (!sysctl_hung_task_warnings && !sysctl_hung_task_panic)
102 return; 102 return;
103 103
104 if (sysctl_hung_task_warnings > 0)
105 sysctl_hung_task_warnings--;
106
107 /* 104 /*
108 * Ok, the task did not get scheduled for more than 2 minutes, 105 * Ok, the task did not get scheduled for more than 2 minutes,
109 * complain: 106 * complain:
110 */ 107 */
111 pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n", 108 if (sysctl_hung_task_warnings) {
112 t->comm, t->pid, timeout); 109 sysctl_hung_task_warnings--;
113 pr_err(" %s %s %.*s\n", 110 pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n",
114 print_tainted(), init_utsname()->release, 111 t->comm, t->pid, timeout);
115 (int)strcspn(init_utsname()->version, " "), 112 pr_err(" %s %s %.*s\n",
116 init_utsname()->version); 113 print_tainted(), init_utsname()->release,
117 pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" 114 (int)strcspn(init_utsname()->version, " "),
118 " disables this message.\n"); 115 init_utsname()->version);
119 sched_show_task(t); 116 pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
120 debug_show_held_locks(t); 117 " disables this message.\n");
118 sched_show_task(t);
119 debug_show_all_locks();
120 }
121 121
122 touch_nmi_watchdog(); 122 touch_nmi_watchdog();
123 123
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 32f6cfcff212..17f51d63da56 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -4,60 +4,151 @@
4#include <linux/slab.h> 4#include <linux/slab.h>
5#include <linux/cpu.h> 5#include <linux/cpu.h>
6 6
7static int get_first_sibling(unsigned int cpu) 7static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
8 int cpus_per_vec)
8{ 9{
9 unsigned int ret; 10 const struct cpumask *siblmsk;
11 int cpu, sibl;
10 12
11 ret = cpumask_first(topology_sibling_cpumask(cpu)); 13 for ( ; cpus_per_vec > 0; ) {
12 if (ret < nr_cpu_ids) 14 cpu = cpumask_first(nmsk);
13 return ret; 15
14 return cpu; 16 /* Should not happen, but I'm too lazy to think about it */
17 if (cpu >= nr_cpu_ids)
18 return;
19
20 cpumask_clear_cpu(cpu, nmsk);
21 cpumask_set_cpu(cpu, irqmsk);
22 cpus_per_vec--;
23
24 /* If the cpu has siblings, use them first */
25 siblmsk = topology_sibling_cpumask(cpu);
26 for (sibl = -1; cpus_per_vec > 0; ) {
27 sibl = cpumask_next(sibl, siblmsk);
28 if (sibl >= nr_cpu_ids)
29 break;
30 if (!cpumask_test_and_clear_cpu(sibl, nmsk))
31 continue;
32 cpumask_set_cpu(sibl, irqmsk);
33 cpus_per_vec--;
34 }
35 }
36}
37
38static int get_nodes_in_cpumask(const struct cpumask *mask, nodemask_t *nodemsk)
39{
40 int n, nodes;
41
42 /* Calculate the number of nodes in the supplied affinity mask */
43 for (n = 0, nodes = 0; n < num_online_nodes(); n++) {
44 if (cpumask_intersects(mask, cpumask_of_node(n))) {
45 node_set(n, *nodemsk);
46 nodes++;
47 }
48 }
49 return nodes;
15} 50}
16 51
17/* 52/**
18 * Take a map of online CPUs and the number of available interrupt vectors 53 * irq_create_affinity_masks - Create affinity masks for multiqueue spreading
19 * and generate an output cpumask suitable for spreading MSI/MSI-X vectors 54 * @affinity: The affinity mask to spread. If NULL cpu_online_mask
20 * so that they are distributed as good as possible around the CPUs. If 55 * is used
21 * more vectors than CPUs are available we'll map one to each CPU, 56 * @nvecs: The number of vectors
22 * otherwise we map one to the first sibling of each socket.
23 * 57 *
24 * If there are more vectors than CPUs we will still only have one bit 58 * Returns the masks pointer or NULL if allocation failed.
25 * set per CPU, but interrupt code will keep on assigning the vectors from
26 * the start of the bitmap until we run out of vectors.
27 */ 59 */
28struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs) 60struct cpumask *irq_create_affinity_masks(const struct cpumask *affinity,
61 int nvec)
29{ 62{
30 struct cpumask *affinity_mask; 63 int n, nodes, vecs_per_node, cpus_per_vec, extra_vecs, curvec = 0;
31 unsigned int max_vecs = *nr_vecs; 64 nodemask_t nodemsk = NODE_MASK_NONE;
65 struct cpumask *masks;
66 cpumask_var_t nmsk;
32 67
33 if (max_vecs == 1) 68 if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
34 return NULL; 69 return NULL;
35 70
36 affinity_mask = kzalloc(cpumask_size(), GFP_KERNEL); 71 masks = kzalloc(nvec * sizeof(*masks), GFP_KERNEL);
37 if (!affinity_mask) { 72 if (!masks)
38 *nr_vecs = 1; 73 goto out;
39 return NULL;
40 }
41 74
75 /* Stabilize the cpumasks */
42 get_online_cpus(); 76 get_online_cpus();
43 if (max_vecs >= num_online_cpus()) { 77 /* If the supplied affinity mask is NULL, use cpu online mask */
44 cpumask_copy(affinity_mask, cpu_online_mask); 78 if (!affinity)
45 *nr_vecs = num_online_cpus(); 79 affinity = cpu_online_mask;
46 } else { 80
47 unsigned int vecs = 0, cpu; 81 nodes = get_nodes_in_cpumask(affinity, &nodemsk);
48
49 for_each_online_cpu(cpu) {
50 if (cpu == get_first_sibling(cpu)) {
51 cpumask_set_cpu(cpu, affinity_mask);
52 vecs++;
53 }
54 82
55 if (--max_vecs == 0) 83 /*
84 * If the number of nodes in the mask is less than or equal the
85 * number of vectors we just spread the vectors across the nodes.
86 */
87 if (nvec <= nodes) {
88 for_each_node_mask(n, nodemsk) {
89 cpumask_copy(masks + curvec, cpumask_of_node(n));
90 if (++curvec == nvec)
56 break; 91 break;
57 } 92 }
58 *nr_vecs = vecs; 93 goto outonl;
59 } 94 }
95
96 /* Spread the vectors per node */
97 vecs_per_node = nvec / nodes;
98 /* Account for rounding errors */
99 extra_vecs = nvec - (nodes * vecs_per_node);
100
101 for_each_node_mask(n, nodemsk) {
102 int ncpus, v, vecs_to_assign = vecs_per_node;
103
104 /* Get the cpus on this node which are in the mask */
105 cpumask_and(nmsk, affinity, cpumask_of_node(n));
106
107 /* Calculate the number of cpus per vector */
108 ncpus = cpumask_weight(nmsk);
109
110 for (v = 0; curvec < nvec && v < vecs_to_assign; curvec++, v++) {
111 cpus_per_vec = ncpus / vecs_to_assign;
112
113 /* Account for extra vectors to compensate rounding errors */
114 if (extra_vecs) {
115 cpus_per_vec++;
116 if (!--extra_vecs)
117 vecs_per_node++;
118 }
119 irq_spread_init_one(masks + curvec, nmsk, cpus_per_vec);
120 }
121
122 if (curvec >= nvec)
123 break;
124 }
125
126outonl:
60 put_online_cpus(); 127 put_online_cpus();
128out:
129 free_cpumask_var(nmsk);
130 return masks;
131}
132
133/**
134 * irq_calc_affinity_vectors - Calculate to optimal number of vectors for a given affinity mask
135 * @affinity: The affinity mask to spread. If NULL cpu_online_mask
136 * is used
137 * @maxvec: The maximum number of vectors available
138 */
139int irq_calc_affinity_vectors(const struct cpumask *affinity, int maxvec)
140{
141 int cpus, ret;
61 142
62 return affinity_mask; 143 /* Stabilize the cpumasks */
144 get_online_cpus();
145 /* If the supplied affinity mask is NULL, use cpu online mask */
146 if (!affinity)
147 affinity = cpu_online_mask;
148
149 cpus = cpumask_weight(affinity);
150 ret = (cpus < maxvec) ? cpus : maxvec;
151
152 put_online_cpus();
153 return ret;
63} 154}
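Aside: irq_create_affinity_masks() hands whole NUMA nodes to vectors when there are at least as many nodes as vectors; otherwise it divides nvec by the node count and spreads the remainder as extra_vecs, giving the first vectors on the earlier nodes one extra CPU each. The standalone program below isolates that rounding bookkeeping with made-up node sizes; where the kernel caps implicitly by draining the node's cpumask, the model caps against a remaining counter.

#include <stdio.h>

int main(void)
{
    const int nvec = 5;                     /* vectors to spread */
    const int node_cpus[] = { 8, 6 };       /* hypothetical 2-node box */
    const int nodes = 2;
    int vecs_per_node = nvec / nodes;       /* 2 */
    int extra_vecs = nvec - nodes * vecs_per_node;  /* 1 vector left over */
    int curvec = 0;

    for (int n = 0; n < nodes && curvec < nvec; n++) {
        int vecs_to_assign = vecs_per_node;
        int remaining = node_cpus[n];       /* CPUs of this node in the mask */

        for (int v = 0; curvec < nvec && v < vecs_to_assign; curvec++, v++) {
            int cpus_per_vec = node_cpus[n] / vecs_to_assign;

            /* spend the leftover vectors on the first nodes */
            if (extra_vecs) {
                cpus_per_vec++;
                if (!--extra_vecs)
                    vecs_per_node++;
            }
            /* the real code caps by consuming from the node mask */
            if (cpus_per_vec > remaining)
                cpus_per_vec = remaining;
            remaining -= cpus_per_vec;

            printf("vector %d: %d CPUs of node %d\n", curvec, cpus_per_vec, n);
        }
    }
    return 0;
}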
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 637389088b3f..be3c34e4f2ac 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -76,7 +76,6 @@ int irq_set_irq_type(unsigned int irq, unsigned int type)
76 if (!desc) 76 if (!desc)
77 return -EINVAL; 77 return -EINVAL;
78 78
79 type &= IRQ_TYPE_SENSE_MASK;
80 ret = __irq_set_trigger(desc, type); 79 ret = __irq_set_trigger(desc, type);
81 irq_put_desc_busunlock(desc, flags); 80 irq_put_desc_busunlock(desc, flags);
82 return ret; 81 return ret;
@@ -756,7 +755,6 @@ void handle_percpu_devid_irq(struct irq_desc *desc)
756{ 755{
757 struct irq_chip *chip = irq_desc_get_chip(desc); 756 struct irq_chip *chip = irq_desc_get_chip(desc);
758 struct irqaction *action = desc->action; 757 struct irqaction *action = desc->action;
759 void *dev_id = raw_cpu_ptr(action->percpu_dev_id);
760 unsigned int irq = irq_desc_get_irq(desc); 758 unsigned int irq = irq_desc_get_irq(desc);
761 irqreturn_t res; 759 irqreturn_t res;
762 760
@@ -765,15 +763,26 @@ void handle_percpu_devid_irq(struct irq_desc *desc)
765 if (chip->irq_ack) 763 if (chip->irq_ack)
766 chip->irq_ack(&desc->irq_data); 764 chip->irq_ack(&desc->irq_data);
767 765
768 trace_irq_handler_entry(irq, action); 766 if (likely(action)) {
769 res = action->handler(irq, dev_id); 767 trace_irq_handler_entry(irq, action);
770 trace_irq_handler_exit(irq, action, res); 768 res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id));
769 trace_irq_handler_exit(irq, action, res);
770 } else {
771 unsigned int cpu = smp_processor_id();
772 bool enabled = cpumask_test_cpu(cpu, desc->percpu_enabled);
773
774 if (enabled)
775 irq_percpu_disable(desc, cpu);
776
777 pr_err_once("Spurious%s percpu IRQ%u on CPU%u\n",
778 enabled ? " and unmasked" : "", irq, cpu);
779 }
771 780
772 if (chip->irq_eoi) 781 if (chip->irq_eoi)
773 chip->irq_eoi(&desc->irq_data); 782 chip->irq_eoi(&desc->irq_data);
774} 783}
775 784
776void 785static void
777__irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, 786__irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle,
778 int is_chained, const char *name) 787 int is_chained, const char *name)
779{ 788{
@@ -820,6 +829,8 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle,
820 desc->name = name; 829 desc->name = name;
821 830
822 if (handle != handle_bad_irq && is_chained) { 831 if (handle != handle_bad_irq && is_chained) {
832 unsigned int type = irqd_get_trigger_type(&desc->irq_data);
833
823 /* 834 /*
824 * We're about to start this interrupt immediately, 835 * We're about to start this interrupt immediately,
825 * hence the need to set the trigger configuration. 836 * hence the need to set the trigger configuration.
@@ -828,8 +839,10 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle,
828 * chained interrupt. Reset it immediately because we 839 * chained interrupt. Reset it immediately because we
829 * do know better. 840 * do know better.
830 */ 841 */
831 __irq_set_trigger(desc, irqd_get_trigger_type(&desc->irq_data)); 842 if (type != IRQ_TYPE_NONE) {
832 desc->handle_irq = handle; 843 __irq_set_trigger(desc, type);
844 desc->handle_irq = handle;
845 }
833 846
834 irq_settings_set_noprobe(desc); 847 irq_settings_set_noprobe(desc);
835 irq_settings_set_norequest(desc); 848 irq_settings_set_norequest(desc);
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index abd286afbd27..ee32870079c9 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -260,9 +260,9 @@ irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags)
260} 260}
261 261
262/** 262/**
263 * irq_alloc_domain_generic_chip - Allocate generic chips for an irq domain 263 * __irq_alloc_domain_generic_chip - Allocate generic chips for an irq domain
264 * @d: irq domain for which to allocate chips 264 * @d: irq domain for which to allocate chips
265 * @irqs_per_chip: Number of interrupts each chip handles 265 * @irqs_per_chip: Number of interrupts each chip handles (max 32)
266 * @num_ct: Number of irq_chip_type instances associated with this 266 * @num_ct: Number of irq_chip_type instances associated with this
267 * @name: Name of the irq chip 267 * @name: Name of the irq chip
268 * @handler: Default flow handler associated with these chips 268 * @handler: Default flow handler associated with these chips
@@ -270,11 +270,11 @@ irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags)
270 * @set: IRQ_* bits to set in the mapping function 270 * @set: IRQ_* bits to set in the mapping function
271 * @gcflags: Generic chip specific setup flags 271 * @gcflags: Generic chip specific setup flags
272 */ 272 */
273int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, 273int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
274 int num_ct, const char *name, 274 int num_ct, const char *name,
275 irq_flow_handler_t handler, 275 irq_flow_handler_t handler,
276 unsigned int clr, unsigned int set, 276 unsigned int clr, unsigned int set,
277 enum irq_gc_flags gcflags) 277 enum irq_gc_flags gcflags)
278{ 278{
279 struct irq_domain_chip_generic *dgc; 279 struct irq_domain_chip_generic *dgc;
280 struct irq_chip_generic *gc; 280 struct irq_chip_generic *gc;
@@ -326,7 +326,21 @@ int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
326 d->name = name; 326 d->name = name;
327 return 0; 327 return 0;
328} 328}
329EXPORT_SYMBOL_GPL(irq_alloc_domain_generic_chips); 329EXPORT_SYMBOL_GPL(__irq_alloc_domain_generic_chips);
330
331static struct irq_chip_generic *
332__irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq)
333{
334 struct irq_domain_chip_generic *dgc = d->gc;
335 int idx;
336
337 if (!dgc)
338 return ERR_PTR(-ENODEV);
339 idx = hw_irq / dgc->irqs_per_chip;
340 if (idx >= dgc->num_chips)
341 return ERR_PTR(-EINVAL);
342 return dgc->gc[idx];
343}
330 344
331/** 345/**
332 * irq_get_domain_generic_chip - Get a pointer to the generic chip of a hw_irq 346 * irq_get_domain_generic_chip - Get a pointer to the generic chip of a hw_irq
@@ -336,15 +350,9 @@ EXPORT_SYMBOL_GPL(irq_alloc_domain_generic_chips);
336struct irq_chip_generic * 350struct irq_chip_generic *
337irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq) 351irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq)
338{ 352{
339 struct irq_domain_chip_generic *dgc = d->gc; 353 struct irq_chip_generic *gc = __irq_get_domain_generic_chip(d, hw_irq);
340 int idx;
341 354
342 if (!dgc) 355 return !IS_ERR(gc) ? gc : NULL;
343 return NULL;
344 idx = hw_irq / dgc->irqs_per_chip;
345 if (idx >= dgc->num_chips)
346 return NULL;
347 return dgc->gc[idx];
348} 356}
349EXPORT_SYMBOL_GPL(irq_get_domain_generic_chip); 357EXPORT_SYMBOL_GPL(irq_get_domain_generic_chip);
350 358
@@ -368,13 +376,9 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
368 unsigned long flags; 376 unsigned long flags;
369 int idx; 377 int idx;
370 378
371 if (!d->gc) 379 gc = __irq_get_domain_generic_chip(d, hw_irq);
372 return -ENODEV; 380 if (IS_ERR(gc))
373 381 return PTR_ERR(gc);
374 idx = hw_irq / dgc->irqs_per_chip;
375 if (idx >= dgc->num_chips)
376 return -EINVAL;
377 gc = dgc->gc[idx];
378 382
379 idx = hw_irq % dgc->irqs_per_chip; 383 idx = hw_irq % dgc->irqs_per_chip;
380 384
@@ -409,10 +413,30 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
409 irq_modify_status(virq, dgc->irq_flags_to_clear, dgc->irq_flags_to_set); 413 irq_modify_status(virq, dgc->irq_flags_to_clear, dgc->irq_flags_to_set);
410 return 0; 414 return 0;
411} 415}
412EXPORT_SYMBOL_GPL(irq_map_generic_chip); 416
417static void irq_unmap_generic_chip(struct irq_domain *d, unsigned int virq)
418{
419 struct irq_data *data = irq_domain_get_irq_data(d, virq);
420 struct irq_domain_chip_generic *dgc = d->gc;
421 unsigned int hw_irq = data->hwirq;
422 struct irq_chip_generic *gc;
423 int irq_idx;
424
425 gc = irq_get_domain_generic_chip(d, hw_irq);
426 if (!gc)
427 return;
428
429 irq_idx = hw_irq % dgc->irqs_per_chip;
430
431 clear_bit(irq_idx, &gc->installed);
432 irq_domain_set_info(d, virq, hw_irq, &no_irq_chip, NULL, NULL, NULL,
433 NULL);
434
435}
413 436
414struct irq_domain_ops irq_generic_chip_ops = { 437struct irq_domain_ops irq_generic_chip_ops = {
415 .map = irq_map_generic_chip, 438 .map = irq_map_generic_chip,
439 .unmap = irq_unmap_generic_chip,
416 .xlate = irq_domain_xlate_onetwocell, 440 .xlate = irq_domain_xlate_onetwocell,
417}; 441};
418EXPORT_SYMBOL_GPL(irq_generic_chip_ops); 442EXPORT_SYMBOL_GPL(irq_generic_chip_ops);
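Aside: the new __irq_get_domain_generic_chip() returns ERR_PTR(-ENODEV) or ERR_PTR(-EINVAL) so callers can tell "no generic chips installed" from "index out of range", while the exported wrapper keeps its old NULL-on-failure contract. A tiny userspace reimplementation of the ERR_PTR/IS_ERR/PTR_ERR encoding, assuming (as the kernel does) that error numbers stay well below 4096:

#include <errno.h>
#include <stdio.h>
#include <stdint.h>

/* Pack a small negative errno into an otherwise invalid pointer value,
 * so a single return slot can carry "pointer or error". */
#define MAX_ERRNO 4095
static inline void *ERR_PTR(long err)     { return (void *)(intptr_t)err; }
static inline long PTR_ERR(const void *p) { return (long)(intptr_t)p; }
static inline int IS_ERR(const void *p)
{
    return (uintptr_t)p >= (uintptr_t)-MAX_ERRNO;
}

static int chips[] = { 10, 20 };        /* stand-in "generic chips" */

static void *get_chip(int installed, unsigned int idx)
{
    if (!installed)
        return ERR_PTR(-ENODEV);        /* nothing set up at all */
    if (idx >= sizeof(chips) / sizeof(chips[0]))
        return ERR_PTR(-EINVAL);        /* out of range */
    return &chips[idx];
}

int main(void)
{
    void *p = get_chip(1, 5);

    if (IS_ERR(p))
        printf("lookup failed: %ld\n", PTR_ERR(p));   /* -22 (EINVAL) */
    else
        printf("chip value: %d\n", *(int *)p);
    return 0;
}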
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index a623b44f2d4b..00bb0aeea1d0 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -15,6 +15,7 @@
15#include <linux/radix-tree.h> 15#include <linux/radix-tree.h>
16#include <linux/bitmap.h> 16#include <linux/bitmap.h>
17#include <linux/irqdomain.h> 17#include <linux/irqdomain.h>
18#include <linux/sysfs.h>
18 19
19#include "internals.h" 20#include "internals.h"
20 21
@@ -123,6 +124,181 @@ static DECLARE_BITMAP(allocated_irqs, IRQ_BITMAP_BITS);
123 124
124#ifdef CONFIG_SPARSE_IRQ 125#ifdef CONFIG_SPARSE_IRQ
125 126
127static void irq_kobj_release(struct kobject *kobj);
128
129#ifdef CONFIG_SYSFS
130static struct kobject *irq_kobj_base;
131
132#define IRQ_ATTR_RO(_name) \
133static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
134
135static ssize_t per_cpu_count_show(struct kobject *kobj,
136 struct kobj_attribute *attr, char *buf)
137{
138 struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
139 int cpu, irq = desc->irq_data.irq;
140 ssize_t ret = 0;
141 char *p = "";
142
143 for_each_possible_cpu(cpu) {
144 unsigned int c = kstat_irqs_cpu(irq, cpu);
145
146 ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%u", p, c);
147 p = ",";
148 }
149
150 ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n");
151 return ret;
152}
153IRQ_ATTR_RO(per_cpu_count);
154
155static ssize_t chip_name_show(struct kobject *kobj,
156 struct kobj_attribute *attr, char *buf)
157{
158 struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
159 ssize_t ret = 0;
160
161 raw_spin_lock_irq(&desc->lock);
162 if (desc->irq_data.chip && desc->irq_data.chip->name) {
163 ret = scnprintf(buf, PAGE_SIZE, "%s\n",
164 desc->irq_data.chip->name);
165 }
166 raw_spin_unlock_irq(&desc->lock);
167
168 return ret;
169}
170IRQ_ATTR_RO(chip_name);
171
172static ssize_t hwirq_show(struct kobject *kobj,
173 struct kobj_attribute *attr, char *buf)
174{
175 struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
176 ssize_t ret = 0;
177
178 raw_spin_lock_irq(&desc->lock);
179 if (desc->irq_data.domain)
180 ret = sprintf(buf, "%d\n", (int)desc->irq_data.hwirq);
181 raw_spin_unlock_irq(&desc->lock);
182
183 return ret;
184}
185IRQ_ATTR_RO(hwirq);
186
187static ssize_t type_show(struct kobject *kobj,
188 struct kobj_attribute *attr, char *buf)
189{
190 struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
191 ssize_t ret = 0;
192
193 raw_spin_lock_irq(&desc->lock);
194 ret = sprintf(buf, "%s\n",
195 irqd_is_level_type(&desc->irq_data) ? "level" : "edge");
196 raw_spin_unlock_irq(&desc->lock);
197
198 return ret;
199
200}
201IRQ_ATTR_RO(type);
202
203static ssize_t name_show(struct kobject *kobj,
204 struct kobj_attribute *attr, char *buf)
205{
206 struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
207 ssize_t ret = 0;
208
209 raw_spin_lock_irq(&desc->lock);
210 if (desc->name)
211 ret = scnprintf(buf, PAGE_SIZE, "%s\n", desc->name);
212 raw_spin_unlock_irq(&desc->lock);
213
214 return ret;
215}
216IRQ_ATTR_RO(name);
217
218static ssize_t actions_show(struct kobject *kobj,
219 struct kobj_attribute *attr, char *buf)
220{
221 struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
222 struct irqaction *action;
223 ssize_t ret = 0;
224 char *p = "";
225
226 raw_spin_lock_irq(&desc->lock);
227 for (action = desc->action; action != NULL; action = action->next) {
228 ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s",
229 p, action->name);
230 p = ",";
231 }
232 raw_spin_unlock_irq(&desc->lock);
233
234 if (ret)
235 ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n");
236
237 return ret;
238}
239IRQ_ATTR_RO(actions);
240
241static struct attribute *irq_attrs[] = {
242 &per_cpu_count_attr.attr,
243 &chip_name_attr.attr,
244 &hwirq_attr.attr,
245 &type_attr.attr,
246 &name_attr.attr,
247 &actions_attr.attr,
248 NULL
249};
250
251static struct kobj_type irq_kobj_type = {
252 .release = irq_kobj_release,
253 .sysfs_ops = &kobj_sysfs_ops,
254 .default_attrs = irq_attrs,
255};
256
257static void irq_sysfs_add(int irq, struct irq_desc *desc)
258{
259 if (irq_kobj_base) {
260 /*
261 * Continue even in case of failure as this is not
262 * crucial.
263 */
264 if (kobject_add(&desc->kobj, irq_kobj_base, "%d", irq))
265 pr_warn("Failed to add kobject for irq %d\n", irq);
266 }
267}
268
269static int __init irq_sysfs_init(void)
270{
271 struct irq_desc *desc;
272 int irq;
273
274 /* Prevent concurrent irq alloc/free */
275 irq_lock_sparse();
276
277 irq_kobj_base = kobject_create_and_add("irq", kernel_kobj);
278 if (!irq_kobj_base) {
279 irq_unlock_sparse();
280 return -ENOMEM;
281 }
282
283 /* Add the already allocated interrupts */
284 for_each_irq_desc(irq, desc)
285 irq_sysfs_add(irq, desc);
286 irq_unlock_sparse();
287
288 return 0;
289}
290postcore_initcall(irq_sysfs_init);
291
292#else /* !CONFIG_SYSFS */
293
294static struct kobj_type irq_kobj_type = {
295 .release = irq_kobj_release,
296};
297
298static void irq_sysfs_add(int irq, struct irq_desc *desc) {}
299
300#endif /* CONFIG_SYSFS */
301
126static RADIX_TREE(irq_desc_tree, GFP_KERNEL); 302static RADIX_TREE(irq_desc_tree, GFP_KERNEL);
127 303
128static void irq_insert_desc(unsigned int irq, struct irq_desc *desc) 304static void irq_insert_desc(unsigned int irq, struct irq_desc *desc)
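
Usage sketch (illustrative, not part of the patch): once irq_sysfs_init() has run as a postcore_initcall, every allocated descriptor is visible under /sys/kernel/irq/<irq>/ through the read-only attributes defined above. A minimal userspace reader, assuming such a node exists; all names below are illustrative:

/* Dump the per-IRQ sysfs attributes added by this patch for one IRQ. */
#include <stdio.h>

int main(int argc, char **argv)
{
        const char *attrs[] = {
                "chip_name", "hwirq", "type", "name", "actions", "per_cpu_count"
        };
        char path[256], line[4096];
        unsigned int i;

        if (argc < 2) {
                fprintf(stderr, "usage: %s <irq>\n", argv[0]);
                return 1;
        }

        for (i = 0; i < sizeof(attrs) / sizeof(attrs[0]); i++) {
                FILE *f;

                snprintf(path, sizeof(path), "/sys/kernel/irq/%s/%s",
                         argv[1], attrs[i]);
                f = fopen(path, "r");
                if (!f)
                        continue;       /* attribute may be absent or empty */
                if (fgets(line, sizeof(line), f))
                        printf("%-14s %s", attrs[i], line);
                fclose(f);
        }
        return 0;
}
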
@@ -187,6 +363,7 @@ static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags,
187 363
188 desc_set_defaults(irq, desc, node, affinity, owner); 364 desc_set_defaults(irq, desc, node, affinity, owner);
189 irqd_set(&desc->irq_data, flags); 365 irqd_set(&desc->irq_data, flags);
366 kobject_init(&desc->kobj, &irq_kobj_type);
190 367
191 return desc; 368 return desc;
192 369
@@ -197,15 +374,22 @@ err_desc:
197 return NULL; 374 return NULL;
198} 375}
199 376
200static void delayed_free_desc(struct rcu_head *rhp) 377static void irq_kobj_release(struct kobject *kobj)
201{ 378{
202 struct irq_desc *desc = container_of(rhp, struct irq_desc, rcu); 379 struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
203 380
204 free_masks(desc); 381 free_masks(desc);
205 free_percpu(desc->kstat_irqs); 382 free_percpu(desc->kstat_irqs);
206 kfree(desc); 383 kfree(desc);
207} 384}
208 385
386static void delayed_free_desc(struct rcu_head *rhp)
387{
388 struct irq_desc *desc = container_of(rhp, struct irq_desc, rcu);
389
390 kobject_put(&desc->kobj);
391}
392
209static void free_desc(unsigned int irq) 393static void free_desc(unsigned int irq)
210{ 394{
211 struct irq_desc *desc = irq_to_desc(irq); 395 struct irq_desc *desc = irq_to_desc(irq);
@@ -217,8 +401,12 @@ static void free_desc(unsigned int irq)
217 * kstat_irq_usr(). Once we deleted the descriptor from the 401 * kstat_irq_usr(). Once we deleted the descriptor from the
218 * sparse tree we can free it. Access in proc will fail to 402 * sparse tree we can free it. Access in proc will fail to
219 * lookup the descriptor. 403 * lookup the descriptor.
404 *
405 * The sysfs entry must be serialized against a concurrent
406 * irq_sysfs_init() as well.
220 */ 407 */
221 mutex_lock(&sparse_irq_lock); 408 mutex_lock(&sparse_irq_lock);
409 kobject_del(&desc->kobj);
222 delete_irq_desc(irq); 410 delete_irq_desc(irq);
223 mutex_unlock(&sparse_irq_lock); 411 mutex_unlock(&sparse_irq_lock);
224 412
@@ -236,31 +424,31 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node,
236 const struct cpumask *mask = NULL; 424 const struct cpumask *mask = NULL;
237 struct irq_desc *desc; 425 struct irq_desc *desc;
238 unsigned int flags; 426 unsigned int flags;
239 int i, cpu = -1; 427 int i;
240 428
241 if (affinity && cpumask_empty(affinity)) 429 /* Validate affinity mask(s) */
242 return -EINVAL; 430 if (affinity) {
431 for (i = 0, mask = affinity; i < cnt; i++, mask++) {
432 if (cpumask_empty(mask))
433 return -EINVAL;
434 }
435 }
243 436
244 flags = affinity ? IRQD_AFFINITY_MANAGED : 0; 437 flags = affinity ? IRQD_AFFINITY_MANAGED : 0;
438 mask = NULL;
245 439
246 for (i = 0; i < cnt; i++) { 440 for (i = 0; i < cnt; i++) {
247 if (affinity) { 441 if (affinity) {
248 cpu = cpumask_next(cpu, affinity); 442 node = cpu_to_node(cpumask_first(affinity));
249 if (cpu >= nr_cpu_ids) 443 mask = affinity;
250 cpu = cpumask_first(affinity); 444 affinity++;
251 node = cpu_to_node(cpu);
252
253 /*
254 * For single allocations we use the caller provided
255 * mask otherwise we use the mask of the target cpu
256 */
257 mask = cnt == 1 ? affinity : cpumask_of(cpu);
258 } 445 }
259 desc = alloc_desc(start + i, node, flags, mask, owner); 446 desc = alloc_desc(start + i, node, flags, mask, owner);
260 if (!desc) 447 if (!desc)
261 goto err; 448 goto err;
262 mutex_lock(&sparse_irq_lock); 449 mutex_lock(&sparse_irq_lock);
263 irq_insert_desc(start + i, desc); 450 irq_insert_desc(start + i, desc);
451 irq_sysfs_add(start + i, desc);
264 mutex_unlock(&sparse_irq_lock); 452 mutex_unlock(&sparse_irq_lock);
265 } 453 }
266 return start; 454 return start;
@@ -481,9 +669,9 @@ EXPORT_SYMBOL_GPL(irq_free_descs);
481 * @cnt: Number of consecutive irqs to allocate. 669 * @cnt: Number of consecutive irqs to allocate.
482 * @node: Preferred node on which the irq descriptor should be allocated 670 * @node: Preferred node on which the irq descriptor should be allocated
483 * @owner: Owning module (can be NULL) 671 * @owner: Owning module (can be NULL)
484 * @affinity: Optional pointer to an affinity mask which hints where the 672 * @affinity: Optional pointer to an affinity mask array of size @cnt which
485 * irq descriptors should be allocated and which default 673 * hints where the irq descriptors should be allocated and which
486 * affinities to use 674 * default affinities to use
487 * 675 *
488 * Returns the first irq number or error code 676 * Returns the first irq number or error code
489 */ 677 */
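
Illustrative kernel-side sketch (not part of the patch): with the updated kerneldoc above, @affinity is an array carrying one mask per allocated descriptor rather than a single hint. A possible caller, assuming __irq_alloc_descs() keeps its (irq, from, cnt, node, owner, affinity) signature; the helper name and spreading policy are invented:

/* Allocate @cnt descriptors, one online CPU per descriptor. */
#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/irq.h>
#include <linux/slab.h>

static int demo_alloc_spread(unsigned int cnt, int node, struct module *owner)
{
        struct cpumask *masks;
        unsigned int i, cpu = cpumask_first(cpu_online_mask);
        int virq;

        masks = kcalloc(cnt, sizeof(*masks), GFP_KERNEL);
        if (!masks)
                return -ENOMEM;

        for (i = 0; i < cnt; i++) {
                cpumask_copy(&masks[i], cpumask_of(cpu));
                cpu = cpumask_next(cpu, cpu_online_mask);
                if (cpu >= nr_cpu_ids)
                        cpu = cpumask_first(cpu_online_mask);
        }

        virq = __irq_alloc_descs(-1, 0, cnt, node, owner, masks);
        kfree(masks);           /* each descriptor keeps its own copy */
        return virq;
}
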
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 4752b43662e0..8c0a0ae43521 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -80,7 +80,7 @@ EXPORT_SYMBOL_GPL(irq_domain_free_fwnode);
80 80
81/** 81/**
82 * __irq_domain_add() - Allocate a new irq_domain data structure 82 * __irq_domain_add() - Allocate a new irq_domain data structure
83 * @of_node: optional device-tree node of the interrupt controller 83 * @fwnode: firmware node for the interrupt controller
84 * @size: Size of linear map; 0 for radix mapping only 84 * @size: Size of linear map; 0 for radix mapping only
85 * @hwirq_max: Maximum number of interrupts supported by controller 85 * @hwirq_max: Maximum number of interrupts supported by controller
86 * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no 86 * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no
@@ -96,10 +96,8 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
96 const struct irq_domain_ops *ops, 96 const struct irq_domain_ops *ops,
97 void *host_data) 97 void *host_data)
98{ 98{
99 struct device_node *of_node = to_of_node(fwnode);
99 struct irq_domain *domain; 100 struct irq_domain *domain;
100 struct device_node *of_node;
101
102 of_node = to_of_node(fwnode);
103 101
104 domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size), 102 domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size),
105 GFP_KERNEL, of_node_to_nid(of_node)); 103 GFP_KERNEL, of_node_to_nid(of_node));
@@ -868,7 +866,10 @@ int irq_domain_xlate_onetwocell(struct irq_domain *d,
868 if (WARN_ON(intsize < 1)) 866 if (WARN_ON(intsize < 1))
869 return -EINVAL; 867 return -EINVAL;
870 *out_hwirq = intspec[0]; 868 *out_hwirq = intspec[0];
871 *out_type = (intsize > 1) ? intspec[1] : IRQ_TYPE_NONE; 869 if (intsize > 1)
870 *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK;
871 else
872 *out_type = IRQ_TYPE_NONE;
872 return 0; 873 return 0;
873} 874}
874EXPORT_SYMBOL_GPL(irq_domain_xlate_onetwocell); 875EXPORT_SYMBOL_GPL(irq_domain_xlate_onetwocell);
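
Illustrative sketch (not part of the patch) of what the two-cell path now yields; the specifier values are invented and the helper name is not real:

/* With intspec = { 42, 0x14 }, bit 4 of the type cell is bogus. */
#include <linux/irq.h>
#include <linux/irqdomain.h>

static int demo_translate(struct irq_domain *d, struct device_node *ctrlr)
{
        u32 intspec[2] = { 42, 0x14 };
        unsigned long hwirq;
        unsigned int type;
        int ret;

        ret = irq_domain_xlate_onetwocell(d, ctrlr, intspec, 2, &hwirq, &type);
        /*
         * On success: hwirq == 42 and type == 0x4 (IRQ_TYPE_LEVEL_HIGH).
         * Before this change the bogus 0x10 bit would have been passed on.
         */
        return ret;
}
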
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 9530fcd27704..6b669593e7eb 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -669,8 +669,6 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned long flags)
669 return 0; 669 return 0;
670 } 670 }
671 671
672 flags &= IRQ_TYPE_SENSE_MASK;
673
674 if (chip->flags & IRQCHIP_SET_TYPE_MASKED) { 672 if (chip->flags & IRQCHIP_SET_TYPE_MASKED) {
675 if (!irqd_irq_masked(&desc->irq_data)) 673 if (!irqd_irq_masked(&desc->irq_data))
676 mask_irq(desc); 674 mask_irq(desc);
@@ -678,7 +676,8 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned long flags)
678 unmask = 1; 676 unmask = 1;
679 } 677 }
680 678
681 /* caller masked out all except trigger mode flags */ 679 /* Mask all flags except trigger mode */
680 flags &= IRQ_TYPE_SENSE_MASK;
682 ret = chip->irq_set_type(&desc->irq_data, flags); 681 ret = chip->irq_set_type(&desc->irq_data, flags);
683 682
684 switch (ret) { 683 switch (ret) {
@@ -722,6 +721,7 @@ int irq_set_parent(int irq, int parent_irq)
722 irq_put_desc_unlock(desc, flags); 721 irq_put_desc_unlock(desc, flags);
723 return 0; 722 return 0;
724} 723}
724EXPORT_SYMBOL_GPL(irq_set_parent);
725#endif 725#endif
726 726
727/* 727/*
@@ -1341,12 +1341,12 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1341 1341
1342 } else if (new->flags & IRQF_TRIGGER_MASK) { 1342 } else if (new->flags & IRQF_TRIGGER_MASK) {
1343 unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK; 1343 unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK;
1344 unsigned int omsk = irq_settings_get_trigger_mask(desc); 1344 unsigned int omsk = irqd_get_trigger_type(&desc->irq_data);
1345 1345
1346 if (nmsk != omsk) 1346 if (nmsk != omsk)
1347 /* hope the handler works with current trigger mode */ 1347 /* hope the handler works with current trigger mode */
1348 pr_warn("irq %d uses trigger mode %u; requested %u\n", 1348 pr_warn("irq %d uses trigger mode %u; requested %u\n",
1349 irq, nmsk, omsk); 1349 irq, omsk, nmsk);
1350 } 1350 }
1351 1351
1352 *old_ptr = new; 1352 *old_ptr = new;
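
Illustrative sketch (not part of the patch): with EXPORT_SYMBOL_GPL(irq_set_parent), a modular demultiplexing driver can record the parent line for a child interrupt where irq_set_parent() is available in the kernel configuration; the helper name is invented:

#include <linux/irq.h>
#include <linux/module.h>

/* Tell the core that child_virq is resent through parent_virq. */
static int demo_link_child(int child_virq, int parent_virq)
{
        return irq_set_parent(child_virq, parent_virq);
}
MODULE_LICENSE("GPL");
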
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 19e9dfbe97fa..8a3e872798f3 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -18,20 +18,42 @@
18/* Temporary solution for building, will be removed later */ 18/* Temporary solution for building, will be removed later */
19#include <linux/pci.h> 19#include <linux/pci.h>
20 20
21struct msi_desc *alloc_msi_entry(struct device *dev) 21/**
22 * alloc_msi_entry - Allocate and initialize an msi_entry
23 * @dev: Pointer to the device for which this is allocated
24 * @nvec: The number of vectors used in this entry
25 * @affinity: Optional pointer to an affinity mask array of size @nvec
26 *
27 * If @affinity is not NULL then an affinity array of size @nvec is allocated
28 * and the affinity masks from @affinity are copied.
29 */
30struct msi_desc *
31alloc_msi_entry(struct device *dev, int nvec, const struct cpumask *affinity)
22{ 32{
23 struct msi_desc *desc = kzalloc(sizeof(*desc), GFP_KERNEL); 33 struct msi_desc *desc;
34
35 desc = kzalloc(sizeof(*desc), GFP_KERNEL);
24 if (!desc) 36 if (!desc)
25 return NULL; 37 return NULL;
26 38
27 INIT_LIST_HEAD(&desc->list); 39 INIT_LIST_HEAD(&desc->list);
28 desc->dev = dev; 40 desc->dev = dev;
41 desc->nvec_used = nvec;
42 if (affinity) {
43 desc->affinity = kmemdup(affinity,
44 nvec * sizeof(*desc->affinity), GFP_KERNEL);
45 if (!desc->affinity) {
46 kfree(desc);
47 return NULL;
48 }
49 }
29 50
30 return desc; 51 return desc;
31} 52}
32 53
33void free_msi_entry(struct msi_desc *entry) 54void free_msi_entry(struct msi_desc *entry)
34{ 55{
56 kfree(entry->affinity);
35 kfree(entry); 57 kfree(entry);
36} 58}
37 59
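
Illustrative kernel-side sketch (not part of the patch) of the new alloc_msi_entry() signature; the helper name and spreading policy are invented, and the masks can be freed by the caller because the entry keeps a kmemdup() copy:

#include <linux/cpumask.h>
#include <linux/device.h>
#include <linux/msi.h>
#include <linux/slab.h>

static struct msi_desc *demo_alloc_msi(struct device *dev, int nvec)
{
        struct cpumask *masks;
        struct msi_desc *desc;
        int i;

        masks = kcalloc(nvec, sizeof(*masks), GFP_KERNEL);
        if (!masks)
                return NULL;

        for (i = 0; i < nvec; i++)
                cpumask_copy(&masks[i], cpu_online_mask);  /* placeholder policy */

        desc = alloc_msi_entry(dev, nvec, masks);
        kfree(masks);
        return desc;            /* release with free_msi_entry() */
}
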
diff --git a/kernel/kcov.c b/kernel/kcov.c
index 8d44b3fea9d0..30e6d05aa5a9 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -53,8 +53,15 @@ void notrace __sanitizer_cov_trace_pc(void)
53 /* 53 /*
54 * We are interested in code coverage as a function of syscall inputs, 54 * We are interested in code coverage as a function of syscall inputs,
55 * so we ignore code executed in interrupts. 55 * so we ignore code executed in interrupts.
56 * The checks for whether we are in an interrupt are open-coded, because
57 * 1. We can't use in_interrupt() here, since it also returns true
58 * when we are inside a local_bh_disable() section.
59 * 2. We don't want to use (in_irq() | in_serving_softirq() | in_nmi()),
60 * since that leads to slower generated code (three separate tests,
61 * one for each of the flags).
56 */ 62 */
57 if (!t || in_interrupt()) 63 if (!t || (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET
64 | NMI_MASK)))
58 return; 65 return;
59 mode = READ_ONCE(t->kcov_mode); 66 mode = READ_ONCE(t->kcov_mode);
60 if (mode == KCOV_MODE_TRACE) { 67 if (mode == KCOV_MODE_TRACE) {
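
Editorial illustration (not part of the patch) of why the open-coded mask works: in_interrupt() tests HARDIRQ_MASK | SOFTIRQ_MASK | NMI_MASK, and SOFTIRQ_MASK also covers the count taken by local_bh_disable(); testing SOFTIRQ_OFFSET instead only matches softirq execution proper, and the single combined mask keeps the fast path to one comparison:

#include <linux/preempt.h>
#include <linux/types.h>

/* Equivalent of the check used above, kept as one test of preempt_count(). */
static inline bool kcov_in_irq_context(void)
{
        return preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET | NMI_MASK);
}
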
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index d10ab6b9b5e0..d63095472ea9 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -49,7 +49,7 @@
49#include <linux/cpu.h> 49#include <linux/cpu.h>
50#include <linux/jump_label.h> 50#include <linux/jump_label.h>
51 51
52#include <asm-generic/sections.h> 52#include <asm/sections.h>
53#include <asm/cacheflush.h> 53#include <asm/cacheflush.h>
54#include <asm/errno.h> 54#include <asm/errno.h>
55#include <asm/uaccess.h> 55#include <asm/uaccess.h>
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 9ff173dca1ae..be2cc1f9dd57 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -64,7 +64,7 @@ static inline struct kthread *to_kthread(struct task_struct *k)
64static struct kthread *to_live_kthread(struct task_struct *k) 64static struct kthread *to_live_kthread(struct task_struct *k)
65{ 65{
66 struct completion *vfork = ACCESS_ONCE(k->vfork_done); 66 struct completion *vfork = ACCESS_ONCE(k->vfork_done);
67 if (likely(vfork)) 67 if (likely(vfork) && try_get_task_stack(k))
68 return __to_kthread(vfork); 68 return __to_kthread(vfork);
69 return NULL; 69 return NULL;
70} 70}
@@ -138,7 +138,7 @@ void *kthread_data(struct task_struct *task)
138} 138}
139 139
140/** 140/**
141 * probe_kthread_data - speculative version of kthread_data() 141 * kthread_probe_data - speculative version of kthread_data()
142 * @task: possible kthread task in question 142 * @task: possible kthread task in question
143 * 143 *
144 * @task could be a kthread task. Return the data value specified when it 144 * @task could be a kthread task. Return the data value specified when it
@@ -146,7 +146,7 @@ void *kthread_data(struct task_struct *task)
146 * inaccessible for any reason, %NULL is returned. This function requires 146 * inaccessible for any reason, %NULL is returned. This function requires
147 * that @task itself is safe to dereference. 147 * that @task itself is safe to dereference.
148 */ 148 */
149void *probe_kthread_data(struct task_struct *task) 149void *kthread_probe_data(struct task_struct *task)
150{ 150{
151 struct kthread *kthread = to_kthread(task); 151 struct kthread *kthread = to_kthread(task);
152 void *data = NULL; 152 void *data = NULL;
@@ -244,33 +244,10 @@ static void create_kthread(struct kthread_create_info *create)
244 } 244 }
245} 245}
246 246
247/** 247static struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
248 * kthread_create_on_node - create a kthread. 248 void *data, int node,
249 * @threadfn: the function to run until signal_pending(current). 249 const char namefmt[],
250 * @data: data ptr for @threadfn. 250 va_list args)
251 * @node: task and thread structures for the thread are allocated on this node
252 * @namefmt: printf-style name for the thread.
253 *
254 * Description: This helper function creates and names a kernel
255 * thread. The thread will be stopped: use wake_up_process() to start
256 * it. See also kthread_run(). The new thread has SCHED_NORMAL policy and
257 * is affine to all CPUs.
258 *
259 * If thread is going to be bound on a particular cpu, give its node
260 * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE.
261 * When woken, the thread will run @threadfn() with @data as its
262 * argument. @threadfn() can either call do_exit() directly if it is a
263 * standalone thread for which no one will call kthread_stop(), or
264 * return when 'kthread_should_stop()' is true (which means
265 * kthread_stop() has been called). The return value should be zero
266 * or a negative error number; it will be passed to kthread_stop().
267 *
268 * Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR).
269 */
270struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
271 void *data, int node,
272 const char namefmt[],
273 ...)
274{ 251{
275 DECLARE_COMPLETION_ONSTACK(done); 252 DECLARE_COMPLETION_ONSTACK(done);
276 struct task_struct *task; 253 struct task_struct *task;
@@ -311,11 +288,8 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
311 task = create->result; 288 task = create->result;
312 if (!IS_ERR(task)) { 289 if (!IS_ERR(task)) {
313 static const struct sched_param param = { .sched_priority = 0 }; 290 static const struct sched_param param = { .sched_priority = 0 };
314 va_list args;
315 291
316 va_start(args, namefmt);
317 vsnprintf(task->comm, sizeof(task->comm), namefmt, args); 292 vsnprintf(task->comm, sizeof(task->comm), namefmt, args);
318 va_end(args);
319 /* 293 /*
320 * root may have changed our (kthreadd's) priority or CPU mask. 294 * root may have changed our (kthreadd's) priority or CPU mask.
321 * The kernel thread should not inherit these properties. 295 * The kernel thread should not inherit these properties.
@@ -326,6 +300,44 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
326 kfree(create); 300 kfree(create);
327 return task; 301 return task;
328} 302}
303
304/**
305 * kthread_create_on_node - create a kthread.
306 * @threadfn: the function to run until signal_pending(current).
307 * @data: data ptr for @threadfn.
308 * @node: task and thread structures for the thread are allocated on this node
309 * @namefmt: printf-style name for the thread.
310 *
311 * Description: This helper function creates and names a kernel
312 * thread. The thread will be stopped: use wake_up_process() to start
313 * it. See also kthread_run(). The new thread has SCHED_NORMAL policy and
314 * is affine to all CPUs.
315 *
316 * If thread is going to be bound on a particular cpu, give its node
317 * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE.
318 * When woken, the thread will run @threadfn() with @data as its
319 * argument. @threadfn() can either call do_exit() directly if it is a
320 * standalone thread for which no one will call kthread_stop(), or
321 * return when 'kthread_should_stop()' is true (which means
322 * kthread_stop() has been called). The return value should be zero
323 * or a negative error number; it will be passed to kthread_stop().
324 *
325 * Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR).
326 */
327struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
328 void *data, int node,
329 const char namefmt[],
330 ...)
331{
332 struct task_struct *task;
333 va_list args;
334
335 va_start(args, namefmt);
336 task = __kthread_create_on_node(threadfn, data, node, namefmt, args);
337 va_end(args);
338
339 return task;
340}
329EXPORT_SYMBOL(kthread_create_on_node); 341EXPORT_SYMBOL(kthread_create_on_node);
330 342
331static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, long state) 343static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, long state)
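
Usage sketch (not part of the patch): the split into __kthread_create_on_node() does not change the external behaviour documented above; a caller still gets a stopped task that has to be woken explicitly. Names below are illustrative:

#include <linux/delay.h>
#include <linux/err.h>
#include <linux/kthread.h>

static int demo_thread_fn(void *data)
{
        while (!kthread_should_stop())
                msleep(100);            /* do work, then re-check for stop */
        return 0;
}

static struct task_struct *demo_start(int node)
{
        struct task_struct *tsk;

        tsk = kthread_create_on_node(demo_thread_fn, NULL, node, "demo/%d", node);
        if (!IS_ERR(tsk))
                wake_up_process(tsk);   /* created stopped; start it explicitly */
        return tsk;
}
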
@@ -390,10 +402,10 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
390 cpu); 402 cpu);
391 if (IS_ERR(p)) 403 if (IS_ERR(p))
392 return p; 404 return p;
405 kthread_bind(p, cpu);
406 /* CPU hotplug need to bind once again when unparking the thread. */
393 set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags); 407 set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags);
394 to_kthread(p)->cpu = cpu; 408 to_kthread(p)->cpu = cpu;
395 /* Park the thread to get it out of TASK_UNINTERRUPTIBLE state */
396 kthread_park(p);
397 return p; 409 return p;
398} 410}
399 411
@@ -407,6 +419,10 @@ static void __kthread_unpark(struct task_struct *k, struct kthread *kthread)
407 * which might be about to be cleared. 419 * which might be about to be cleared.
408 */ 420 */
409 if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) { 421 if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
422 /*
423 * Newly created kthread was parked when the CPU was offline.
424 * The binding was lost and we need to set it again.
425 */
410 if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) 426 if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
411 __kthread_bind(k, kthread->cpu, TASK_PARKED); 427 __kthread_bind(k, kthread->cpu, TASK_PARKED);
412 wake_up_state(k, TASK_PARKED); 428 wake_up_state(k, TASK_PARKED);
@@ -425,8 +441,10 @@ void kthread_unpark(struct task_struct *k)
425{ 441{
426 struct kthread *kthread = to_live_kthread(k); 442 struct kthread *kthread = to_live_kthread(k);
427 443
428 if (kthread) 444 if (kthread) {
429 __kthread_unpark(k, kthread); 445 __kthread_unpark(k, kthread);
446 put_task_stack(k);
447 }
430} 448}
431EXPORT_SYMBOL_GPL(kthread_unpark); 449EXPORT_SYMBOL_GPL(kthread_unpark);
432 450
@@ -455,6 +473,7 @@ int kthread_park(struct task_struct *k)
455 wait_for_completion(&kthread->parked); 473 wait_for_completion(&kthread->parked);
456 } 474 }
457 } 475 }
476 put_task_stack(k);
458 ret = 0; 477 ret = 0;
459 } 478 }
460 return ret; 479 return ret;
@@ -490,6 +509,7 @@ int kthread_stop(struct task_struct *k)
490 __kthread_unpark(k, kthread); 509 __kthread_unpark(k, kthread);
491 wake_up_process(k); 510 wake_up_process(k);
492 wait_for_completion(&kthread->exited); 511 wait_for_completion(&kthread->exited);
512 put_task_stack(k);
493 } 513 }
494 ret = k->exit_code; 514 ret = k->exit_code;
495 put_task_struct(k); 515 put_task_struct(k);
@@ -536,39 +556,48 @@ int kthreadd(void *unused)
536 return 0; 556 return 0;
537} 557}
538 558
539void __init_kthread_worker(struct kthread_worker *worker, 559void __kthread_init_worker(struct kthread_worker *worker,
540 const char *name, 560 const char *name,
541 struct lock_class_key *key) 561 struct lock_class_key *key)
542{ 562{
563 memset(worker, 0, sizeof(struct kthread_worker));
543 spin_lock_init(&worker->lock); 564 spin_lock_init(&worker->lock);
544 lockdep_set_class_and_name(&worker->lock, key, name); 565 lockdep_set_class_and_name(&worker->lock, key, name);
545 INIT_LIST_HEAD(&worker->work_list); 566 INIT_LIST_HEAD(&worker->work_list);
546 worker->task = NULL; 567 INIT_LIST_HEAD(&worker->delayed_work_list);
547} 568}
548EXPORT_SYMBOL_GPL(__init_kthread_worker); 569EXPORT_SYMBOL_GPL(__kthread_init_worker);
549 570
550/** 571/**
551 * kthread_worker_fn - kthread function to process kthread_worker 572 * kthread_worker_fn - kthread function to process kthread_worker
552 * @worker_ptr: pointer to initialized kthread_worker 573 * @worker_ptr: pointer to initialized kthread_worker
553 * 574 *
554 * This function can be used as @threadfn to kthread_create() or 575 * This function implements the main cycle of kthread worker. It processes
555 * kthread_run() with @worker_ptr argument pointing to an initialized 576 * work_list until it is stopped with kthread_stop(). It sleeps when the queue
556 * kthread_worker. The started kthread will process work_list until 577 * is empty.
557 * the it is stopped with kthread_stop(). A kthread can also call
558 * this function directly after extra initialization.
559 * 578 *
560 * Different kthreads can be used for the same kthread_worker as long 579 * The works are not allowed to keep any locks, disable preemption or interrupts
561 * as there's only one kthread attached to it at any given time. A 580 * when they finish. A safe point for freezing is defined after one work
562 * kthread_worker without an attached kthread simply collects queued 581 * finishes and before a new one is started.
563 * kthread_works. 582 *
583 * Also the works must not be handled by more than one worker at the same time,
584 * see also kthread_queue_work().
564 */ 585 */
565int kthread_worker_fn(void *worker_ptr) 586int kthread_worker_fn(void *worker_ptr)
566{ 587{
567 struct kthread_worker *worker = worker_ptr; 588 struct kthread_worker *worker = worker_ptr;
568 struct kthread_work *work; 589 struct kthread_work *work;
569 590
570 WARN_ON(worker->task); 591 /*
592 * FIXME: Update the check and remove the assignment when all kthread
593 * worker users are created using kthread_create_worker*() functions.
594 */
595 WARN_ON(worker->task && worker->task != current);
571 worker->task = current; 596 worker->task = current;
597
598 if (worker->flags & KTW_FREEZABLE)
599 set_freezable();
600
572repeat: 601repeat:
573 set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */ 602 set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */
574 603
@@ -601,13 +630,132 @@ repeat:
601} 630}
602EXPORT_SYMBOL_GPL(kthread_worker_fn); 631EXPORT_SYMBOL_GPL(kthread_worker_fn);
603 632
604/* insert @work before @pos in @worker */ 633static struct kthread_worker *
605static void insert_kthread_work(struct kthread_worker *worker, 634__kthread_create_worker(int cpu, unsigned int flags,
606 struct kthread_work *work, 635 const char namefmt[], va_list args)
607 struct list_head *pos) 636{
637 struct kthread_worker *worker;
638 struct task_struct *task;
639
640 worker = kzalloc(sizeof(*worker), GFP_KERNEL);
641 if (!worker)
642 return ERR_PTR(-ENOMEM);
643
644 kthread_init_worker(worker);
645
646 if (cpu >= 0) {
647 char name[TASK_COMM_LEN];
648
649 /*
650 * kthread_create_worker_on_cpu() allows passing a generic
651 * namefmt, in contrast with kthread_create_on_cpu(). We need
652 * to format it here.
653 */
654 vsnprintf(name, sizeof(name), namefmt, args);
655 task = kthread_create_on_cpu(kthread_worker_fn, worker,
656 cpu, name);
657 } else {
658 task = __kthread_create_on_node(kthread_worker_fn, worker,
659 -1, namefmt, args);
660 }
661
662 if (IS_ERR(task))
663 goto fail_task;
664
665 worker->flags = flags;
666 worker->task = task;
667 wake_up_process(task);
668 return worker;
669
670fail_task:
671 kfree(worker);
672 return ERR_CAST(task);
673}
674
675/**
676 * kthread_create_worker - create a kthread worker
677 * @flags: flags modifying the default behavior of the worker
678 * @namefmt: printf-style name for the kthread worker (task).
679 *
680 * Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM)
681 * when the needed structures could not get allocated, and ERR_PTR(-EINTR)
682 * when the worker was SIGKILLed.
683 */
684struct kthread_worker *
685kthread_create_worker(unsigned int flags, const char namefmt[], ...)
686{
687 struct kthread_worker *worker;
688 va_list args;
689
690 va_start(args, namefmt);
691 worker = __kthread_create_worker(-1, flags, namefmt, args);
692 va_end(args);
693
694 return worker;
695}
696EXPORT_SYMBOL(kthread_create_worker);
697
698/**
699 * kthread_create_worker_on_cpu - create a kthread worker and bind it
700 * to a given CPU and the associated NUMA node.
701 * @cpu: CPU number
702 * @flags: flags modifying the default behavior of the worker
703 * @namefmt: printf-style name for the kthread worker (task).
704 *
705 * Use a valid CPU number if you want to bind the kthread worker
706 * to the given CPU and the associated NUMA node.
707 *
708 * A good practice is to also include the cpu number in the worker name.
709 * For example, use kthread_create_worker_on_cpu(cpu, 0, "helper/%d", cpu).
710 *
711 * Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM)
712 * when the needed structures could not get allocated, and ERR_PTR(-EINTR)
713 * when the worker was SIGKILLed.
714 */
715struct kthread_worker *
716kthread_create_worker_on_cpu(int cpu, unsigned int flags,
717 const char namefmt[], ...)
718{
719 struct kthread_worker *worker;
720 va_list args;
721
722 va_start(args, namefmt);
723 worker = __kthread_create_worker(cpu, flags, namefmt, args);
724 va_end(args);
725
726 return worker;
727}
728EXPORT_SYMBOL(kthread_create_worker_on_cpu);
729
730/*
731 * Returns true when the work could not be queued at the moment.
732 * It happens when it is already pending in a worker list
733 * or when it is being cancelled.
734 */
735static inline bool queuing_blocked(struct kthread_worker *worker,
736 struct kthread_work *work)
608{ 737{
609 lockdep_assert_held(&worker->lock); 738 lockdep_assert_held(&worker->lock);
610 739
740 return !list_empty(&work->node) || work->canceling;
741}
742
743static void kthread_insert_work_sanity_check(struct kthread_worker *worker,
744 struct kthread_work *work)
745{
746 lockdep_assert_held(&worker->lock);
747 WARN_ON_ONCE(!list_empty(&work->node));
748 /* Do not use a work with >1 worker, see kthread_queue_work() */
749 WARN_ON_ONCE(work->worker && work->worker != worker);
750}
751
752/* insert @work before @pos in @worker */
753static void kthread_insert_work(struct kthread_worker *worker,
754 struct kthread_work *work,
755 struct list_head *pos)
756{
757 kthread_insert_work_sanity_check(worker, work);
758
611 list_add_tail(&work->node, pos); 759 list_add_tail(&work->node, pos);
612 work->worker = worker; 760 work->worker = worker;
613 if (!worker->current_work && likely(worker->task)) 761 if (!worker->current_work && likely(worker->task))
@@ -615,29 +763,133 @@ static void insert_kthread_work(struct kthread_worker *worker,
615} 763}
616 764
617/** 765/**
618 * queue_kthread_work - queue a kthread_work 766 * kthread_queue_work - queue a kthread_work
619 * @worker: target kthread_worker 767 * @worker: target kthread_worker
620 * @work: kthread_work to queue 768 * @work: kthread_work to queue
621 * 769 *
622 * Queue @work to work processor @task for async execution. @task 770 * Queue @work to work processor @task for async execution. @task
623 * must have been created with kthread_worker_create(). Returns %true 771 * must have been created with kthread_worker_create(). Returns %true
624 * if @work was successfully queued, %false if it was already pending. 772 * if @work was successfully queued, %false if it was already pending.
773 *
774 * Reinitialize the work if it needs to be used by another worker.
775 * For example, when the worker was stopped and started again.
625 */ 776 */
626bool queue_kthread_work(struct kthread_worker *worker, 777bool kthread_queue_work(struct kthread_worker *worker,
627 struct kthread_work *work) 778 struct kthread_work *work)
628{ 779{
629 bool ret = false; 780 bool ret = false;
630 unsigned long flags; 781 unsigned long flags;
631 782
632 spin_lock_irqsave(&worker->lock, flags); 783 spin_lock_irqsave(&worker->lock, flags);
633 if (list_empty(&work->node)) { 784 if (!queuing_blocked(worker, work)) {
634 insert_kthread_work(worker, work, &worker->work_list); 785 kthread_insert_work(worker, work, &worker->work_list);
786 ret = true;
787 }
788 spin_unlock_irqrestore(&worker->lock, flags);
789 return ret;
790}
791EXPORT_SYMBOL_GPL(kthread_queue_work);
792
793/**
794 * kthread_delayed_work_timer_fn - callback that queues the associated kthread
795 * delayed work when the timer expires.
796 * @__data: pointer to the data associated with the timer
797 *
798 * The format of the function is defined by struct timer_list.
799 * It should have been called from an irqsafe timer with irqs already off.
800 */
801void kthread_delayed_work_timer_fn(unsigned long __data)
802{
803 struct kthread_delayed_work *dwork =
804 (struct kthread_delayed_work *)__data;
805 struct kthread_work *work = &dwork->work;
806 struct kthread_worker *worker = work->worker;
807
808 /*
809 * This might happen when a pending work is reinitialized.
810 * It means that it is being used in a wrong way.
811 */
812 if (WARN_ON_ONCE(!worker))
813 return;
814
815 spin_lock(&worker->lock);
816 /* Work must not be used with >1 worker, see kthread_queue_work(). */
817 WARN_ON_ONCE(work->worker != worker);
818
819 /* Move the work from worker->delayed_work_list. */
820 WARN_ON_ONCE(list_empty(&work->node));
821 list_del_init(&work->node);
822 kthread_insert_work(worker, work, &worker->work_list);
823
824 spin_unlock(&worker->lock);
825}
826EXPORT_SYMBOL(kthread_delayed_work_timer_fn);
827
828void __kthread_queue_delayed_work(struct kthread_worker *worker,
829 struct kthread_delayed_work *dwork,
830 unsigned long delay)
831{
832 struct timer_list *timer = &dwork->timer;
833 struct kthread_work *work = &dwork->work;
834
835 WARN_ON_ONCE(timer->function != kthread_delayed_work_timer_fn ||
836 timer->data != (unsigned long)dwork);
837
838 /*
839 * If @delay is 0, queue @dwork->work immediately. This is for
840 * both optimization and correctness. The earliest @timer can
841 * expire is on the closest next tick and delayed_work users depend
842 * on that there's no such delay when @delay is 0.
843 */
844 if (!delay) {
845 kthread_insert_work(worker, work, &worker->work_list);
846 return;
847 }
848
849 /* Be paranoid and try to detect possible races already now. */
850 kthread_insert_work_sanity_check(worker, work);
851
852 list_add(&work->node, &worker->delayed_work_list);
853 work->worker = worker;
854 timer_stats_timer_set_start_info(&dwork->timer);
855 timer->expires = jiffies + delay;
856 add_timer(timer);
857}
858
859/**
860 * kthread_queue_delayed_work - queue the associated kthread work
861 * after a delay.
862 * @worker: target kthread_worker
863 * @dwork: kthread_delayed_work to queue
864 * @delay: number of jiffies to wait before queuing
865 *
866 * If the work is not pending, it starts a timer that will queue
867 * the work after the given @delay. If @delay is zero, it queues the
868 * work immediately.
869 *
870 * Return: %false if the @work was already pending. It means that
871 * either the timer was running or the work was queued. It returns %true
872 * otherwise.
873 */
874bool kthread_queue_delayed_work(struct kthread_worker *worker,
875 struct kthread_delayed_work *dwork,
876 unsigned long delay)
877{
878 struct kthread_work *work = &dwork->work;
879 unsigned long flags;
880 bool ret = false;
881
882 spin_lock_irqsave(&worker->lock, flags);
883
884 if (!queuing_blocked(worker, work)) {
885 __kthread_queue_delayed_work(worker, dwork, delay);
635 ret = true; 886 ret = true;
636 } 887 }
888
637 spin_unlock_irqrestore(&worker->lock, flags); 889 spin_unlock_irqrestore(&worker->lock, flags);
638 return ret; 890 return ret;
639} 891}
640EXPORT_SYMBOL_GPL(queue_kthread_work); 892EXPORT_SYMBOL_GPL(kthread_queue_delayed_work);
641 893
642struct kthread_flush_work { 894struct kthread_flush_work {
643 struct kthread_work work; 895 struct kthread_work work;
@@ -652,12 +904,12 @@ static void kthread_flush_work_fn(struct kthread_work *work)
652} 904}
653 905
654/** 906/**
655 * flush_kthread_work - flush a kthread_work 907 * kthread_flush_work - flush a kthread_work
656 * @work: work to flush 908 * @work: work to flush
657 * 909 *
658 * If @work is queued or executing, wait for it to finish execution. 910 * If @work is queued or executing, wait for it to finish execution.
659 */ 911 */
660void flush_kthread_work(struct kthread_work *work) 912void kthread_flush_work(struct kthread_work *work)
661{ 913{
662 struct kthread_flush_work fwork = { 914 struct kthread_flush_work fwork = {
663 KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), 915 KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn),
@@ -666,21 +918,19 @@ void flush_kthread_work(struct kthread_work *work)
666 struct kthread_worker *worker; 918 struct kthread_worker *worker;
667 bool noop = false; 919 bool noop = false;
668 920
669retry:
670 worker = work->worker; 921 worker = work->worker;
671 if (!worker) 922 if (!worker)
672 return; 923 return;
673 924
674 spin_lock_irq(&worker->lock); 925 spin_lock_irq(&worker->lock);
675 if (work->worker != worker) { 926 /* Work must not be used with >1 worker, see kthread_queue_work(). */
676 spin_unlock_irq(&worker->lock); 927 WARN_ON_ONCE(work->worker != worker);
677 goto retry;
678 }
679 928
680 if (!list_empty(&work->node)) 929 if (!list_empty(&work->node))
681 insert_kthread_work(worker, &fwork.work, work->node.next); 930 kthread_insert_work(worker, &fwork.work, work->node.next);
682 else if (worker->current_work == work) 931 else if (worker->current_work == work)
683 insert_kthread_work(worker, &fwork.work, worker->work_list.next); 932 kthread_insert_work(worker, &fwork.work,
933 worker->work_list.next);
684 else 934 else
685 noop = true; 935 noop = true;
686 936
@@ -689,23 +939,214 @@ retry:
689 if (!noop) 939 if (!noop)
690 wait_for_completion(&fwork.done); 940 wait_for_completion(&fwork.done);
691} 941}
692EXPORT_SYMBOL_GPL(flush_kthread_work); 942EXPORT_SYMBOL_GPL(kthread_flush_work);
943
944/*
945 * This function removes the work from the worker queue. Also it makes sure
946 * that it won't get queued later via the delayed work's timer.
947 *
948 * The work might still be in use when this function finishes. See the
949 * current_work processed by the worker.
950 *
951 * Return: %true if @work was pending and successfully canceled,
952 * %false if @work was not pending
953 */
954static bool __kthread_cancel_work(struct kthread_work *work, bool is_dwork,
955 unsigned long *flags)
956{
957 /* Try to cancel the timer if exists. */
958 if (is_dwork) {
959 struct kthread_delayed_work *dwork =
960 container_of(work, struct kthread_delayed_work, work);
961 struct kthread_worker *worker = work->worker;
962
963 /*
964 * del_timer_sync() must be called to make sure that the timer
965 * callback is not running. The lock must be temporarily released
966 * to avoid a deadlock with the callback. In the meantime,
967 * any queuing is blocked by setting the canceling counter.
968 */
969 work->canceling++;
970 spin_unlock_irqrestore(&worker->lock, *flags);
971 del_timer_sync(&dwork->timer);
972 spin_lock_irqsave(&worker->lock, *flags);
973 work->canceling--;
974 }
975
976 /*
977 * Try to remove the work from a worker list. It might either
978 * be from worker->work_list or from worker->delayed_work_list.
979 */
980 if (!list_empty(&work->node)) {
981 list_del_init(&work->node);
982 return true;
983 }
984
985 return false;
986}
693 987
694/** 988/**
695 * flush_kthread_worker - flush all current works on a kthread_worker 989 * kthread_mod_delayed_work - modify delay of or queue a kthread delayed work
990 * @worker: kthread worker to use
991 * @dwork: kthread delayed work to queue
992 * @delay: number of jiffies to wait before queuing
993 *
994 * If @dwork is idle, equivalent to kthread_queue_delayed_work(). Otherwise,
995 * modify @dwork's timer so that it expires after @delay. If @delay is zero,
996 * @work is guaranteed to be queued immediately.
997 *
998 * Return: %true if @dwork was pending and its timer was modified,
999 * %false otherwise.
1000 *
1001 * A special case is when the work is being canceled in parallel.
1002 * It might be caused either by the real kthread_cancel_delayed_work_sync()
1003 * or yet another kthread_mod_delayed_work() call. We let the other command
1004 * win and return %false here. The caller is supposed to synchronize these
1005 * operations in a reasonable way.
1006 *
1007 * This function is safe to call from any context including IRQ handler.
1008 * See __kthread_cancel_work() and kthread_delayed_work_timer_fn()
1009 * for details.
1010 */
1011bool kthread_mod_delayed_work(struct kthread_worker *worker,
1012 struct kthread_delayed_work *dwork,
1013 unsigned long delay)
1014{
1015 struct kthread_work *work = &dwork->work;
1016 unsigned long flags;
1017 int ret = false;
1018
1019 spin_lock_irqsave(&worker->lock, flags);
1020
1021 /* Do not bother with canceling when never queued. */
1022 if (!work->worker)
1023 goto fast_queue;
1024
1025 /* Work must not be used with >1 worker, see kthread_queue_work() */
1026 WARN_ON_ONCE(work->worker != worker);
1027
1028 /* Do not fight with another command that is canceling this work. */
1029 if (work->canceling)
1030 goto out;
1031
1032 ret = __kthread_cancel_work(work, true, &flags);
1033fast_queue:
1034 __kthread_queue_delayed_work(worker, dwork, delay);
1035out:
1036 spin_unlock_irqrestore(&worker->lock, flags);
1037 return ret;
1038}
1039EXPORT_SYMBOL_GPL(kthread_mod_delayed_work);
1040
1041static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork)
1042{
1043 struct kthread_worker *worker = work->worker;
1044 unsigned long flags;
1045 int ret = false;
1046
1047 if (!worker)
1048 goto out;
1049
1050 spin_lock_irqsave(&worker->lock, flags);
1051 /* Work must not be used with >1 worker, see kthread_queue_work(). */
1052 WARN_ON_ONCE(work->worker != worker);
1053
1054 ret = __kthread_cancel_work(work, is_dwork, &flags);
1055
1056 if (worker->current_work != work)
1057 goto out_fast;
1058
1059 /*
1060 * The work is in progress and we need to wait with the lock released.
1061 * In the meantime, block any queuing by setting the canceling counter.
1062 */
1063 work->canceling++;
1064 spin_unlock_irqrestore(&worker->lock, flags);
1065 kthread_flush_work(work);
1066 spin_lock_irqsave(&worker->lock, flags);
1067 work->canceling--;
1068
1069out_fast:
1070 spin_unlock_irqrestore(&worker->lock, flags);
1071out:
1072 return ret;
1073}
1074
1075/**
1076 * kthread_cancel_work_sync - cancel a kthread work and wait for it to finish
1077 * @work: the kthread work to cancel
1078 *
1079 * Cancel @work and wait for its execution to finish. This function
1080 * can be used even if the work re-queues itself. On return from this
1081 * function, @work is guaranteed to be not pending or executing on any CPU.
1082 *
1083 * kthread_cancel_work_sync(&delayed_work->work) must not be used for
1084 * delayed_work's. Use kthread_cancel_delayed_work_sync() instead.
1085 *
1086 * The caller must ensure that the worker on which @work was last
1087 * queued can't be destroyed before this function returns.
1088 *
1089 * Return: %true if @work was pending, %false otherwise.
1090 */
1091bool kthread_cancel_work_sync(struct kthread_work *work)
1092{
1093 return __kthread_cancel_work_sync(work, false);
1094}
1095EXPORT_SYMBOL_GPL(kthread_cancel_work_sync);
1096
1097/**
1098 * kthread_cancel_delayed_work_sync - cancel a kthread delayed work and
1099 * wait for it to finish.
1100 * @dwork: the kthread delayed work to cancel
1101 *
1102 * This is kthread_cancel_work_sync() for delayed works.
1103 *
1104 * Return: %true if @dwork was pending, %false otherwise.
1105 */
1106bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *dwork)
1107{
1108 return __kthread_cancel_work_sync(&dwork->work, true);
1109}
1110EXPORT_SYMBOL_GPL(kthread_cancel_delayed_work_sync);
1111
1112/**
1113 * kthread_flush_worker - flush all current works on a kthread_worker
696 * @worker: worker to flush 1114 * @worker: worker to flush
697 * 1115 *
698 * Wait until all currently executing or pending works on @worker are 1116 * Wait until all currently executing or pending works on @worker are
699 * finished. 1117 * finished.
700 */ 1118 */
701void flush_kthread_worker(struct kthread_worker *worker) 1119void kthread_flush_worker(struct kthread_worker *worker)
702{ 1120{
703 struct kthread_flush_work fwork = { 1121 struct kthread_flush_work fwork = {
704 KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), 1122 KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn),
705 COMPLETION_INITIALIZER_ONSTACK(fwork.done), 1123 COMPLETION_INITIALIZER_ONSTACK(fwork.done),
706 }; 1124 };
707 1125
708 queue_kthread_work(worker, &fwork.work); 1126 kthread_queue_work(worker, &fwork.work);
709 wait_for_completion(&fwork.done); 1127 wait_for_completion(&fwork.done);
710} 1128}
711EXPORT_SYMBOL_GPL(flush_kthread_worker); 1129EXPORT_SYMBOL_GPL(kthread_flush_worker);
1130
1131/**
1132 * kthread_destroy_worker - destroy a kthread worker
1133 * @worker: worker to be destroyed
1134 *
1135 * Flush and destroy @worker. The simple flush is enough because the kthread
1136 * worker API is used only in trivial scenarios. There are no multi-step state
1137 * machines needed.
1138 */
1139void kthread_destroy_worker(struct kthread_worker *worker)
1140{
1141 struct task_struct *task;
1142
1143 task = worker->task;
1144 if (WARN_ON(!task))
1145 return;
1146
1147 kthread_flush_worker(worker);
1148 kthread_stop(task);
1149 WARN_ON(!list_empty(&worker->work_list));
1150 kfree(worker);
1151}
1152EXPORT_SYMBOL(kthread_destroy_worker);
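
End-to-end usage sketch (not part of the patch) of the reworked worker API added above, assuming the matching kthread_init_work()/kthread_init_delayed_work() helpers from the same series are available in <linux/kthread.h>; names and delays are illustrative and error handling is minimal:

#include <linux/err.h>
#include <linux/jiffies.h>
#include <linux/kthread.h>

static void demo_work_fn(struct kthread_work *work)
{
        /* runs in the worker's kthread context */
}

static int demo_worker_usage(void)
{
        struct kthread_worker *worker;
        struct kthread_work work;
        struct kthread_delayed_work dwork;

        worker = kthread_create_worker(0, "demo_worker");
        if (IS_ERR(worker))
                return PTR_ERR(worker);

        kthread_init_work(&work, demo_work_fn);
        kthread_init_delayed_work(&dwork, demo_work_fn);

        kthread_queue_work(worker, &work);
        kthread_queue_delayed_work(worker, &dwork, msecs_to_jiffies(100));

        /* push the delayed work out, then cancel whatever is still pending */
        kthread_mod_delayed_work(worker, &dwork, msecs_to_jiffies(500));
        kthread_cancel_delayed_work_sync(&dwork);
        kthread_cancel_work_sync(&work);

        kthread_destroy_worker(worker); /* flushes and stops the kthread */
        return 0;
}
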
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index 8bbe50704621..af4643873e71 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -274,7 +274,6 @@ static int klp_write_object_relocations(struct module *pmod,
274 274
275 objname = klp_is_module(obj) ? obj->name : "vmlinux"; 275 objname = klp_is_module(obj) ? obj->name : "vmlinux";
276 276
277 module_disable_ro(pmod);
278 /* For each klp relocation section */ 277 /* For each klp relocation section */
279 for (i = 1; i < pmod->klp_info->hdr.e_shnum; i++) { 278 for (i = 1; i < pmod->klp_info->hdr.e_shnum; i++) {
280 sec = pmod->klp_info->sechdrs + i; 279 sec = pmod->klp_info->sechdrs + i;
@@ -309,7 +308,6 @@ static int klp_write_object_relocations(struct module *pmod,
309 break; 308 break;
310 } 309 }
311 310
312 module_enable_ro(pmod, true);
313 return ret; 311 return ret;
314} 312}
315 313
@@ -547,9 +545,6 @@ static int __klp_enable_patch(struct klp_patch *patch)
547 list_prev_entry(patch, list)->state == KLP_DISABLED) 545 list_prev_entry(patch, list)->state == KLP_DISABLED)
548 return -EBUSY; 546 return -EBUSY;
549 547
550 pr_notice_once("tainting kernel with TAINT_LIVEPATCH\n");
551 add_taint(TAINT_LIVEPATCH, LOCKDEP_STILL_OK);
552
553 pr_notice("enabling patch '%s'\n", patch->mod->name); 548 pr_notice("enabling patch '%s'\n", patch->mod->name);
554 549
555 klp_for_each_object(patch, obj) { 550 klp_for_each_object(patch, obj) {
@@ -763,6 +758,12 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func)
763 func->old_sympos ? func->old_sympos : 1); 758 func->old_sympos ? func->old_sympos : 1);
764} 759}
765 760
761/* Arches may override this to finish any remaining arch-specific tasks */
762void __weak arch_klp_init_object_loaded(struct klp_patch *patch,
763 struct klp_object *obj)
764{
765}
766
766/* parts of the initialization that is done only when the object is loaded */ 767/* parts of the initialization that is done only when the object is loaded */
767static int klp_init_object_loaded(struct klp_patch *patch, 768static int klp_init_object_loaded(struct klp_patch *patch,
768 struct klp_object *obj) 769 struct klp_object *obj)
@@ -770,9 +771,15 @@ static int klp_init_object_loaded(struct klp_patch *patch,
770 struct klp_func *func; 771 struct klp_func *func;
771 int ret; 772 int ret;
772 773
774 module_disable_ro(patch->mod);
773 ret = klp_write_object_relocations(patch->mod, obj); 775 ret = klp_write_object_relocations(patch->mod, obj);
774 if (ret) 776 if (ret) {
777 module_enable_ro(patch->mod, true);
775 return ret; 778 return ret;
779 }
780
781 arch_klp_init_object_loaded(patch, obj);
782 module_enable_ro(patch->mod, true);
776 783
777 klp_for_each_func(obj, func) { 784 klp_for_each_func(obj, func) {
778 ret = klp_find_object_symbol(obj->name, func->old_name, 785 ret = klp_find_object_symbol(obj->name, func->old_name,
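
Illustrative sketch (not part of the patch): an architecture can override the __weak hook introduced above to do arch-specific fixups once the patched object is loaded; the body here is a placeholder, not the real implementation of any architecture:

#include <linux/livepatch.h>

void arch_klp_init_object_loaded(struct klp_patch *patch,
                                 struct klp_object *obj)
{
        /* e.g. apply arch-specific relocations or alternatives for obj */
}
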
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index 31322a4275cd..6f88e352cd4f 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -18,7 +18,6 @@ obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
18endif 18endif
19obj-$(CONFIG_SMP) += spinlock.o 19obj-$(CONFIG_SMP) += spinlock.o
20obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o 20obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o
21obj-$(CONFIG_SMP) += lglock.o
22obj-$(CONFIG_PROVE_LOCKING) += spinlock.o 21obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
23obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o 22obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o
24obj-$(CONFIG_RT_MUTEXES) += rtmutex.o 23obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
diff --git a/kernel/locking/lglock.c b/kernel/locking/lglock.c
deleted file mode 100644
index 951cfcd10b4a..000000000000
--- a/kernel/locking/lglock.c
+++ /dev/null
@@ -1,111 +0,0 @@
1/* See include/linux/lglock.h for description */
2#include <linux/module.h>
3#include <linux/lglock.h>
4#include <linux/cpu.h>
5#include <linux/string.h>
6
7/*
8 * Note there is no uninit, so lglocks cannot be defined in
9 * modules (but it's fine to use them from there)
10 * Could be added though, just undo lg_lock_init
11 */
12
13void lg_lock_init(struct lglock *lg, char *name)
14{
15 LOCKDEP_INIT_MAP(&lg->lock_dep_map, name, &lg->lock_key, 0);
16}
17EXPORT_SYMBOL(lg_lock_init);
18
19void lg_local_lock(struct lglock *lg)
20{
21 arch_spinlock_t *lock;
22
23 preempt_disable();
24 lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
25 lock = this_cpu_ptr(lg->lock);
26 arch_spin_lock(lock);
27}
28EXPORT_SYMBOL(lg_local_lock);
29
30void lg_local_unlock(struct lglock *lg)
31{
32 arch_spinlock_t *lock;
33
34 lock_release(&lg->lock_dep_map, 1, _RET_IP_);
35 lock = this_cpu_ptr(lg->lock);
36 arch_spin_unlock(lock);
37 preempt_enable();
38}
39EXPORT_SYMBOL(lg_local_unlock);
40
41void lg_local_lock_cpu(struct lglock *lg, int cpu)
42{
43 arch_spinlock_t *lock;
44
45 preempt_disable();
46 lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
47 lock = per_cpu_ptr(lg->lock, cpu);
48 arch_spin_lock(lock);
49}
50EXPORT_SYMBOL(lg_local_lock_cpu);
51
52void lg_local_unlock_cpu(struct lglock *lg, int cpu)
53{
54 arch_spinlock_t *lock;
55
56 lock_release(&lg->lock_dep_map, 1, _RET_IP_);
57 lock = per_cpu_ptr(lg->lock, cpu);
58 arch_spin_unlock(lock);
59 preempt_enable();
60}
61EXPORT_SYMBOL(lg_local_unlock_cpu);
62
63void lg_double_lock(struct lglock *lg, int cpu1, int cpu2)
64{
65 BUG_ON(cpu1 == cpu2);
66
67 /* lock in cpu order, just like lg_global_lock */
68 if (cpu2 < cpu1)
69 swap(cpu1, cpu2);
70
71 preempt_disable();
72 lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
73 arch_spin_lock(per_cpu_ptr(lg->lock, cpu1));
74 arch_spin_lock(per_cpu_ptr(lg->lock, cpu2));
75}
76
77void lg_double_unlock(struct lglock *lg, int cpu1, int cpu2)
78{
79 lock_release(&lg->lock_dep_map, 1, _RET_IP_);
80 arch_spin_unlock(per_cpu_ptr(lg->lock, cpu1));
81 arch_spin_unlock(per_cpu_ptr(lg->lock, cpu2));
82 preempt_enable();
83}
84
85void lg_global_lock(struct lglock *lg)
86{
87 int i;
88
89 preempt_disable();
90 lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
91 for_each_possible_cpu(i) {
92 arch_spinlock_t *lock;
93 lock = per_cpu_ptr(lg->lock, i);
94 arch_spin_lock(lock);
95 }
96}
97EXPORT_SYMBOL(lg_global_lock);
98
99void lg_global_unlock(struct lglock *lg)
100{
101 int i;
102
103 lock_release(&lg->lock_dep_map, 1, _RET_IP_);
104 for_each_possible_cpu(i) {
105 arch_spinlock_t *lock;
106 lock = per_cpu_ptr(lg->lock, i);
107 arch_spin_unlock(lock);
108 }
109 preempt_enable();
110}
111EXPORT_SYMBOL(lg_global_unlock);
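
Editorial illustration (not part of the patch): lglock.c is deleted because it no longer has users; the per-CPU rwsem (reworked later in this diff) covers the same cheap per-CPU reader / expensive global writer pattern. A rough mapping of the old calls onto that API, with invented names:

#include <linux/init.h>
#include <linux/percpu-rwsem.h>

static struct percpu_rw_semaphore demo_lock;

static int __init demo_lock_init(void)
{
        return percpu_init_rwsem(&demo_lock);
}

static void demo_reader(void)
{
        percpu_down_read(&demo_lock);   /* was lg_local_lock() */
        /* per-CPU fast-path critical section */
        percpu_up_read(&demo_lock);     /* was lg_local_unlock() */
}

static void demo_writer(void)
{
        percpu_down_write(&demo_lock);  /* was lg_global_lock() */
        /* excludes readers on all CPUs */
        percpu_up_write(&demo_lock);    /* was lg_global_unlock() */
}
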
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h
index 51c4b24b6328..c2b88490d857 100644
--- a/kernel/locking/lockdep_internals.h
+++ b/kernel/locking/lockdep_internals.h
@@ -46,6 +46,14 @@ enum {
46 (LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ) 46 (LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ)
47 47
48/* 48/*
49 * CONFIG_PROVE_LOCKING_SMALL is defined for sparc. Sparc requires .text,
50 * .data and .bss to fit in the required 32MB limit for the kernel. With
51 * PROVE_LOCKING we could go over this limit and cause system boot-up problems.
52 * So, reduce the static allocations for lockdep-related structures so that
53 * everything fits in the current required size limit.
54 */
55#ifdef CONFIG_PROVE_LOCKING_SMALL
56/*
49 * MAX_LOCKDEP_ENTRIES is the maximum number of lock dependencies 57 * MAX_LOCKDEP_ENTRIES is the maximum number of lock dependencies
50 * we track. 58 * we track.
51 * 59 *
@@ -54,18 +62,24 @@ enum {
54 * table (if it's not there yet), and we check it for lock order 62 * table (if it's not there yet), and we check it for lock order
55 * conflicts and deadlocks. 63 * conflicts and deadlocks.
56 */ 64 */
65#define MAX_LOCKDEP_ENTRIES 16384UL
66#define MAX_LOCKDEP_CHAINS_BITS 15
67#define MAX_STACK_TRACE_ENTRIES 262144UL
68#else
57#define MAX_LOCKDEP_ENTRIES 32768UL 69#define MAX_LOCKDEP_ENTRIES 32768UL
58 70
59#define MAX_LOCKDEP_CHAINS_BITS 16 71#define MAX_LOCKDEP_CHAINS_BITS 16
60#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS)
61
62#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5)
63 72
64/* 73/*
65 * Stack-trace: tightly packed array of stack backtrace 74 * Stack-trace: tightly packed array of stack backtrace
66 * addresses. Protected by the hash_lock. 75 * addresses. Protected by the hash_lock.
67 */ 76 */
68#define MAX_STACK_TRACE_ENTRIES 524288UL 77#define MAX_STACK_TRACE_ENTRIES 524288UL
78#endif
79
80#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS)
81
82#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5)
69 83
70extern struct list_head all_lock_classes; 84extern struct list_head all_lock_classes;
71extern struct lock_chain lock_chains[]; 85extern struct lock_chain lock_chains[];
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index bec0b647f9cc..ce182599cf2e 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -8,152 +8,186 @@
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <linux/errno.h> 9#include <linux/errno.h>
10 10
11int __percpu_init_rwsem(struct percpu_rw_semaphore *brw, 11int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
12 const char *name, struct lock_class_key *rwsem_key) 12 const char *name, struct lock_class_key *rwsem_key)
13{ 13{
14 brw->fast_read_ctr = alloc_percpu(int); 14 sem->read_count = alloc_percpu(int);
15 if (unlikely(!brw->fast_read_ctr)) 15 if (unlikely(!sem->read_count))
16 return -ENOMEM; 16 return -ENOMEM;
17 17
18 /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ 18 /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
19 __init_rwsem(&brw->rw_sem, name, rwsem_key); 19 rcu_sync_init(&sem->rss, RCU_SCHED_SYNC);
20 rcu_sync_init(&brw->rss, RCU_SCHED_SYNC); 20 __init_rwsem(&sem->rw_sem, name, rwsem_key);
21 atomic_set(&brw->slow_read_ctr, 0); 21 init_waitqueue_head(&sem->writer);
22 init_waitqueue_head(&brw->write_waitq); 22 sem->readers_block = 0;
23 return 0; 23 return 0;
24} 24}
25EXPORT_SYMBOL_GPL(__percpu_init_rwsem); 25EXPORT_SYMBOL_GPL(__percpu_init_rwsem);
26 26
27void percpu_free_rwsem(struct percpu_rw_semaphore *brw) 27void percpu_free_rwsem(struct percpu_rw_semaphore *sem)
28{ 28{
29 /* 29 /*
30 * XXX: temporary kludge. The error path in alloc_super() 30 * XXX: temporary kludge. The error path in alloc_super()
31 * assumes that percpu_free_rwsem() is safe after kzalloc(). 31 * assumes that percpu_free_rwsem() is safe after kzalloc().
32 */ 32 */
33 if (!brw->fast_read_ctr) 33 if (!sem->read_count)
34 return; 34 return;
35 35
36 rcu_sync_dtor(&brw->rss); 36 rcu_sync_dtor(&sem->rss);
37 free_percpu(brw->fast_read_ctr); 37 free_percpu(sem->read_count);
38 brw->fast_read_ctr = NULL; /* catch use after free bugs */ 38 sem->read_count = NULL; /* catch use after free bugs */
39} 39}
40EXPORT_SYMBOL_GPL(percpu_free_rwsem); 40EXPORT_SYMBOL_GPL(percpu_free_rwsem);
41 41
42/* 42int __percpu_down_read(struct percpu_rw_semaphore *sem, int try)
43 * This is the fast-path for down_read/up_read. If it succeeds we rely
44 * on the barriers provided by rcu_sync_enter/exit; see the comments in
45 * percpu_down_write() and percpu_up_write().
46 *
47 * If this helper fails the callers rely on the normal rw_semaphore and
48 * atomic_dec_and_test(), so in this case we have the necessary barriers.
49 */
50static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val)
51{ 43{
52 bool success; 44 /*
45 * Due to having preemption disabled the decrement happens on
46 * the same CPU as the increment, avoiding the
47 * increment-on-one-CPU-and-decrement-on-another problem.
48 *
49 * If the reader misses the writer's assignment of readers_block, then
50 * the writer is guaranteed to see the reader's increment.
51 *
52 * Conversely, any readers that increment their sem->read_count after
53 * the writer looks are guaranteed to see the readers_block value,
54 * which in turn means that they are guaranteed to immediately
55 * decrement their sem->read_count, so that it doesn't matter that the
56 * writer missed them.
57 */
53 58
54 preempt_disable(); 59 smp_mb(); /* A matches D */
55 success = rcu_sync_is_idle(&brw->rss);
56 if (likely(success))
57 __this_cpu_add(*brw->fast_read_ctr, val);
58 preempt_enable();
59 60
60 return success; 61 /*
61} 62 * If !readers_block the critical section starts here, matched by the
63 * release in percpu_up_write().
64 */
65 if (likely(!smp_load_acquire(&sem->readers_block)))
66 return 1;
62 67
63/* 68 /*
64 * Like the normal down_read() this is not recursive, the writer can 69 * Per the above comment; we still have preemption disabled and
65 * come after the first percpu_down_read() and create the deadlock. 70 * will thus decrement on the same CPU as we incremented.
66 * 71 */
67 * Note: returns with lock_is_held(brw->rw_sem) == T for lockdep, 72 __percpu_up_read(sem);
68 * percpu_up_read() does rwsem_release(). This pairs with the usage
69 * of ->rw_sem in percpu_down/up_write().
70 */
71void percpu_down_read(struct percpu_rw_semaphore *brw)
72{
73 might_sleep();
74 rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_);
75 73
76 if (likely(update_fast_ctr(brw, +1))) 74 if (try)
77 return; 75 return 0;
78 76
79 /* Avoid rwsem_acquire_read() and rwsem_release() */ 77 /*
80 __down_read(&brw->rw_sem); 78 * We either call schedule() in the wait, or we'll fall through
81 atomic_inc(&brw->slow_read_ctr); 79 * and reschedule on the preempt_enable() in percpu_down_read().
82 __up_read(&brw->rw_sem); 80 */
83} 81 preempt_enable_no_resched();
84EXPORT_SYMBOL_GPL(percpu_down_read);
85 82
86int percpu_down_read_trylock(struct percpu_rw_semaphore *brw) 83 /*
87{ 84 * Avoid lockdep for the down/up_read(); we already have them.
88 if (unlikely(!update_fast_ctr(brw, +1))) { 85 */
89 if (!__down_read_trylock(&brw->rw_sem)) 86 __down_read(&sem->rw_sem);
90 return 0; 87 this_cpu_inc(*sem->read_count);
91 atomic_inc(&brw->slow_read_ctr); 88 __up_read(&sem->rw_sem);
92 __up_read(&brw->rw_sem); 89
93 } 90 preempt_disable();
94
95 rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 1, _RET_IP_);
96 return 1; 91 return 1;
97} 92}
93EXPORT_SYMBOL_GPL(__percpu_down_read);
98 94
99void percpu_up_read(struct percpu_rw_semaphore *brw) 95void __percpu_up_read(struct percpu_rw_semaphore *sem)
100{ 96{
101 rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_); 97 smp_mb(); /* B matches C */
102 98 /*
103 if (likely(update_fast_ctr(brw, -1))) 99 * In other words, if they see our decrement (presumably to aggregate
104 return; 100 * zero, as that is the only time it matters) they will also see our
101 * critical section.
102 */
103 __this_cpu_dec(*sem->read_count);
105 104
106 /* false-positive is possible but harmless */ 105 /* Prod writer to recheck readers_active */
107 if (atomic_dec_and_test(&brw->slow_read_ctr)) 106 wake_up(&sem->writer);
108 wake_up_all(&brw->write_waitq);
109} 107}
110EXPORT_SYMBOL_GPL(percpu_up_read); 108EXPORT_SYMBOL_GPL(__percpu_up_read);
109
110#define per_cpu_sum(var) \
111({ \
112 typeof(var) __sum = 0; \
113 int cpu; \
114 compiletime_assert_atomic_type(__sum); \
115 for_each_possible_cpu(cpu) \
116 __sum += per_cpu(var, cpu); \
117 __sum; \
118})
111 119
112static int clear_fast_ctr(struct percpu_rw_semaphore *brw) 120/*
121 * Return true if the modular sum of the sem->read_count per-CPU variable is
122 * zero. If this sum is zero, then it is stable due to the fact that if any
123 * newly arriving readers increment a given counter, they will immediately
124 * decrement that same counter.
125 */
126static bool readers_active_check(struct percpu_rw_semaphore *sem)
113{ 127{
114 unsigned int sum = 0; 128 if (per_cpu_sum(*sem->read_count) != 0)
115 int cpu; 129 return false;
130
131 /*
132 * If we observed the decrement; ensure we see the entire critical
133 * section.
134 */
116 135
117 for_each_possible_cpu(cpu) { 136 smp_mb(); /* C matches B */
118 sum += per_cpu(*brw->fast_read_ctr, cpu);
119 per_cpu(*brw->fast_read_ctr, cpu) = 0;
120 }
121 137
122 return sum; 138 return true;
123} 139}
124 140
125void percpu_down_write(struct percpu_rw_semaphore *brw) 141void percpu_down_write(struct percpu_rw_semaphore *sem)
126{ 142{
143 /* Notify readers to take the slow path. */
144 rcu_sync_enter(&sem->rss);
145
146 down_write(&sem->rw_sem);
147
127 /* 148 /*
128 * Make rcu_sync_is_idle() == F and thus disable the fast-path in 149 * Notify new readers to block; up until now, and thus throughout the
129 * percpu_down_read() and percpu_up_read(), and wait for gp pass. 150 * longish rcu_sync_enter() above, new readers could still come in.
130 *
131 * The latter synchronises us with the preceding readers which used
132 * the fast-past, so we can not miss the result of __this_cpu_add()
133 * or anything else inside their criticial sections.
134 */ 151 */
135 rcu_sync_enter(&brw->rss); 152 WRITE_ONCE(sem->readers_block, 1);
136 153
137 /* exclude other writers, and block the new readers completely */ 154 smp_mb(); /* D matches A */
138 down_write(&brw->rw_sem);
139 155
140 /* nobody can use fast_read_ctr, move its sum into slow_read_ctr */ 156 /*
141 atomic_add(clear_fast_ctr(brw), &brw->slow_read_ctr); 157 * If they don't see our writer of readers_block, then we are
158 * guaranteed to see their sem->read_count increment, and therefore
159 * will wait for them.
160 */
142 161
143 /* wait for all readers to complete their percpu_up_read() */ 162 /* Wait for all now active readers to complete. */
144 wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr)); 163 wait_event(sem->writer, readers_active_check(sem));
145} 164}
146EXPORT_SYMBOL_GPL(percpu_down_write); 165EXPORT_SYMBOL_GPL(percpu_down_write);
147 166
148void percpu_up_write(struct percpu_rw_semaphore *brw) 167void percpu_up_write(struct percpu_rw_semaphore *sem)
149{ 168{
150 /* release the lock, but the readers can't use the fast-path */
151 up_write(&brw->rw_sem);
152 /* 169 /*
153 * Enable the fast-path in percpu_down_read() and percpu_up_read() 170 * Signal the writer is done, no fast path yet.
154 * but only after another gp pass; this adds the necessary barrier 171 *
155 * to ensure the reader can't miss the changes done by us. 172 * One reason that we cannot just immediately flip to readers_fast is
173 * that new readers might fail to see the results of this writer's
174 * critical section.
175 *
176 * Therefore we force it through the slow path which guarantees an
177 * acquire and thereby guarantees the critical section's consistency.
178 */
179 smp_store_release(&sem->readers_block, 0);
180
181 /*
182 * Release the write lock, this will allow readers back in the game.
183 */
184 up_write(&sem->rw_sem);
185
186 /*
187 * Once this completes (at least one RCU-sched grace period hence) the
188 * reader fast path will be available again. Safe to use outside the
189 * exclusive write lock because its counting.
156 */ 190 */
157 rcu_sync_exit(&brw->rss); 191 rcu_sync_exit(&sem->rss);
158} 192}
159EXPORT_SYMBOL_GPL(percpu_up_write); 193EXPORT_SYMBOL_GPL(percpu_up_write);
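The slow paths above pair with a reader fast path that lives in include/linux/percpu-rwsem.h and is not part of this hunk. As a hedged sketch only (names and exact shape inferred from the functions above, not quoted from the header), the inline side looks roughly like this:

/* Illustrative sketch; the real inlines are in percpu-rwsem.h. */
static inline void percpu_down_read_sketch(struct percpu_rw_semaphore *sem)
{
	might_sleep();

	preempt_disable();
	/* With preemption off, the matching decrement runs on this CPU. */
	__this_cpu_inc(*sem->read_count);
	if (unlikely(!rcu_sync_is_idle(&sem->rss)))
		__percpu_down_read(sem, false);		/* slow path above */
	preempt_enable();
}

static inline void percpu_up_read_sketch(struct percpu_rw_semaphore *sem)
{
	preempt_disable();
	if (likely(rcu_sync_is_idle(&sem->rss)))
		__this_cpu_dec(*sem->read_count);
	else
		__percpu_up_read(sem);			/* wakes a pending writer */
	preempt_enable();
}

Because percpu_down_write() calls rcu_sync_enter() before sampling per_cpu_sum(*sem->read_count), every reader that could still be on the plain per-CPU increment has either finished or been pushed onto the slow path by the time readers_active_check() runs.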
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 8a99abf58080..e3b5520005db 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -70,11 +70,14 @@ struct pv_node {
70static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock) 70static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock)
71{ 71{
72 struct __qspinlock *l = (void *)lock; 72 struct __qspinlock *l = (void *)lock;
73 int ret = !(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) &&
74 (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0);
75 73
76 qstat_inc(qstat_pv_lock_stealing, ret); 74 if (!(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) &&
77 return ret; 75 (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0)) {
76 qstat_inc(qstat_pv_lock_stealing, true);
77 return true;
78 }
79
80 return false;
78} 81}
79 82
80/* 83/*
@@ -257,7 +260,6 @@ static struct pv_node *pv_unhash(struct qspinlock *lock)
257static inline bool 260static inline bool
258pv_wait_early(struct pv_node *prev, int loop) 261pv_wait_early(struct pv_node *prev, int loop)
259{ 262{
260
261 if ((loop & PV_PREV_CHECK_MASK) != 0) 263 if ((loop & PV_PREV_CHECK_MASK) != 0)
262 return false; 264 return false;
263 265
@@ -286,12 +288,10 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
286{ 288{
287 struct pv_node *pn = (struct pv_node *)node; 289 struct pv_node *pn = (struct pv_node *)node;
288 struct pv_node *pp = (struct pv_node *)prev; 290 struct pv_node *pp = (struct pv_node *)prev;
289 int waitcnt = 0;
290 int loop; 291 int loop;
291 bool wait_early; 292 bool wait_early;
292 293
293 /* waitcnt processing will be compiled out if !QUEUED_LOCK_STAT */ 294 for (;;) {
294 for (;; waitcnt++) {
295 for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) { 295 for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) {
296 if (READ_ONCE(node->locked)) 296 if (READ_ONCE(node->locked))
297 return; 297 return;
@@ -315,7 +315,6 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
315 315
316 if (!READ_ONCE(node->locked)) { 316 if (!READ_ONCE(node->locked)) {
317 qstat_inc(qstat_pv_wait_node, true); 317 qstat_inc(qstat_pv_wait_node, true);
318 qstat_inc(qstat_pv_wait_again, waitcnt);
319 qstat_inc(qstat_pv_wait_early, wait_early); 318 qstat_inc(qstat_pv_wait_early, wait_early);
320 pv_wait(&pn->state, vcpu_halted); 319 pv_wait(&pn->state, vcpu_halted);
321 } 320 }
@@ -456,12 +455,9 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
456 pv_wait(&l->locked, _Q_SLOW_VAL); 455 pv_wait(&l->locked, _Q_SLOW_VAL);
457 456
458 /* 457 /*
459 * The unlocker should have freed the lock before kicking the 458 * Because of lock stealing, the queue head vCPU may not be
460 * CPU. So if the lock is still not free, it is a spurious 459 * able to acquire the lock before it has to wait again.
461 * wakeup or another vCPU has stolen the lock. The current
462 * vCPU should spin again.
463 */ 460 */
464 qstat_inc(qstat_pv_spurious_wakeup, READ_ONCE(l->locked));
465 } 461 }
466 462
467 /* 463 /*
@@ -544,7 +540,7 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
544 * unhash. Otherwise it would be possible to have multiple @lock 540 * unhash. Otherwise it would be possible to have multiple @lock
545 * entries, which would be BAD. 541 * entries, which would be BAD.
546 */ 542 */
547 locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0); 543 locked = cmpxchg_release(&l->locked, _Q_LOCKED_VAL, 0);
548 if (likely(locked == _Q_LOCKED_VAL)) 544 if (likely(locked == _Q_LOCKED_VAL))
549 return; 545 return;
550 546
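The restructured pv_queued_spin_steal_lock() above keeps the two-step steal attempt but now bumps qstat_pv_lock_stealing only when the cmpxchg actually succeeds. A minimal sketch of the underlying pattern (wrapper name illustrative, statistics omitted):

static inline bool try_steal_sketch(struct qspinlock *lock)
{
	struct __qspinlock *l = (void *)lock;

	/* Cheap read first: skip the atomic if the lock looks taken. */
	if (atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK)
		return false;

	return cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0;
}

The later hunk switches the unlock path to cmpxchg_release(), so clearing _Q_LOCKED_VAL still orders after the critical section without paying for a full barrier.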
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h
index b9d031516254..eb0a599fcf58 100644
--- a/kernel/locking/qspinlock_stat.h
+++ b/kernel/locking/qspinlock_stat.h
@@ -24,8 +24,8 @@
24 * pv_latency_wake - average latency (ns) from vCPU kick to wakeup 24 * pv_latency_wake - average latency (ns) from vCPU kick to wakeup
25 * pv_lock_slowpath - # of locking operations via the slowpath 25 * pv_lock_slowpath - # of locking operations via the slowpath
26 * pv_lock_stealing - # of lock stealing operations 26 * pv_lock_stealing - # of lock stealing operations
27 * pv_spurious_wakeup - # of spurious wakeups 27 * pv_spurious_wakeup - # of spurious wakeups in non-head vCPUs
28 * pv_wait_again - # of vCPU wait's that happened after a vCPU kick 28 * pv_wait_again - # of wait's after a queue head vCPU kick
29 * pv_wait_early - # of early vCPU wait's 29 * pv_wait_early - # of early vCPU wait's
30 * pv_wait_head - # of vCPU wait's at the queue head 30 * pv_wait_head - # of vCPU wait's at the queue head
31 * pv_wait_node - # of vCPU wait's at a non-head queue node 31 * pv_wait_node - # of vCPU wait's at a non-head queue node
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 447e08de1fab..2337b4bb2366 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -121,16 +121,19 @@ enum rwsem_wake_type {
121 * - woken process blocks are discarded from the list after having task zeroed 121 * - woken process blocks are discarded from the list after having task zeroed
122 * - writers are only marked woken if downgrading is false 122 * - writers are only marked woken if downgrading is false
123 */ 123 */
124static struct rw_semaphore * 124static void __rwsem_mark_wake(struct rw_semaphore *sem,
125__rwsem_mark_wake(struct rw_semaphore *sem, 125 enum rwsem_wake_type wake_type,
126 enum rwsem_wake_type wake_type, struct wake_q_head *wake_q) 126 struct wake_q_head *wake_q)
127{ 127{
128 struct rwsem_waiter *waiter; 128 struct rwsem_waiter *waiter, *tmp;
129 struct task_struct *tsk; 129 long oldcount, woken = 0, adjustment = 0;
130 struct list_head *next; 130
131 long oldcount, woken, loop, adjustment; 131 /*
132 * Take a peek at the queue head waiter such that we can determine
133 * the wakeup(s) to perform.
134 */
135 waiter = list_first_entry(&sem->wait_list, struct rwsem_waiter, list);
132 136
133 waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
134 if (waiter->type == RWSEM_WAITING_FOR_WRITE) { 137 if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
135 if (wake_type == RWSEM_WAKE_ANY) { 138 if (wake_type == RWSEM_WAKE_ANY) {
136 /* 139 /*
@@ -142,19 +145,19 @@ __rwsem_mark_wake(struct rw_semaphore *sem,
142 */ 145 */
143 wake_q_add(wake_q, waiter->task); 146 wake_q_add(wake_q, waiter->task);
144 } 147 }
145 goto out; 148
149 return;
146 } 150 }
147 151
148 /* Writers might steal the lock before we grant it to the next reader. 152 /*
153 * Writers might steal the lock before we grant it to the next reader.
149 * We prefer to do the first reader grant before counting readers 154 * We prefer to do the first reader grant before counting readers
150 * so we can bail out early if a writer stole the lock. 155 * so we can bail out early if a writer stole the lock.
151 */ 156 */
152 adjustment = 0;
153 if (wake_type != RWSEM_WAKE_READ_OWNED) { 157 if (wake_type != RWSEM_WAKE_READ_OWNED) {
154 adjustment = RWSEM_ACTIVE_READ_BIAS; 158 adjustment = RWSEM_ACTIVE_READ_BIAS;
155 try_reader_grant: 159 try_reader_grant:
156 oldcount = atomic_long_fetch_add(adjustment, &sem->count); 160 oldcount = atomic_long_fetch_add(adjustment, &sem->count);
157
158 if (unlikely(oldcount < RWSEM_WAITING_BIAS)) { 161 if (unlikely(oldcount < RWSEM_WAITING_BIAS)) {
159 /* 162 /*
160 * If the count is still less than RWSEM_WAITING_BIAS 163 * If the count is still less than RWSEM_WAITING_BIAS
@@ -164,7 +167,8 @@ __rwsem_mark_wake(struct rw_semaphore *sem,
164 */ 167 */
165 if (atomic_long_add_return(-adjustment, &sem->count) < 168 if (atomic_long_add_return(-adjustment, &sem->count) <
166 RWSEM_WAITING_BIAS) 169 RWSEM_WAITING_BIAS)
167 goto out; 170 return;
171
168 /* Last active locker left. Retry waking readers. */ 172 /* Last active locker left. Retry waking readers. */
169 goto try_reader_grant; 173 goto try_reader_grant;
170 } 174 }
@@ -176,38 +180,23 @@ __rwsem_mark_wake(struct rw_semaphore *sem,
176 rwsem_set_reader_owned(sem); 180 rwsem_set_reader_owned(sem);
177 } 181 }
178 182
179 /* Grant an infinite number of read locks to the readers at the front 183 /*
180 * of the queue. Note we increment the 'active part' of the count by 184 * Grant an infinite number of read locks to the readers at the front
181 * the number of readers before waking any processes up. 185 * of the queue. We know that woken will be at least 1 as we accounted
186 * for above. Note we increment the 'active part' of the count by the
187 * number of readers before waking any processes up.
182 */ 188 */
183 woken = 0; 189 list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) {
184 do { 190 struct task_struct *tsk;
185 woken++;
186 191
187 if (waiter->list.next == &sem->wait_list) 192 if (waiter->type == RWSEM_WAITING_FOR_WRITE)
188 break; 193 break;
189 194
190 waiter = list_entry(waiter->list.next, 195 woken++;
191 struct rwsem_waiter, list);
192
193 } while (waiter->type != RWSEM_WAITING_FOR_WRITE);
194
195 adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment;
196 if (waiter->type != RWSEM_WAITING_FOR_WRITE)
197 /* hit end of list above */
198 adjustment -= RWSEM_WAITING_BIAS;
199
200 if (adjustment)
201 atomic_long_add(adjustment, &sem->count);
202
203 next = sem->wait_list.next;
204 loop = woken;
205 do {
206 waiter = list_entry(next, struct rwsem_waiter, list);
207 next = waiter->list.next;
208 tsk = waiter->task; 196 tsk = waiter->task;
209 197
210 wake_q_add(wake_q, tsk); 198 wake_q_add(wake_q, tsk);
199 list_del(&waiter->list);
211 /* 200 /*
212 * Ensure that the last operation is setting the reader 201 * Ensure that the last operation is setting the reader
213 * waiter to nil such that rwsem_down_read_failed() cannot 202 * waiter to nil such that rwsem_down_read_failed() cannot
@@ -215,13 +204,16 @@ __rwsem_mark_wake(struct rw_semaphore *sem,
215 * to the task to wakeup. 204 * to the task to wakeup.
216 */ 205 */
217 smp_store_release(&waiter->task, NULL); 206 smp_store_release(&waiter->task, NULL);
218 } while (--loop); 207 }
219 208
220 sem->wait_list.next = next; 209 adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment;
221 next->prev = &sem->wait_list; 210 if (list_empty(&sem->wait_list)) {
211 /* hit end of list above */
212 adjustment -= RWSEM_WAITING_BIAS;
213 }
222 214
223 out: 215 if (adjustment)
224 return sem; 216 atomic_long_add(adjustment, &sem->count);
225} 217}
226 218
227/* 219/*
@@ -235,7 +227,6 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
235 struct task_struct *tsk = current; 227 struct task_struct *tsk = current;
236 WAKE_Q(wake_q); 228 WAKE_Q(wake_q);
237 229
238 /* set up my own style of waitqueue */
239 waiter.task = tsk; 230 waiter.task = tsk;
240 waiter.type = RWSEM_WAITING_FOR_READ; 231 waiter.type = RWSEM_WAITING_FOR_READ;
241 232
@@ -247,7 +238,8 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
247 /* we're now waiting on the lock, but no longer actively locking */ 238 /* we're now waiting on the lock, but no longer actively locking */
248 count = atomic_long_add_return(adjustment, &sem->count); 239 count = atomic_long_add_return(adjustment, &sem->count);
249 240
250 /* If there are no active locks, wake the front queued process(es). 241 /*
242 * If there are no active locks, wake the front queued process(es).
251 * 243 *
252 * If there are no writers and we are first in the queue, 244 * If there are no writers and we are first in the queue,
253 * wake our own waiter to join the existing active readers ! 245 * wake our own waiter to join the existing active readers !
@@ -255,7 +247,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
255 if (count == RWSEM_WAITING_BIAS || 247 if (count == RWSEM_WAITING_BIAS ||
256 (count > RWSEM_WAITING_BIAS && 248 (count > RWSEM_WAITING_BIAS &&
257 adjustment != -RWSEM_ACTIVE_READ_BIAS)) 249 adjustment != -RWSEM_ACTIVE_READ_BIAS))
258 sem = __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); 250 __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
259 251
260 raw_spin_unlock_irq(&sem->wait_lock); 252 raw_spin_unlock_irq(&sem->wait_lock);
261 wake_up_q(&wake_q); 253 wake_up_q(&wake_q);
@@ -505,7 +497,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)
505 if (count > RWSEM_WAITING_BIAS) { 497 if (count > RWSEM_WAITING_BIAS) {
506 WAKE_Q(wake_q); 498 WAKE_Q(wake_q);
507 499
508 sem = __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q); 500 __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q);
509 /* 501 /*
510 * The wakeup is normally called _after_ the wait_lock 502 * The wakeup is normally called _after_ the wait_lock
511 * is released, but given that we are proactively waking 503 * is released, but given that we are proactively waking
@@ -614,9 +606,8 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
614 raw_spin_lock_irqsave(&sem->wait_lock, flags); 606 raw_spin_lock_irqsave(&sem->wait_lock, flags);
615locked: 607locked:
616 608
617 /* do nothing if list empty */
618 if (!list_empty(&sem->wait_list)) 609 if (!list_empty(&sem->wait_list))
619 sem = __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); 610 __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
620 611
621 raw_spin_unlock_irqrestore(&sem->wait_lock, flags); 612 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
622 wake_up_q(&wake_q); 613 wake_up_q(&wake_q);
@@ -638,9 +629,8 @@ struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
638 629
639 raw_spin_lock_irqsave(&sem->wait_lock, flags); 630 raw_spin_lock_irqsave(&sem->wait_lock, flags);
640 631
641 /* do nothing if list empty */
642 if (!list_empty(&sem->wait_list)) 632 if (!list_empty(&sem->wait_list))
643 sem = __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q); 633 __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q);
644 634
645 raw_spin_unlock_irqrestore(&sem->wait_lock, flags); 635 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
646 wake_up_q(&wake_q); 636 wake_up_q(&wake_q);
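The rewritten __rwsem_mark_wake() no longer returns the semaphore; it walks the waiter list with list_for_each_entry_safe() and collects wakeups into an on-stack wake_q that the caller flushes after dropping wait_lock. A hedged sketch of that overall shape, with the caller's locking folded in and the count adjustments omitted (function name illustrative):

static void wake_front_readers_sketch(struct rw_semaphore *sem)
{
	struct rwsem_waiter *waiter, *tmp;
	unsigned long flags;
	WAKE_Q(wake_q);				/* on-stack wakeup queue */

	raw_spin_lock_irqsave(&sem->wait_lock, flags);
	list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) {
		if (waiter->type == RWSEM_WAITING_FOR_WRITE)
			break;			/* stop at the first writer */

		wake_q_add(&wake_q, waiter->task);
		list_del(&waiter->list);
		/* NULL last, so the waiter may release its stack frame. */
		smp_store_release(&waiter->task, NULL);
	}
	raw_spin_unlock_irqrestore(&sem->wait_lock, flags);

	wake_up_q(&wake_q);			/* actual wakeups, lock-free */
}

Queuing the tasks and waking them only after wait_lock is released keeps the lock hold time short, which is the point of the wake_q API.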
diff --git a/kernel/module.c b/kernel/module.c
index 529efae9f481..f57dd63186e6 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1149,6 +1149,8 @@ static size_t module_flags_taint(struct module *mod, char *buf)
1149 buf[l++] = 'C'; 1149 buf[l++] = 'C';
1150 if (mod->taints & (1 << TAINT_UNSIGNED_MODULE)) 1150 if (mod->taints & (1 << TAINT_UNSIGNED_MODULE))
1151 buf[l++] = 'E'; 1151 buf[l++] = 'E';
1152 if (mod->taints & (1 << TAINT_LIVEPATCH))
1153 buf[l++] = 'K';
1152 /* 1154 /*
1153 * TAINT_FORCED_RMMOD: could be added. 1155 * TAINT_FORCED_RMMOD: could be added.
1154 * TAINT_CPU_OUT_OF_SPEC, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't 1156 * TAINT_CPU_OUT_OF_SPEC, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
@@ -2792,14 +2794,17 @@ static int copy_chunked_from_user(void *dst, const void __user *usrc, unsigned l
2792} 2794}
2793 2795
2794#ifdef CONFIG_LIVEPATCH 2796#ifdef CONFIG_LIVEPATCH
2795static int find_livepatch_modinfo(struct module *mod, struct load_info *info) 2797static int check_modinfo_livepatch(struct module *mod, struct load_info *info)
2796{ 2798{
2797 mod->klp = get_modinfo(info, "livepatch") ? true : false; 2799 if (get_modinfo(info, "livepatch")) {
2800 mod->klp = true;
2801 add_taint_module(mod, TAINT_LIVEPATCH, LOCKDEP_STILL_OK);
2802 }
2798 2803
2799 return 0; 2804 return 0;
2800} 2805}
2801#else /* !CONFIG_LIVEPATCH */ 2806#else /* !CONFIG_LIVEPATCH */
2802static int find_livepatch_modinfo(struct module *mod, struct load_info *info) 2807static int check_modinfo_livepatch(struct module *mod, struct load_info *info)
2803{ 2808{
2804 if (get_modinfo(info, "livepatch")) { 2809 if (get_modinfo(info, "livepatch")) {
2805 pr_err("%s: module is marked as livepatch module, but livepatch support is disabled", 2810 pr_err("%s: module is marked as livepatch module, but livepatch support is disabled",
@@ -2969,7 +2974,7 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)
2969 "is unknown, you have been warned.\n", mod->name); 2974 "is unknown, you have been warned.\n", mod->name);
2970 } 2975 }
2971 2976
2972 err = find_livepatch_modinfo(mod, info); 2977 err = check_modinfo_livepatch(mod, info);
2973 if (err) 2978 if (err)
2974 return err; 2979 return err;
2975 2980
diff --git a/kernel/padata.c b/kernel/padata.c
index 993278895ccc..7848f0566403 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -30,6 +30,7 @@
30#include <linux/slab.h> 30#include <linux/slab.h>
31#include <linux/sysfs.h> 31#include <linux/sysfs.h>
32#include <linux/rcupdate.h> 32#include <linux/rcupdate.h>
33#include <linux/module.h>
33 34
34#define MAX_OBJ_NUM 1000 35#define MAX_OBJ_NUM 1000
35 36
@@ -769,52 +770,43 @@ static inline int pinst_has_cpu(struct padata_instance *pinst, int cpu)
769 cpumask_test_cpu(cpu, pinst->cpumask.cbcpu); 770 cpumask_test_cpu(cpu, pinst->cpumask.cbcpu);
770} 771}
771 772
772 773static int padata_cpu_online(unsigned int cpu, struct hlist_node *node)
773static int padata_cpu_callback(struct notifier_block *nfb,
774 unsigned long action, void *hcpu)
775{ 774{
776 int err;
777 struct padata_instance *pinst; 775 struct padata_instance *pinst;
778 int cpu = (unsigned long)hcpu; 776 int ret;
779 777
780 pinst = container_of(nfb, struct padata_instance, cpu_notifier); 778 pinst = hlist_entry_safe(node, struct padata_instance, node);
779 if (!pinst_has_cpu(pinst, cpu))
780 return 0;
781 781
782 switch (action) { 782 mutex_lock(&pinst->lock);
783 case CPU_ONLINE: 783 ret = __padata_add_cpu(pinst, cpu);
784 case CPU_ONLINE_FROZEN: 784 mutex_unlock(&pinst->lock);
785 case CPU_DOWN_FAILED: 785 return ret;
786 case CPU_DOWN_FAILED_FROZEN: 786}
787 if (!pinst_has_cpu(pinst, cpu))
788 break;
789 mutex_lock(&pinst->lock);
790 err = __padata_add_cpu(pinst, cpu);
791 mutex_unlock(&pinst->lock);
792 if (err)
793 return notifier_from_errno(err);
794 break;
795 787
796 case CPU_DOWN_PREPARE: 788static int padata_cpu_prep_down(unsigned int cpu, struct hlist_node *node)
797 case CPU_DOWN_PREPARE_FROZEN: 789{
798 case CPU_UP_CANCELED: 790 struct padata_instance *pinst;
799 case CPU_UP_CANCELED_FROZEN: 791 int ret;
800 if (!pinst_has_cpu(pinst, cpu)) 792
801 break; 793 pinst = hlist_entry_safe(node, struct padata_instance, node);
802 mutex_lock(&pinst->lock); 794 if (!pinst_has_cpu(pinst, cpu))
803 err = __padata_remove_cpu(pinst, cpu); 795 return 0;
804 mutex_unlock(&pinst->lock);
805 if (err)
806 return notifier_from_errno(err);
807 break;
808 }
809 796
810 return NOTIFY_OK; 797 mutex_lock(&pinst->lock);
798 ret = __padata_remove_cpu(pinst, cpu);
799 mutex_unlock(&pinst->lock);
800 return ret;
811} 801}
802
803static enum cpuhp_state hp_online;
812#endif 804#endif
813 805
814static void __padata_free(struct padata_instance *pinst) 806static void __padata_free(struct padata_instance *pinst)
815{ 807{
816#ifdef CONFIG_HOTPLUG_CPU 808#ifdef CONFIG_HOTPLUG_CPU
817 unregister_hotcpu_notifier(&pinst->cpu_notifier); 809 cpuhp_state_remove_instance_nocalls(hp_online, &pinst->node);
818#endif 810#endif
819 811
820 padata_stop(pinst); 812 padata_stop(pinst);
@@ -1012,11 +1004,8 @@ struct padata_instance *padata_alloc(struct workqueue_struct *wq,
1012 mutex_init(&pinst->lock); 1004 mutex_init(&pinst->lock);
1013 1005
1014#ifdef CONFIG_HOTPLUG_CPU 1006#ifdef CONFIG_HOTPLUG_CPU
1015 pinst->cpu_notifier.notifier_call = padata_cpu_callback; 1007 cpuhp_state_add_instance_nocalls(hp_online, &pinst->node);
1016 pinst->cpu_notifier.priority = 0;
1017 register_hotcpu_notifier(&pinst->cpu_notifier);
1018#endif 1008#endif
1019
1020 return pinst; 1009 return pinst;
1021 1010
1022err_free_masks: 1011err_free_masks:
@@ -1039,3 +1028,26 @@ void padata_free(struct padata_instance *pinst)
1039 kobject_put(&pinst->kobj); 1028 kobject_put(&pinst->kobj);
1040} 1029}
1041EXPORT_SYMBOL(padata_free); 1030EXPORT_SYMBOL(padata_free);
1031
1032#ifdef CONFIG_HOTPLUG_CPU
1033
1034static __init int padata_driver_init(void)
1035{
1036 int ret;
1037
1038 ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "padata:online",
1039 padata_cpu_online,
1040 padata_cpu_prep_down);
1041 if (ret < 0)
1042 return ret;
1043 hp_online = ret;
1044 return 0;
1045}
1046module_init(padata_driver_init);
1047
1048static __exit void padata_driver_exit(void)
1049{
1050 cpuhp_remove_multi_state(hp_online);
1051}
1052module_exit(padata_driver_exit);
1053#endif
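padata is converted from the old CPU notifier chain to a cpuhp "multi instance" state: one dynamic state is registered once, and each padata_instance attaches itself through the hlist_node embedded in it. A hedged, generic sketch of the same pattern (my_inst and the callback bodies are illustrative):

#include <linux/cpu.h>
#include <linux/cpuhotplug.h>

struct my_inst {
	struct hlist_node node;		/* links this instance to the state */
	/* ... per-instance data ... */
};

static enum cpuhp_state my_online_state;

static int my_cpu_online(unsigned int cpu, struct hlist_node *node)
{
	struct my_inst *inst = hlist_entry_safe(node, struct my_inst, node);

	/* rebuild @inst's per-CPU resources to cover @cpu */
	return inst ? 0 : -EINVAL;
}

static int my_cpu_prep_down(unsigned int cpu, struct hlist_node *node)
{
	struct my_inst *inst = hlist_entry_safe(node, struct my_inst, node);

	/* migrate @inst's work away from @cpu before it goes offline */
	return inst ? 0 : -EINVAL;
}

static int __init my_driver_init(void)
{
	int ret;

	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "mydrv:online",
				      my_cpu_online, my_cpu_prep_down);
	if (ret < 0)
		return ret;
	my_online_state = ret;
	return 0;
}

Each object then calls cpuhp_state_add_instance_nocalls(my_online_state, &inst->node) when it is created and cpuhp_state_remove_instance_nocalls() when it is torn down, exactly as padata_alloc() and __padata_free() now do.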
diff --git a/kernel/panic.c b/kernel/panic.c
index ca8cea1ef673..e6480e20379e 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -71,6 +71,32 @@ void __weak nmi_panic_self_stop(struct pt_regs *regs)
71 panic_smp_self_stop(); 71 panic_smp_self_stop();
72} 72}
73 73
74/*
75 * Stop other CPUs in panic. Architecture dependent code may override this
76 * with more suitable version. For example, if the architecture supports
77 * crash dump, it should save registers of each stopped CPU and disable
78 * per-CPU features such as virtualization extensions.
79 */
80void __weak crash_smp_send_stop(void)
81{
82 static int cpus_stopped;
83
84 /*
85 * This function can be called twice in panic path, but obviously
86 * we execute this only once.
87 */
88 if (cpus_stopped)
89 return;
90
91 /*
92 * Note smp_send_stop is the usual smp shutdown function, which
93 * unfortunately means it may not be hardened to work in a panic
94 * situation.
95 */
96 smp_send_stop();
97 cpus_stopped = 1;
98}
99
74atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID); 100atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID);
75 101
76/* 102/*
@@ -164,14 +190,21 @@ void panic(const char *fmt, ...)
164 if (!_crash_kexec_post_notifiers) { 190 if (!_crash_kexec_post_notifiers) {
165 printk_nmi_flush_on_panic(); 191 printk_nmi_flush_on_panic();
166 __crash_kexec(NULL); 192 __crash_kexec(NULL);
167 }
168 193
169 /* 194 /*
170 * Note smp_send_stop is the usual smp shutdown function, which 195 * Note smp_send_stop is the usual smp shutdown function, which
171 * unfortunately means it may not be hardened to work in a panic 196 * unfortunately means it may not be hardened to work in a
172 * situation. 197 * panic situation.
173 */ 198 */
174 smp_send_stop(); 199 smp_send_stop();
200 } else {
201 /*
202 * If we want to do crash dump after notifier calls and
203 * kmsg_dump, we will need architecture dependent extra
204 * works in addition to stopping other CPUs.
205 */
206 crash_smp_send_stop();
207 }
175 208
176 /* 209 /*
177 * Run any panic handlers, including those that might need to 210 * Run any panic handlers, including those that might need to
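crash_smp_send_stop() is added as a __weak default so architectures with crash-dump support can override it. A hedged sketch of what such an override might look like; arch_send_crash_stop_ipi() is hypothetical and only stands in for the arch-specific register-saving IPI:

/* Hypothetical arch override; arch_send_crash_stop_ipi() does not exist
 * as such and marks where the real arch-specific work would go. */
void crash_smp_send_stop(void)
{
	static int cpus_stopped;

	if (cpus_stopped)
		return;

	/*
	 * Unlike plain smp_send_stop(), a crash-aware implementation would
	 * save each stopped CPU's registers and disable features such as
	 * virtualization extensions before parking the CPU.
	 */
	arch_send_crash_stop_ipi();
	cpus_stopped = 1;
}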
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index a65ba137fd15..df9e8e9e0be7 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -79,23 +79,36 @@ static void proc_cleanup_work(struct work_struct *work)
79/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ 79/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */
80#define MAX_PID_NS_LEVEL 32 80#define MAX_PID_NS_LEVEL 32
81 81
82static struct ucounts *inc_pid_namespaces(struct user_namespace *ns)
83{
84 return inc_ucount(ns, current_euid(), UCOUNT_PID_NAMESPACES);
85}
86
87static void dec_pid_namespaces(struct ucounts *ucounts)
88{
89 dec_ucount(ucounts, UCOUNT_PID_NAMESPACES);
90}
91
82static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns, 92static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns,
83 struct pid_namespace *parent_pid_ns) 93 struct pid_namespace *parent_pid_ns)
84{ 94{
85 struct pid_namespace *ns; 95 struct pid_namespace *ns;
86 unsigned int level = parent_pid_ns->level + 1; 96 unsigned int level = parent_pid_ns->level + 1;
97 struct ucounts *ucounts;
87 int i; 98 int i;
88 int err; 99 int err;
89 100
90 if (level > MAX_PID_NS_LEVEL) { 101 err = -ENOSPC;
91 err = -EINVAL; 102 if (level > MAX_PID_NS_LEVEL)
103 goto out;
104 ucounts = inc_pid_namespaces(user_ns);
105 if (!ucounts)
92 goto out; 106 goto out;
93 }
94 107
95 err = -ENOMEM; 108 err = -ENOMEM;
96 ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL); 109 ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
97 if (ns == NULL) 110 if (ns == NULL)
98 goto out; 111 goto out_dec;
99 112
100 ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); 113 ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
101 if (!ns->pidmap[0].page) 114 if (!ns->pidmap[0].page)
@@ -114,6 +127,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
114 ns->level = level; 127 ns->level = level;
115 ns->parent = get_pid_ns(parent_pid_ns); 128 ns->parent = get_pid_ns(parent_pid_ns);
116 ns->user_ns = get_user_ns(user_ns); 129 ns->user_ns = get_user_ns(user_ns);
130 ns->ucounts = ucounts;
117 ns->nr_hashed = PIDNS_HASH_ADDING; 131 ns->nr_hashed = PIDNS_HASH_ADDING;
118 INIT_WORK(&ns->proc_work, proc_cleanup_work); 132 INIT_WORK(&ns->proc_work, proc_cleanup_work);
119 133
@@ -129,6 +143,8 @@ out_free_map:
129 kfree(ns->pidmap[0].page); 143 kfree(ns->pidmap[0].page);
130out_free: 144out_free:
131 kmem_cache_free(pid_ns_cachep, ns); 145 kmem_cache_free(pid_ns_cachep, ns);
146out_dec:
147 dec_pid_namespaces(ucounts);
132out: 148out:
133 return ERR_PTR(err); 149 return ERR_PTR(err);
134} 150}
@@ -146,6 +162,7 @@ static void destroy_pid_namespace(struct pid_namespace *ns)
146 ns_free_inum(&ns->ns); 162 ns_free_inum(&ns->ns);
147 for (i = 0; i < PIDMAP_ENTRIES; i++) 163 for (i = 0; i < PIDMAP_ENTRIES; i++)
148 kfree(ns->pidmap[i].page); 164 kfree(ns->pidmap[i].page);
165 dec_pid_namespaces(ns->ucounts);
149 put_user_ns(ns->user_ns); 166 put_user_ns(ns->user_ns);
150 call_rcu(&ns->rcu, delayed_free_pidns); 167 call_rcu(&ns->rcu, delayed_free_pidns);
151} 168}
@@ -388,12 +405,37 @@ static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns)
388 return 0; 405 return 0;
389} 406}
390 407
408static struct ns_common *pidns_get_parent(struct ns_common *ns)
409{
410 struct pid_namespace *active = task_active_pid_ns(current);
411 struct pid_namespace *pid_ns, *p;
412
413 /* See if the parent is in the current namespace */
414 pid_ns = p = to_pid_ns(ns)->parent;
415 for (;;) {
416 if (!p)
417 return ERR_PTR(-EPERM);
418 if (p == active)
419 break;
420 p = p->parent;
421 }
422
423 return &get_pid_ns(pid_ns)->ns;
424}
425
426static struct user_namespace *pidns_owner(struct ns_common *ns)
427{
428 return to_pid_ns(ns)->user_ns;
429}
430
391const struct proc_ns_operations pidns_operations = { 431const struct proc_ns_operations pidns_operations = {
392 .name = "pid", 432 .name = "pid",
393 .type = CLONE_NEWPID, 433 .type = CLONE_NEWPID,
394 .get = pidns_get, 434 .get = pidns_get,
395 .put = pidns_put, 435 .put = pidns_put,
396 .install = pidns_install, 436 .install = pidns_install,
437 .owner = pidns_owner,
438 .get_parent = pidns_get_parent,
397}; 439};
398 440
399static __init int pid_namespaces_init(void) 441static __init int pid_namespaces_init(void)
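The new .get_parent hook is what backs the NS_GET_PARENT ioctl on /proc/<pid>/ns/pid files (handled in fs/nsfs.c, outside this diff). An assumed userspace usage example, not part of the patch itself:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/nsfs.h>

int main(void)
{
	int fd = open("/proc/self/ns/pid", O_RDONLY);
	int parent;

	if (fd < 0)
		return 1;

	/* Returns a new fd, or -1/EPERM when the parent is out of reach
	 * (as it is for our own active namespace). */
	parent = ioctl(fd, NS_GET_PARENT);
	if (parent < 0)
		perror("NS_GET_PARENT");
	else
		printf("parent pid namespace fd: %d\n", parent);

	close(fd);
	return 0;
}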
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 68d3ebc12601..e8517b63eb37 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -186,7 +186,7 @@ config PM_SLEEP_DEBUG
186 186
187config DPM_WATCHDOG 187config DPM_WATCHDOG
188 bool "Device suspend/resume watchdog" 188 bool "Device suspend/resume watchdog"
189 depends on PM_DEBUG && PSTORE 189 depends on PM_DEBUG && PSTORE && EXPERT
190 ---help--- 190 ---help---
191 Sets up a watchdog timer to capture drivers that are 191 Sets up a watchdog timer to capture drivers that are
192 locked up attempting to suspend/resume a device. 192 locked up attempting to suspend/resume a device.
@@ -197,7 +197,7 @@ config DPM_WATCHDOG
197config DPM_WATCHDOG_TIMEOUT 197config DPM_WATCHDOG_TIMEOUT
198 int "Watchdog timeout in seconds" 198 int "Watchdog timeout in seconds"
199 range 1 120 199 range 1 120
200 default 60 200 default 120
201 depends on DPM_WATCHDOG 201 depends on DPM_WATCHDOG
202 202
203config PM_TRACE 203config PM_TRACE
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 33c79b6105c5..b26dbc48c75b 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -306,8 +306,10 @@ static int create_image(int platform_mode)
306 if (error) 306 if (error)
307 printk(KERN_ERR "PM: Error %d creating hibernation image\n", 307 printk(KERN_ERR "PM: Error %d creating hibernation image\n",
308 error); 308 error);
309 if (!in_suspend) 309 if (!in_suspend) {
310 events_check_enabled = false; 310 events_check_enabled = false;
311 clear_free_pages();
312 }
311 313
312 platform_leave(platform_mode); 314 platform_leave(platform_mode);
313 315
@@ -1189,22 +1191,6 @@ static int __init nohibernate_setup(char *str)
1189 return 1; 1191 return 1;
1190} 1192}
1191 1193
1192static int __init page_poison_nohibernate_setup(char *str)
1193{
1194#ifdef CONFIG_PAGE_POISONING_ZERO
1195 /*
1196 * The zeroing option for page poison skips the checks on alloc.
1197 * since hibernation doesn't save free pages there's no way to
1198 * guarantee the pages will still be zeroed.
1199 */
1200 if (!strcmp(str, "on")) {
1201 pr_info("Disabling hibernation due to page poisoning\n");
1202 return nohibernate_setup(str);
1203 }
1204#endif
1205 return 1;
1206}
1207
1208__setup("noresume", noresume_setup); 1194__setup("noresume", noresume_setup);
1209__setup("resume_offset=", resume_offset_setup); 1195__setup("resume_offset=", resume_offset_setup);
1210__setup("resume=", resume_setup); 1196__setup("resume=", resume_setup);
@@ -1212,4 +1198,3 @@ __setup("hibernate=", hibernate_setup);
1212__setup("resumewait", resumewait_setup); 1198__setup("resumewait", resumewait_setup);
1213__setup("resumedelay=", resumedelay_setup); 1199__setup("resumedelay=", resumedelay_setup);
1214__setup("nohibernate", nohibernate_setup); 1200__setup("nohibernate", nohibernate_setup);
1215__setup("page_poison=", page_poison_nohibernate_setup);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 5ea50b1b7595..281a697fd458 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -644,6 +644,7 @@ static int __init pm_init(void)
644 return error; 644 return error;
645 hibernate_image_size_init(); 645 hibernate_image_size_init();
646 hibernate_reserved_size_init(); 646 hibernate_reserved_size_init();
647 pm_states_init();
647 power_kobj = kobject_create_and_add("power", NULL); 648 power_kobj = kobject_create_and_add("power", NULL);
648 if (!power_kobj) 649 if (!power_kobj)
649 return -ENOMEM; 650 return -ENOMEM;
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 242d8b827dd5..56d1d0dedf76 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -110,6 +110,8 @@ extern int create_basic_memory_bitmaps(void);
110extern void free_basic_memory_bitmaps(void); 110extern void free_basic_memory_bitmaps(void);
111extern int hibernate_preallocate_memory(void); 111extern int hibernate_preallocate_memory(void);
112 112
113extern void clear_free_pages(void);
114
113/** 115/**
114 * Auxiliary structure used for reading the snapshot image data and 116 * Auxiliary structure used for reading the snapshot image data and
115 * metadata from and writing them to the list of page backup entries 117 * metadata from and writing them to the list of page backup entries
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 8f27d5a8adf6..2fba066e125f 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -144,23 +144,12 @@ int freeze_processes(void)
144 /* 144 /*
145 * Now that the whole userspace is frozen we need to disbale 145 * Now that the whole userspace is frozen we need to disbale
146 * the OOM killer to disallow any further interference with 146 * the OOM killer to disallow any further interference with
147 * killable tasks. 147 * killable tasks. There is no guarantee oom victims will
148 * ever reach a point they go away we have to wait with a timeout.
148 */ 149 */
149 if (!error && !oom_killer_disable()) 150 if (!error && !oom_killer_disable(msecs_to_jiffies(freeze_timeout_msecs)))
150 error = -EBUSY; 151 error = -EBUSY;
151 152
152 /*
153 * There is a hard to fix race between oom_reaper kernel thread
154 * and oom_killer_disable. oom_reaper calls exit_oom_victim
155 * before the victim reaches exit_mm so try to freeze all the tasks
156 * again and catch such a left over task.
157 */
158 if (!error) {
159 pr_info("Double checking all user space processes after OOM killer disable... ");
160 error = try_to_freeze_tasks(true);
161 pr_cont("\n");
162 }
163
164 if (error) 153 if (error)
165 thaw_processes(); 154 thaw_processes();
166 return error; 155 return error;
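freeze_processes() now passes a timeout to oom_killer_disable() instead of re-freezing to catch late OOM victims. The mm side is not part of this diff; as a rough, assumed approximation of the semantics relied on here (the real code is in mm/oom_kill.c):

/* Assumed sketch of the bounded wait, not the actual implementation. */
bool oom_killer_disable_sketch(signed long timeout)
{
	oom_killer_disabled = true;

	/* Give up if existing OOM victims have not gone away in time. */
	if (!wait_event_timeout(oom_victims_wait,
				!atomic_read(&oom_victims), timeout)) {
		oom_killer_enable();
		return false;
	}
	return true;
}

On failure freeze_processes() returns -EBUSY and thaws everything, so a stuck OOM victim now fails the freeze after freeze_timeout_msecs instead of blocking it indefinitely.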
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index b02228411d57..4f0f0604f1c4 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1132,6 +1132,28 @@ void free_basic_memory_bitmaps(void)
1132 pr_debug("PM: Basic memory bitmaps freed\n"); 1132 pr_debug("PM: Basic memory bitmaps freed\n");
1133} 1133}
1134 1134
1135void clear_free_pages(void)
1136{
1137#ifdef CONFIG_PAGE_POISONING_ZERO
1138 struct memory_bitmap *bm = free_pages_map;
1139 unsigned long pfn;
1140
1141 if (WARN_ON(!(free_pages_map)))
1142 return;
1143
1144 memory_bm_position_reset(bm);
1145 pfn = memory_bm_next_pfn(bm);
1146 while (pfn != BM_END_OF_MAP) {
1147 if (pfn_valid(pfn))
1148 clear_highpage(pfn_to_page(pfn));
1149
1150 pfn = memory_bm_next_pfn(bm);
1151 }
1152 memory_bm_position_reset(bm);
1153 pr_info("PM: free pages cleared after restore\n");
1154#endif /* PAGE_POISONING_ZERO */
1155}
1156
1135/** 1157/**
1136 * snapshot_additional_pages - Estimate the number of extra pages needed. 1158 * snapshot_additional_pages - Estimate the number of extra pages needed.
1137 * @zone: Memory zone to carry out the computation for. 1159 * @zone: Memory zone to carry out the computation for.
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 0acab9d7f96f..6ccb08f57fcb 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -118,10 +118,18 @@ static bool valid_state(suspend_state_t state)
118 */ 118 */
119static bool relative_states; 119static bool relative_states;
120 120
121void __init pm_states_init(void)
122{
123 /*
124 * freeze state should be supported even without any suspend_ops,
125 * initialize pm_states accordingly here
126 */
127 pm_states[PM_SUSPEND_FREEZE] = pm_labels[relative_states ? 0 : 2];
128}
129
121static int __init sleep_states_setup(char *str) 130static int __init sleep_states_setup(char *str)
122{ 131{
123 relative_states = !strncmp(str, "1", 1); 132 relative_states = !strncmp(str, "1", 1);
124 pm_states[PM_SUSPEND_FREEZE] = pm_labels[relative_states ? 0 : 2];
125 return 1; 133 return 1;
126} 134}
127 135
@@ -211,7 +219,7 @@ static int platform_suspend_begin(suspend_state_t state)
211{ 219{
212 if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin) 220 if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin)
213 return freeze_ops->begin(); 221 return freeze_ops->begin();
214 else if (suspend_ops->begin) 222 else if (suspend_ops && suspend_ops->begin)
215 return suspend_ops->begin(state); 223 return suspend_ops->begin(state);
216 else 224 else
217 return 0; 225 return 0;
@@ -221,7 +229,7 @@ static void platform_resume_end(suspend_state_t state)
221{ 229{
222 if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end) 230 if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end)
223 freeze_ops->end(); 231 freeze_ops->end();
224 else if (suspend_ops->end) 232 else if (suspend_ops && suspend_ops->end)
225 suspend_ops->end(); 233 suspend_ops->end();
226} 234}
227 235
@@ -490,9 +498,9 @@ static int enter_state(suspend_state_t state)
490 498
491#ifndef CONFIG_SUSPEND_SKIP_SYNC 499#ifndef CONFIG_SUSPEND_SKIP_SYNC
492 trace_suspend_resume(TPS("sync_filesystems"), 0, true); 500 trace_suspend_resume(TPS("sync_filesystems"), 0, true);
493 printk(KERN_INFO "PM: Syncing filesystems ... "); 501 pr_info("PM: Syncing filesystems ... ");
494 sys_sync(); 502 sys_sync();
495 printk("done.\n"); 503 pr_cont("done.\n");
496 trace_suspend_resume(TPS("sync_filesystems"), 0, false); 504 trace_suspend_resume(TPS("sync_filesystems"), 0, false);
497#endif 505#endif
498 506
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index 084452e34a12..bdff5ed57f10 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -203,8 +203,10 @@ static int __init test_suspend(void)
203 203
204 /* RTCs have initialized by now too ... can we use one? */ 204 /* RTCs have initialized by now too ... can we use one? */
205 dev = class_find_device(rtc_class, NULL, NULL, has_wakealarm); 205 dev = class_find_device(rtc_class, NULL, NULL, has_wakealarm);
206 if (dev) 206 if (dev) {
207 rtc = rtc_class_open(dev_name(dev)); 207 rtc = rtc_class_open(dev_name(dev));
208 put_device(dev);
209 }
208 if (!rtc) { 210 if (!rtc) {
209 printk(warn_no_rtc); 211 printk(warn_no_rtc);
210 return 0; 212 return 0;
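The suspend_test fix adds the put_device() that class_find_device() requires: the lookup returns its result with an elevated reference count. An illustrative helper (not from the patch) showing the general pairing:

static bool device_exists(struct class *cls, const void *data,
			  int (*match)(struct device *dev, const void *data))
{
	struct device *dev = class_find_device(cls, NULL, data, match);

	if (!dev)
		return false;

	put_device(dev);	/* balance the reference the lookup took */
	return true;
}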
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index eea6dbc2d8cf..f7a55e9ff2f7 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -655,11 +655,8 @@ static ssize_t msg_print_ext_header(char *buf, size_t size,
655 * better readable output. 'c' in the record flags mark the first 655 * better readable output. 'c' in the record flags mark the first
656 * fragment of a line, '+' the following. 656 * fragment of a line, '+' the following.
657 */ 657 */
658 if (msg->flags & LOG_CONT && !(prev_flags & LOG_CONT)) 658 if (msg->flags & LOG_CONT)
659 cont = 'c'; 659 cont = (prev_flags & LOG_CONT) ? '+' : 'c';
660 else if ((msg->flags & LOG_CONT) ||
661 ((prev_flags & LOG_CONT) && !(msg->flags & LOG_PREFIX)))
662 cont = '+';
663 660
664 return scnprintf(buf, size, "%u,%llu,%llu,%c;", 661 return scnprintf(buf, size, "%u,%llu,%llu,%c;",
665 (msg->facility << 3) | msg->level, seq, ts_usec, cont); 662 (msg->facility << 3) | msg->level, seq, ts_usec, cont);
@@ -1643,35 +1640,33 @@ static struct cont {
1643 bool flushed:1; /* buffer sealed and committed */ 1640 bool flushed:1; /* buffer sealed and committed */
1644} cont; 1641} cont;
1645 1642
1646static void cont_flush(enum log_flags flags) 1643static void cont_flush(void)
1647{ 1644{
1648 if (cont.flushed) 1645 if (cont.flushed)
1649 return; 1646 return;
1650 if (cont.len == 0) 1647 if (cont.len == 0)
1651 return; 1648 return;
1652
1653 if (cont.cons) { 1649 if (cont.cons) {
1654 /* 1650 /*
1655 * If a fragment of this line was directly flushed to the 1651 * If a fragment of this line was directly flushed to the
1656 * console; wait for the console to pick up the rest of the 1652 * console; wait for the console to pick up the rest of the
1657 * line. LOG_NOCONS suppresses a duplicated output. 1653 * line. LOG_NOCONS suppresses a duplicated output.
1658 */ 1654 */
1659 log_store(cont.facility, cont.level, flags | LOG_NOCONS, 1655 log_store(cont.facility, cont.level, cont.flags | LOG_NOCONS,
1660 cont.ts_nsec, NULL, 0, cont.buf, cont.len); 1656 cont.ts_nsec, NULL, 0, cont.buf, cont.len);
1661 cont.flags = flags;
1662 cont.flushed = true; 1657 cont.flushed = true;
1663 } else { 1658 } else {
1664 /* 1659 /*
1665 * If no fragment of this line ever reached the console, 1660 * If no fragment of this line ever reached the console,
1666 * just submit it to the store and free the buffer. 1661 * just submit it to the store and free the buffer.
1667 */ 1662 */
1668 log_store(cont.facility, cont.level, flags, 0, 1663 log_store(cont.facility, cont.level, cont.flags, 0,
1669 NULL, 0, cont.buf, cont.len); 1664 NULL, 0, cont.buf, cont.len);
1670 cont.len = 0; 1665 cont.len = 0;
1671 } 1666 }
1672} 1667}
1673 1668
1674static bool cont_add(int facility, int level, const char *text, size_t len) 1669static bool cont_add(int facility, int level, enum log_flags flags, const char *text, size_t len)
1675{ 1670{
1676 if (cont.len && cont.flushed) 1671 if (cont.len && cont.flushed)
1677 return false; 1672 return false;
@@ -1682,7 +1677,7 @@ static bool cont_add(int facility, int level, const char *text, size_t len)
1682 * the line gets too long, split it up in separate records. 1677 * the line gets too long, split it up in separate records.
1683 */ 1678 */
1684 if (nr_ext_console_drivers || cont.len + len > sizeof(cont.buf)) { 1679 if (nr_ext_console_drivers || cont.len + len > sizeof(cont.buf)) {
1685 cont_flush(LOG_CONT); 1680 cont_flush();
1686 return false; 1681 return false;
1687 } 1682 }
1688 1683
@@ -1691,7 +1686,7 @@ static bool cont_add(int facility, int level, const char *text, size_t len)
1691 cont.level = level; 1686 cont.level = level;
1692 cont.owner = current; 1687 cont.owner = current;
1693 cont.ts_nsec = local_clock(); 1688 cont.ts_nsec = local_clock();
1694 cont.flags = 0; 1689 cont.flags = flags;
1695 cont.cons = 0; 1690 cont.cons = 0;
1696 cont.flushed = false; 1691 cont.flushed = false;
1697 } 1692 }
@@ -1699,8 +1694,15 @@ static bool cont_add(int facility, int level, const char *text, size_t len)
1699 memcpy(cont.buf + cont.len, text, len); 1694 memcpy(cont.buf + cont.len, text, len);
1700 cont.len += len; 1695 cont.len += len;
1701 1696
1697 // The original flags come from the first line,
1698 // but later continuations can add a newline.
1699 if (flags & LOG_NEWLINE) {
1700 cont.flags |= LOG_NEWLINE;
1701 cont_flush();
1702 }
1703
1702 if (cont.len > (sizeof(cont.buf) * 80) / 100) 1704 if (cont.len > (sizeof(cont.buf) * 80) / 100)
1703 cont_flush(LOG_CONT); 1705 cont_flush();
1704 1706
1705 return true; 1707 return true;
1706} 1708}
@@ -1733,6 +1735,35 @@ static size_t cont_print_text(char *text, size_t size)
1733 return textlen; 1735 return textlen;
1734} 1736}
1735 1737
1738static size_t log_output(int facility, int level, enum log_flags lflags, const char *dict, size_t dictlen, char *text, size_t text_len)
1739{
1740 /*
1741 * If an earlier line was buffered, and we're a continuation
1742 * write from the same process, try to add it to the buffer.
1743 */
1744 if (cont.len) {
1745 if (cont.owner == current && (lflags & LOG_CONT)) {
1746 if (cont_add(facility, level, lflags, text, text_len))
1747 return text_len;
1748 }
1749 /* Otherwise, make sure it's flushed */
1750 cont_flush();
1751 }
1752
1753 /* Skip empty continuation lines that couldn't be added - they just flush */
1754 if (!text_len && (lflags & LOG_CONT))
1755 return 0;
1756
1757 /* If it doesn't end in a newline, try to buffer the current line */
1758 if (!(lflags & LOG_NEWLINE)) {
1759 if (cont_add(facility, level, lflags, text, text_len))
1760 return text_len;
1761 }
1762
1763 /* Store it in the record log */
1764 return log_store(facility, level, lflags, 0, dict, dictlen, text, text_len);
1765}
1766
1736asmlinkage int vprintk_emit(int facility, int level, 1767asmlinkage int vprintk_emit(int facility, int level,
1737 const char *dict, size_t dictlen, 1768 const char *dict, size_t dictlen,
1738 const char *fmt, va_list args) 1769 const char *fmt, va_list args)
@@ -1819,10 +1850,9 @@ asmlinkage int vprintk_emit(int facility, int level,
1819 1850
1820 /* strip kernel syslog prefix and extract log level or control flags */ 1851 /* strip kernel syslog prefix and extract log level or control flags */
1821 if (facility == 0) { 1852 if (facility == 0) {
1822 int kern_level = printk_get_level(text); 1853 int kern_level;
1823 1854
1824 if (kern_level) { 1855 while ((kern_level = printk_get_level(text)) != 0) {
1825 const char *end_of_header = printk_skip_level(text);
1826 switch (kern_level) { 1856 switch (kern_level) {
1827 case '0' ... '7': 1857 case '0' ... '7':
1828 if (level == LOGLEVEL_DEFAULT) 1858 if (level == LOGLEVEL_DEFAULT)
@@ -1830,14 +1860,13 @@ asmlinkage int vprintk_emit(int facility, int level,
1830 /* fallthrough */ 1860 /* fallthrough */
1831 case 'd': /* KERN_DEFAULT */ 1861 case 'd': /* KERN_DEFAULT */
1832 lflags |= LOG_PREFIX; 1862 lflags |= LOG_PREFIX;
1863 break;
1864 case 'c': /* KERN_CONT */
1865 lflags |= LOG_CONT;
1833 } 1866 }
1834 /* 1867
1835 * No need to check length here because vscnprintf 1868 text_len -= 2;
1836 * put '\0' at the end of the string. Only valid and 1869 text += 2;
1837 * newly printed level is detected.
1838 */
1839 text_len -= end_of_header - text;
1840 text = (char *)end_of_header;
1841 } 1870 }
1842 } 1871 }
1843 1872
@@ -1847,45 +1876,7 @@ asmlinkage int vprintk_emit(int facility, int level,
1847 if (dict) 1876 if (dict)
1848 lflags |= LOG_PREFIX|LOG_NEWLINE; 1877 lflags |= LOG_PREFIX|LOG_NEWLINE;
1849 1878
1850 if (!(lflags & LOG_NEWLINE)) { 1879 printed_len += log_output(facility, level, lflags, dict, dictlen, text, text_len);
1851 /*
1852 * Flush the conflicting buffer. An earlier newline was missing,
1853 * or another task also prints continuation lines.
1854 */
1855 if (cont.len && (lflags & LOG_PREFIX || cont.owner != current))
1856 cont_flush(LOG_NEWLINE);
1857
1858 /* buffer line if possible, otherwise store it right away */
1859 if (cont_add(facility, level, text, text_len))
1860 printed_len += text_len;
1861 else
1862 printed_len += log_store(facility, level,
1863 lflags | LOG_CONT, 0,
1864 dict, dictlen, text, text_len);
1865 } else {
1866 bool stored = false;
1867
1868 /*
1869 * If an earlier newline was missing and it was the same task,
1870 * either merge it with the current buffer and flush, or if
1871 * there was a race with interrupts (prefix == true) then just
1872 * flush it out and store this line separately.
1873 * If the preceding printk was from a different task and missed
1874 * a newline, flush and append the newline.
1875 */
1876 if (cont.len) {
1877 if (cont.owner == current && !(lflags & LOG_PREFIX))
1878 stored = cont_add(facility, level, text,
1879 text_len);
1880 cont_flush(LOG_NEWLINE);
1881 }
1882
1883 if (stored)
1884 printed_len += text_len;
1885 else
1886 printed_len += log_store(facility, level, lflags, 0,
1887 dict, dictlen, text, text_len);
1888 }
1889 1880
1890 logbuf_cpu = UINT_MAX; 1881 logbuf_cpu = UINT_MAX;
1891 raw_spin_unlock(&logbuf_lock); 1882 raw_spin_unlock(&logbuf_lock);
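For context, the continuation machinery reworked above is what services the usual pr_cont() idiom; the new 'c' case shows KERN_CONT arriving as a "\001c" prefix, which is apparently why the level parsing in vprintk_emit() became a loop (a message can carry both a level and a continuation marker). A minimal, illustrative caller:

static void cont_demo(void)
{
	pr_info("demo: step one ...");	/* no newline: buffered in struct cont */
	pr_cont(" step two ...");	/* same task, LOG_CONT: appended to the buffer */
	pr_cont(" done\n");		/* trailing newline: buffer flushed into the log */
}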
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 1d3b7665d0be..e6474f7272ec 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -73,6 +73,8 @@ void __ptrace_unlink(struct task_struct *child)
73{ 73{
74 BUG_ON(!child->ptrace); 74 BUG_ON(!child->ptrace);
75 75
76 clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
77
76 child->parent = child->real_parent; 78 child->parent = child->real_parent;
77 list_del_init(&child->ptrace_entry); 79 list_del_init(&child->ptrace_entry);
78 80
@@ -489,7 +491,6 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
489 491
490 /* Architecture-specific hardware disable .. */ 492 /* Architecture-specific hardware disable .. */
491 ptrace_disable(child); 493 ptrace_disable(child);
492 clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
493 494
494 write_lock_irq(&tasklist_lock); 495 write_lock_irq(&tasklist_lock);
495 /* 496 /*
@@ -536,7 +537,7 @@ int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst
536 int this_len, retval; 537 int this_len, retval;
537 538
538 this_len = (len > sizeof(buf)) ? sizeof(buf) : len; 539 this_len = (len > sizeof(buf)) ? sizeof(buf) : len;
539 retval = access_process_vm(tsk, src, buf, this_len, 0); 540 retval = access_process_vm(tsk, src, buf, this_len, FOLL_FORCE);
540 if (!retval) { 541 if (!retval) {
541 if (copied) 542 if (copied)
542 break; 543 break;
@@ -563,7 +564,8 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds
563 this_len = (len > sizeof(buf)) ? sizeof(buf) : len; 564 this_len = (len > sizeof(buf)) ? sizeof(buf) : len;
564 if (copy_from_user(buf, src, this_len)) 565 if (copy_from_user(buf, src, this_len))
565 return -EFAULT; 566 return -EFAULT;
566 retval = access_process_vm(tsk, dst, buf, this_len, 1); 567 retval = access_process_vm(tsk, dst, buf, this_len,
568 FOLL_FORCE | FOLL_WRITE);
567 if (!retval) { 569 if (!retval) {
568 if (copied) 570 if (copied)
569 break; 571 break;
@@ -1126,7 +1128,7 @@ int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr,
1126 unsigned long tmp; 1128 unsigned long tmp;
1127 int copied; 1129 int copied;
1128 1130
1129 copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), 0); 1131 copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), FOLL_FORCE);
1130 if (copied != sizeof(tmp)) 1132 if (copied != sizeof(tmp))
1131 return -EIO; 1133 return -EIO;
1132 return put_user(tmp, (unsigned long __user *)data); 1134 return put_user(tmp, (unsigned long __user *)data);
@@ -1137,7 +1139,8 @@ int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr,
1137{ 1139{
1138 int copied; 1140 int copied;
1139 1141
1140 copied = access_process_vm(tsk, addr, &data, sizeof(data), 1); 1142 copied = access_process_vm(tsk, addr, &data, sizeof(data),
1143 FOLL_FORCE | FOLL_WRITE);
1141 return (copied == sizeof(data)) ? 0 : -EIO; 1144 return (copied == sizeof(data)) ? 0 : -EIO;
1142} 1145}
1143 1146
@@ -1154,7 +1157,8 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
1154 switch (request) { 1157 switch (request) {
1155 case PTRACE_PEEKTEXT: 1158 case PTRACE_PEEKTEXT:
1156 case PTRACE_PEEKDATA: 1159 case PTRACE_PEEKDATA:
1157 ret = access_process_vm(child, addr, &word, sizeof(word), 0); 1160 ret = access_process_vm(child, addr, &word, sizeof(word),
1161 FOLL_FORCE);
1158 if (ret != sizeof(word)) 1162 if (ret != sizeof(word))
1159 ret = -EIO; 1163 ret = -EIO;
1160 else 1164 else
@@ -1163,7 +1167,8 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
1163 1167
1164 case PTRACE_POKETEXT: 1168 case PTRACE_POKETEXT:
1165 case PTRACE_POKEDATA: 1169 case PTRACE_POKEDATA:
1166 ret = access_process_vm(child, addr, &data, sizeof(data), 1); 1170 ret = access_process_vm(child, addr, &data, sizeof(data),
1171 FOLL_FORCE | FOLL_WRITE);
1167 ret = (ret != sizeof(data) ? -EIO : 0); 1172 ret = (ret != sizeof(data) ? -EIO : 0);
1168 break; 1173 break;
1169 1174
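The ptrace hunks above all follow the same conversion: access_process_vm() no longer takes a 0/1 "write" argument but a gup_flags mask, so reads pass FOLL_FORCE and writes pass FOLL_FORCE | FOLL_WRITE. A minimal sketch of a caller using the new convention (the helper names are invented and only illustrate the flag usage):

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/errno.h>

/* Hypothetical peek/poke helpers built on the gup_flags-based interface. */
static int example_peek_word(struct task_struct *tsk, unsigned long addr,
                             unsigned long *out)
{
        /* FOLL_FORCE overrides VM protections, as ptrace always has. */
        int copied = access_process_vm(tsk, addr, out, sizeof(*out),
                                       FOLL_FORCE);

        return copied == sizeof(*out) ? 0 : -EIO;
}

static int example_poke_word(struct task_struct *tsk, unsigned long addr,
                             unsigned long val)
{
        /* Writes additionally need FOLL_WRITE, replacing the old write=1. */
        int copied = access_process_vm(tsk, addr, &val, sizeof(val),
                                       FOLL_FORCE | FOLL_WRITE);

        return copied == sizeof(val) ? 0 : -EIO;
}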
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index d38ab08a3fe7..123ccbd22449 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -52,7 +52,7 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>");
52 52
53#define PERF_FLAG "-perf:" 53#define PERF_FLAG "-perf:"
54#define PERFOUT_STRING(s) \ 54#define PERFOUT_STRING(s) \
55 pr_alert("%s" PERF_FLAG s "\n", perf_type) 55 pr_alert("%s" PERF_FLAG " %s\n", perf_type, s)
56#define VERBOSE_PERFOUT_STRING(s) \ 56#define VERBOSE_PERFOUT_STRING(s) \
57 do { if (verbose) pr_alert("%s" PERF_FLAG " %s\n", perf_type, s); } while (0) 57 do { if (verbose) pr_alert("%s" PERF_FLAG " %s\n", perf_type, s); } while (0)
58#define VERBOSE_PERFOUT_ERRSTRING(s) \ 58#define VERBOSE_PERFOUT_ERRSTRING(s) \
@@ -400,9 +400,8 @@ rcu_perf_writer(void *arg)
400 sp.sched_priority = 0; 400 sp.sched_priority = 0;
401 sched_setscheduler_nocheck(current, 401 sched_setscheduler_nocheck(current,
402 SCHED_NORMAL, &sp); 402 SCHED_NORMAL, &sp);
403 pr_alert("%s" PERF_FLAG 403 pr_alert("%s%s rcu_perf_writer %ld has %d measurements\n",
404 "rcu_perf_writer %ld has %d measurements\n", 404 perf_type, PERF_FLAG, me, MIN_MEAS);
405 perf_type, me, MIN_MEAS);
406 if (atomic_inc_return(&n_rcu_perf_writer_finished) >= 405 if (atomic_inc_return(&n_rcu_perf_writer_finished) >=
407 nrealwriters) { 406 nrealwriters) {
408 schedule_timeout_interruptible(10); 407 schedule_timeout_interruptible(10);
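Two practical effects of the PERFOUT_STRING() change above: the output now carries the same "<perf_type>-perf: <string>" shape as VERBOSE_PERFOUT_STRING(), and the argument is passed through a "%s" conversion instead of being pasted into the format, so it no longer has to be a string literal. Illustration only (assumes perf_type and PERF_FLAG as defined in rcuperf.c):

/* Old: s is concatenated into the format, so it must be a string literal. */
#define PERFOUT_STRING_OLD(s)  pr_alert("%s" PERF_FLAG s "\n", perf_type)

/* New: s is an ordinary argument, so runtime strings work as well. */
#define PERFOUT_STRING_NEW(s)  pr_alert("%s" PERF_FLAG " %s\n", perf_type, s)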
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 971e2b138063..bf08fee53dc7 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1238,6 +1238,7 @@ rcu_torture_stats_print(void)
1238 long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; 1238 long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
1239 long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; 1239 long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
1240 static unsigned long rtcv_snap = ULONG_MAX; 1240 static unsigned long rtcv_snap = ULONG_MAX;
1241 struct task_struct *wtp;
1241 1242
1242 for_each_possible_cpu(cpu) { 1243 for_each_possible_cpu(cpu) {
1243 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { 1244 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
@@ -1258,8 +1259,9 @@ rcu_torture_stats_print(void)
1258 atomic_read(&n_rcu_torture_alloc), 1259 atomic_read(&n_rcu_torture_alloc),
1259 atomic_read(&n_rcu_torture_alloc_fail), 1260 atomic_read(&n_rcu_torture_alloc_fail),
1260 atomic_read(&n_rcu_torture_free)); 1261 atomic_read(&n_rcu_torture_free));
1261 pr_cont("rtmbe: %d rtbke: %ld rtbre: %ld ", 1262 pr_cont("rtmbe: %d rtbe: %ld rtbke: %ld rtbre: %ld ",
1262 atomic_read(&n_rcu_torture_mberror), 1263 atomic_read(&n_rcu_torture_mberror),
1264 n_rcu_torture_barrier_error,
1263 n_rcu_torture_boost_ktrerror, 1265 n_rcu_torture_boost_ktrerror,
1264 n_rcu_torture_boost_rterror); 1266 n_rcu_torture_boost_rterror);
1265 pr_cont("rtbf: %ld rtb: %ld nt: %ld ", 1267 pr_cont("rtbf: %ld rtb: %ld nt: %ld ",
@@ -1312,10 +1314,12 @@ rcu_torture_stats_print(void)
1312 1314
1313 rcutorture_get_gp_data(cur_ops->ttype, 1315 rcutorture_get_gp_data(cur_ops->ttype,
1314 &flags, &gpnum, &completed); 1316 &flags, &gpnum, &completed);
1315 pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x\n", 1317 wtp = READ_ONCE(writer_task);
1318 pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx\n",
1316 rcu_torture_writer_state_getname(), 1319 rcu_torture_writer_state_getname(),
1317 rcu_torture_writer_state, 1320 rcu_torture_writer_state,
1318 gpnum, completed, flags); 1321 gpnum, completed, flags,
1322 wtp == NULL ? ~0UL : wtp->state);
1319 show_rcu_gp_kthreads(); 1323 show_rcu_gp_kthreads();
1320 rcu_ftrace_dump(DUMP_ALL); 1324 rcu_ftrace_dump(DUMP_ALL);
1321 } 1325 }
@@ -1362,12 +1366,12 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
1362 onoff_interval, onoff_holdoff); 1366 onoff_interval, onoff_holdoff);
1363} 1367}
1364 1368
1365static void rcutorture_booster_cleanup(int cpu) 1369static int rcutorture_booster_cleanup(unsigned int cpu)
1366{ 1370{
1367 struct task_struct *t; 1371 struct task_struct *t;
1368 1372
1369 if (boost_tasks[cpu] == NULL) 1373 if (boost_tasks[cpu] == NULL)
1370 return; 1374 return 0;
1371 mutex_lock(&boost_mutex); 1375 mutex_lock(&boost_mutex);
1372 t = boost_tasks[cpu]; 1376 t = boost_tasks[cpu];
1373 boost_tasks[cpu] = NULL; 1377 boost_tasks[cpu] = NULL;
@@ -1375,9 +1379,10 @@ static void rcutorture_booster_cleanup(int cpu)
1375 1379
1376 /* This must be outside of the mutex, otherwise deadlock! */ 1380 /* This must be outside of the mutex, otherwise deadlock! */
1377 torture_stop_kthread(rcu_torture_boost, t); 1381 torture_stop_kthread(rcu_torture_boost, t);
1382 return 0;
1378} 1383}
1379 1384
1380static int rcutorture_booster_init(int cpu) 1385static int rcutorture_booster_init(unsigned int cpu)
1381{ 1386{
1382 int retval; 1387 int retval;
1383 1388
@@ -1577,28 +1582,7 @@ static void rcu_torture_barrier_cleanup(void)
1577 } 1582 }
1578} 1583}
1579 1584
1580static int rcutorture_cpu_notify(struct notifier_block *self, 1585static enum cpuhp_state rcutor_hp;
1581 unsigned long action, void *hcpu)
1582{
1583 long cpu = (long)hcpu;
1584
1585 switch (action & ~CPU_TASKS_FROZEN) {
1586 case CPU_ONLINE:
1587 case CPU_DOWN_FAILED:
1588 (void)rcutorture_booster_init(cpu);
1589 break;
1590 case CPU_DOWN_PREPARE:
1591 rcutorture_booster_cleanup(cpu);
1592 break;
1593 default:
1594 break;
1595 }
1596 return NOTIFY_OK;
1597}
1598
1599static struct notifier_block rcutorture_cpu_nb = {
1600 .notifier_call = rcutorture_cpu_notify,
1601};
1602 1586
1603static void 1587static void
1604rcu_torture_cleanup(void) 1588rcu_torture_cleanup(void)
@@ -1638,11 +1622,8 @@ rcu_torture_cleanup(void)
1638 for (i = 0; i < ncbflooders; i++) 1622 for (i = 0; i < ncbflooders; i++)
1639 torture_stop_kthread(rcu_torture_cbflood, cbflood_task[i]); 1623 torture_stop_kthread(rcu_torture_cbflood, cbflood_task[i]);
1640 if ((test_boost == 1 && cur_ops->can_boost) || 1624 if ((test_boost == 1 && cur_ops->can_boost) ||
1641 test_boost == 2) { 1625 test_boost == 2)
1642 unregister_cpu_notifier(&rcutorture_cpu_nb); 1626 cpuhp_remove_state(rcutor_hp);
1643 for_each_possible_cpu(i)
1644 rcutorture_booster_cleanup(i);
1645 }
1646 1627
1647 /* 1628 /*
1648 * Wait for all RCU callbacks to fire, then do flavor-specific 1629 * Wait for all RCU callbacks to fire, then do flavor-specific
@@ -1869,14 +1850,13 @@ rcu_torture_init(void)
1869 test_boost == 2) { 1850 test_boost == 2) {
1870 1851
1871 boost_starttime = jiffies + test_boost_interval * HZ; 1852 boost_starttime = jiffies + test_boost_interval * HZ;
1872 register_cpu_notifier(&rcutorture_cpu_nb); 1853
1873 for_each_possible_cpu(i) { 1854 firsterr = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "RCU_TORTURE",
1874 if (cpu_is_offline(i)) 1855 rcutorture_booster_init,
1875 continue; /* Heuristic: CPU can go offline. */ 1856 rcutorture_booster_cleanup);
1876 firsterr = rcutorture_booster_init(i); 1857 if (firsterr < 0)
1877 if (firsterr) 1858 goto unwind;
1878 goto unwind; 1859 rcutor_hp = firsterr;
1879 }
1880 } 1860 }
1881 firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup); 1861 firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup);
1882 if (firsterr) 1862 if (firsterr)
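The rcutorture conversion above is the standard pattern for a dynamically allocated hotplug state: both callbacks take an unsigned int cpu and return 0 on success, cpuhp_setup_state() runs the startup callback on every already-online CPU and returns the allocated state number, and cpuhp_remove_state() runs the teardown for online CPUs and releases the state. A standalone sketch with invented names:

#include <linux/cpuhotplug.h>

static enum cpuhp_state example_hp_state;

static int example_cpu_online(unsigned int cpu)
{
        /* Set up this CPU's resource; return 0 on success. */
        return 0;
}

static int example_cpu_offline(unsigned int cpu)
{
        /* Tear the resource back down before the CPU goes away. */
        return 0;
}

static int __init example_register(void)
{
        int ret;

        ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "example:online",
                                example_cpu_online, example_cpu_offline);
        if (ret < 0)
                return ret;
        example_hp_state = ret;         /* dynamic states return the slot used */
        return 0;
}

static void example_unregister(void)
{
        cpuhp_remove_state(example_hp_state);
}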
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
index be922c9f3d37..50d1861f7759 100644
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -68,6 +68,8 @@ void rcu_sync_lockdep_assert(struct rcu_sync *rsp)
68 RCU_LOCKDEP_WARN(!gp_ops[rsp->gp_type].held(), 68 RCU_LOCKDEP_WARN(!gp_ops[rsp->gp_type].held(),
69 "suspicious rcu_sync_is_idle() usage"); 69 "suspicious rcu_sync_is_idle() usage");
70} 70}
71
72EXPORT_SYMBOL_GPL(rcu_sync_lockdep_assert);
71#endif 73#endif
72 74
73/** 75/**
@@ -83,6 +85,18 @@ void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type)
83} 85}
84 86
85/** 87/**
88 * Must be called after rcu_sync_init() and before first use.
89 *
90 * Ensures rcu_sync_is_idle() returns false and rcu_sync_{enter,exit}()
91 * pairs turn into NO-OPs.
92 */
93void rcu_sync_enter_start(struct rcu_sync *rsp)
94{
95 rsp->gp_count++;
96 rsp->gp_state = GP_PASSED;
97}
98
99/**
86 * rcu_sync_enter() - Force readers onto slowpath 100 * rcu_sync_enter() - Force readers onto slowpath
87 * @rsp: Pointer to rcu_sync structure to use for synchronization 101 * @rsp: Pointer to rcu_sync structure to use for synchronization
88 * 102 *
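rcu_sync_enter_start() added above lets an updater start out in the GP_PASSED state, i.e. with readers already forced onto the slow path, without waiting for a grace period at init time. A minimal usage sketch under that assumption (names invented):

#include <linux/rcu_sync.h>

static struct rcu_sync example_rss;

static int __init example_init(void)
{
        rcu_sync_init(&example_rss, RCU_SCHED_SYNC);
        /* Behave as if rcu_sync_enter() had already completed. */
        rcu_sync_enter_start(&example_rss);
        return 0;
}

static void example_enable_fastpath(void)
{
        /*
         * Pairs with the _start() above; readers regain the fast path
         * once a further grace period has elapsed.
         */
        rcu_sync_exit(&example_rss);
}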
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 944b1b491ed8..1898559e6b60 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -170,7 +170,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
170 false)); 170 false));
171} 171}
172 172
173static void rcu_process_callbacks(struct softirq_action *unused) 173static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused)
174{ 174{
175 __rcu_process_callbacks(&rcu_sched_ctrlblk); 175 __rcu_process_callbacks(&rcu_sched_ctrlblk);
176 __rcu_process_callbacks(&rcu_bh_ctrlblk); 176 __rcu_process_callbacks(&rcu_bh_ctrlblk);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 5d80925e7fc8..69a5611a7e7c 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -41,7 +41,6 @@
41#include <linux/export.h> 41#include <linux/export.h>
42#include <linux/completion.h> 42#include <linux/completion.h>
43#include <linux/moduleparam.h> 43#include <linux/moduleparam.h>
44#include <linux/module.h>
45#include <linux/percpu.h> 44#include <linux/percpu.h>
46#include <linux/notifier.h> 45#include <linux/notifier.h>
47#include <linux/cpu.h> 46#include <linux/cpu.h>
@@ -60,7 +59,6 @@
60#include "tree.h" 59#include "tree.h"
61#include "rcu.h" 60#include "rcu.h"
62 61
63MODULE_ALIAS("rcutree");
64#ifdef MODULE_PARAM_PREFIX 62#ifdef MODULE_PARAM_PREFIX
65#undef MODULE_PARAM_PREFIX 63#undef MODULE_PARAM_PREFIX
66#endif 64#endif
@@ -1848,6 +1846,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
1848 struct rcu_data *rdp) 1846 struct rcu_data *rdp)
1849{ 1847{
1850 bool ret; 1848 bool ret;
1849 bool need_gp;
1851 1850
1852 /* Handle the ends of any preceding grace periods first. */ 1851 /* Handle the ends of any preceding grace periods first. */
1853 if (rdp->completed == rnp->completed && 1852 if (rdp->completed == rnp->completed &&
@@ -1874,9 +1873,10 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
1874 */ 1873 */
1875 rdp->gpnum = rnp->gpnum; 1874 rdp->gpnum = rnp->gpnum;
1876 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); 1875 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart"));
1877 rdp->cpu_no_qs.b.norm = true; 1876 need_gp = !!(rnp->qsmask & rdp->grpmask);
1877 rdp->cpu_no_qs.b.norm = need_gp;
1878 rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); 1878 rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
1879 rdp->core_needs_qs = !!(rnp->qsmask & rdp->grpmask); 1879 rdp->core_needs_qs = need_gp;
1880 zero_cpu_stall_ticks(rdp); 1880 zero_cpu_stall_ticks(rdp);
1881 WRITE_ONCE(rdp->gpwrap, false); 1881 WRITE_ONCE(rdp->gpwrap, false);
1882 } 1882 }
@@ -2344,7 +2344,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
2344 WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); 2344 WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
2345 WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); 2345 WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
2346 raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags); 2346 raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags);
2347 swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */ 2347 rcu_gp_kthread_wake(rsp);
2348} 2348}
2349 2349
2350/* 2350/*
@@ -2970,7 +2970,7 @@ static void force_quiescent_state(struct rcu_state *rsp)
2970 } 2970 }
2971 WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); 2971 WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
2972 raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags); 2972 raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags);
2973 swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */ 2973 rcu_gp_kthread_wake(rsp);
2974} 2974}
2975 2975
2976/* 2976/*
@@ -3013,7 +3013,7 @@ __rcu_process_callbacks(struct rcu_state *rsp)
3013/* 3013/*
3014 * Do RCU core processing for the current CPU. 3014 * Do RCU core processing for the current CPU.
3015 */ 3015 */
3016static void rcu_process_callbacks(struct softirq_action *unused) 3016static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused)
3017{ 3017{
3018 struct rcu_state *rsp; 3018 struct rcu_state *rsp;
3019 3019
@@ -3792,8 +3792,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
3792 rnp = rdp->mynode; 3792 rnp = rdp->mynode;
3793 mask = rdp->grpmask; 3793 mask = rdp->grpmask;
3794 raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ 3794 raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
3795 rnp->qsmaskinitnext |= mask;
3796 rnp->expmaskinitnext |= mask;
3797 if (!rdp->beenonline) 3795 if (!rdp->beenonline)
3798 WRITE_ONCE(rsp->ncpus, READ_ONCE(rsp->ncpus) + 1); 3796 WRITE_ONCE(rsp->ncpus, READ_ONCE(rsp->ncpus) + 1);
3799 rdp->beenonline = true; /* We have now been online. */ 3797 rdp->beenonline = true; /* We have now been online. */
@@ -3860,6 +3858,32 @@ int rcutree_dead_cpu(unsigned int cpu)
3860 return 0; 3858 return 0;
3861} 3859}
3862 3860
3861/*
3862 * Mark the specified CPU as being online so that subsequent grace periods
3863 * (both expedited and normal) will wait on it. Note that this means that
3864 * incoming CPUs are not allowed to use RCU read-side critical sections
3865 * until this function is called. Failing to observe this restriction
3866 * will result in lockdep splats.
3867 */
3868void rcu_cpu_starting(unsigned int cpu)
3869{
3870 unsigned long flags;
3871 unsigned long mask;
3872 struct rcu_data *rdp;
3873 struct rcu_node *rnp;
3874 struct rcu_state *rsp;
3875
3876 for_each_rcu_flavor(rsp) {
3877 rdp = this_cpu_ptr(rsp->rda);
3878 rnp = rdp->mynode;
3879 mask = rdp->grpmask;
3880 raw_spin_lock_irqsave_rcu_node(rnp, flags);
3881 rnp->qsmaskinitnext |= mask;
3882 rnp->expmaskinitnext |= mask;
3883 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
3884 }
3885}
3886
3863#ifdef CONFIG_HOTPLUG_CPU 3887#ifdef CONFIG_HOTPLUG_CPU
3864/* 3888/*
3865 * The CPU is exiting the idle loop into the arch_cpu_idle_dead() 3889 * The CPU is exiting the idle loop into the arch_cpu_idle_dead()
@@ -4209,8 +4233,10 @@ void __init rcu_init(void)
4209 * or the scheduler are operational. 4233 * or the scheduler are operational.
4210 */ 4234 */
4211 pm_notifier(rcu_pm_notify, 0); 4235 pm_notifier(rcu_pm_notify, 0);
4212 for_each_online_cpu(cpu) 4236 for_each_online_cpu(cpu) {
4213 rcutree_prepare_cpu(cpu); 4237 rcutree_prepare_cpu(cpu);
4238 rcu_cpu_starting(cpu);
4239 }
4214} 4240}
4215 4241
4216#include "tree_exp.h" 4242#include "tree_exp.h"
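rcu_cpu_starting() added above must run for the incoming CPU before that CPU executes any RCU read-side critical section, i.e. very early in the bringup path with interrupts still disabled. The exact hook point is outside this file; the sketch below only shows the intended placement (the surrounding function is hypothetical):

/* Runs on the incoming CPU early during bringup, interrupts disabled. */
void example_notify_cpu_starting(unsigned int cpu)
{
        /*
         * From here on, both normal and expedited grace periods wait on
         * this CPU, so its RCU readers are safe.
         */
        rcu_cpu_starting(cpu);
}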
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index f714f873bf9d..e99a5234d9ed 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -400,6 +400,7 @@ struct rcu_data {
400#ifdef CONFIG_RCU_FAST_NO_HZ 400#ifdef CONFIG_RCU_FAST_NO_HZ
401 struct rcu_head oom_head; 401 struct rcu_head oom_head;
402#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ 402#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
403 atomic_long_t exp_workdone0; /* # done by workqueue. */
403 atomic_long_t exp_workdone1; /* # done by others #1. */ 404 atomic_long_t exp_workdone1; /* # done by others #1. */
404 atomic_long_t exp_workdone2; /* # done by others #2. */ 405 atomic_long_t exp_workdone2; /* # done by others #2. */
405 atomic_long_t exp_workdone3; /* # done by others #3. */ 406 atomic_long_t exp_workdone3; /* # done by others #3. */
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 6d86ab6ec2c9..24343eb87b58 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -359,7 +359,8 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
359 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); 359 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
360 360
361 if (raw_smp_processor_id() == cpu || 361 if (raw_smp_processor_id() == cpu ||
362 !(atomic_add_return(0, &rdtp->dynticks) & 0x1)) 362 !(atomic_add_return(0, &rdtp->dynticks) & 0x1) ||
363 !(rnp->qsmaskinitnext & rdp->grpmask))
363 mask_ofl_test |= rdp->grpmask; 364 mask_ofl_test |= rdp->grpmask;
364 } 365 }
365 mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; 366 mask_ofl_ipi = rnp->expmask & ~mask_ofl_test;
@@ -384,17 +385,16 @@ retry_ipi:
384 mask_ofl_ipi &= ~mask; 385 mask_ofl_ipi &= ~mask;
385 continue; 386 continue;
386 } 387 }
387 /* Failed, raced with offline. */ 388 /* Failed, raced with CPU hotplug operation. */
388 raw_spin_lock_irqsave_rcu_node(rnp, flags); 389 raw_spin_lock_irqsave_rcu_node(rnp, flags);
389 if (cpu_online(cpu) && 390 if ((rnp->qsmaskinitnext & mask) &&
390 (rnp->expmask & mask)) { 391 (rnp->expmask & mask)) {
392 /* Online, so delay for a bit and try again. */
391 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 393 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
392 schedule_timeout_uninterruptible(1); 394 schedule_timeout_uninterruptible(1);
393 if (cpu_online(cpu) && 395 goto retry_ipi;
394 (rnp->expmask & mask))
395 goto retry_ipi;
396 raw_spin_lock_irqsave_rcu_node(rnp, flags);
397 } 396 }
397 /* CPU really is offline, so we can ignore it. */
398 if (!(rnp->expmask & mask)) 398 if (!(rnp->expmask & mask))
399 mask_ofl_ipi &= ~mask; 399 mask_ofl_ipi &= ~mask;
400 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 400 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
@@ -427,12 +427,10 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
427 jiffies_stall); 427 jiffies_stall);
428 if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root)) 428 if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root))
429 return; 429 return;
430 if (ret < 0) { 430 WARN_ON(ret < 0); /* workqueues should not be signaled. */
431 /* Hit a signal, disable CPU stall warnings. */ 431 if (rcu_cpu_stall_suppress)
432 swait_event(rsp->expedited_wq, 432 continue;
433 sync_rcu_preempt_exp_done(rnp_root)); 433 panic_on_rcu_stall();
434 return;
435 }
436 pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", 434 pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {",
437 rsp->name); 435 rsp->name);
438 ndetected = 0; 436 ndetected = 0;
@@ -500,7 +498,6 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s)
500 * next GP, to proceed. 498 * next GP, to proceed.
501 */ 499 */
502 mutex_lock(&rsp->exp_wake_mutex); 500 mutex_lock(&rsp->exp_wake_mutex);
503 mutex_unlock(&rsp->exp_mutex);
504 501
505 rcu_for_each_node_breadth_first(rsp, rnp) { 502 rcu_for_each_node_breadth_first(rsp, rnp) {
506 if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) { 503 if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) {
@@ -516,6 +513,70 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s)
516 mutex_unlock(&rsp->exp_wake_mutex); 513 mutex_unlock(&rsp->exp_wake_mutex);
517} 514}
518 515
516/* Let the workqueue handler know what it is supposed to do. */
517struct rcu_exp_work {
518 smp_call_func_t rew_func;
519 struct rcu_state *rew_rsp;
520 unsigned long rew_s;
521 struct work_struct rew_work;
522};
523
524/*
525 * Work-queue handler to drive an expedited grace period forward.
526 */
527static void wait_rcu_exp_gp(struct work_struct *wp)
528{
529 struct rcu_exp_work *rewp;
530
531 /* Initialize the rcu_node tree in preparation for the wait. */
532 rewp = container_of(wp, struct rcu_exp_work, rew_work);
533 sync_rcu_exp_select_cpus(rewp->rew_rsp, rewp->rew_func);
534
535 /* Wait and clean up, including waking everyone. */
536 rcu_exp_wait_wake(rewp->rew_rsp, rewp->rew_s);
537}
538
539/*
540 * Given an rcu_state pointer and a smp_call_function() handler, kick
541 * off the specified flavor of expedited grace period.
542 */
543static void _synchronize_rcu_expedited(struct rcu_state *rsp,
544 smp_call_func_t func)
545{
546 struct rcu_data *rdp;
547 struct rcu_exp_work rew;
548 struct rcu_node *rnp;
549 unsigned long s;
550
551 /* If expedited grace periods are prohibited, fall back to normal. */
552 if (rcu_gp_is_normal()) {
553 wait_rcu_gp(rsp->call);
554 return;
555 }
556
557 /* Take a snapshot of the sequence number. */
558 s = rcu_exp_gp_seq_snap(rsp);
559 if (exp_funnel_lock(rsp, s))
560 return; /* Someone else did our work for us. */
561
562 /* Marshall arguments and schedule the expedited grace period. */
563 rew.rew_func = func;
564 rew.rew_rsp = rsp;
565 rew.rew_s = s;
566 INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp);
567 schedule_work(&rew.rew_work);
568
569 /* Wait for expedited grace period to complete. */
570 rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id());
571 rnp = rcu_get_root(rsp);
572 wait_event(rnp->exp_wq[(s >> 1) & 0x3],
573 sync_exp_work_done(rsp,
574 &rdp->exp_workdone0, s));
575
576 /* Let the next expedited grace period start. */
577 mutex_unlock(&rsp->exp_mutex);
578}
579
519/** 580/**
520 * synchronize_sched_expedited - Brute-force RCU-sched grace period 581 * synchronize_sched_expedited - Brute-force RCU-sched grace period
521 * 582 *
@@ -534,29 +595,13 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s)
534 */ 595 */
535void synchronize_sched_expedited(void) 596void synchronize_sched_expedited(void)
536{ 597{
537 unsigned long s;
538 struct rcu_state *rsp = &rcu_sched_state; 598 struct rcu_state *rsp = &rcu_sched_state;
539 599
540 /* If only one CPU, this is automatically a grace period. */ 600 /* If only one CPU, this is automatically a grace period. */
541 if (rcu_blocking_is_gp()) 601 if (rcu_blocking_is_gp())
542 return; 602 return;
543 603
544 /* If expedited grace periods are prohibited, fall back to normal. */ 604 _synchronize_rcu_expedited(rsp, sync_sched_exp_handler);
545 if (rcu_gp_is_normal()) {
546 wait_rcu_gp(call_rcu_sched);
547 return;
548 }
549
550 /* Take a snapshot of the sequence number. */
551 s = rcu_exp_gp_seq_snap(rsp);
552 if (exp_funnel_lock(rsp, s))
553 return; /* Someone else did our work for us. */
554
555 /* Initialize the rcu_node tree in preparation for the wait. */
556 sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler);
557
558 /* Wait and clean up, including waking everyone. */
559 rcu_exp_wait_wake(rsp, s);
560} 605}
561EXPORT_SYMBOL_GPL(synchronize_sched_expedited); 606EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
562 607
@@ -620,23 +665,8 @@ static void sync_rcu_exp_handler(void *info)
620void synchronize_rcu_expedited(void) 665void synchronize_rcu_expedited(void)
621{ 666{
622 struct rcu_state *rsp = rcu_state_p; 667 struct rcu_state *rsp = rcu_state_p;
623 unsigned long s;
624
625 /* If expedited grace periods are prohibited, fall back to normal. */
626 if (rcu_gp_is_normal()) {
627 wait_rcu_gp(call_rcu);
628 return;
629 }
630
631 s = rcu_exp_gp_seq_snap(rsp);
632 if (exp_funnel_lock(rsp, s))
633 return; /* Someone else did our work for us. */
634
635 /* Initialize the rcu_node tree in preparation for the wait. */
636 sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler);
637 668
638 /* Wait for ->blkd_tasks lists to drain, then wake everyone up. */ 669 _synchronize_rcu_expedited(rsp, sync_rcu_exp_handler);
639 rcu_exp_wait_wake(rsp, s);
640} 670}
641EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); 671EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
642 672
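_synchronize_rcu_expedited() above offloads the grace-period machinery to a work item that lives on the caller's stack and then blocks until the handler has finished. Stripped of the RCU specifics, the pattern is roughly this (all names invented; a completion stands in for the rnp->exp_wq wait):

#include <linux/workqueue.h>
#include <linux/completion.h>
#include <linux/kernel.h>

struct example_work {
        struct work_struct work;
        struct completion done;
        int arg;
};

static void example_work_fn(struct work_struct *wp)
{
        struct example_work *ew = container_of(wp, struct example_work, work);

        /* ... drive the operation forward using ew->arg ... */
        complete(&ew->done);
}

static void example_run_and_wait(int arg)
{
        struct example_work ew = { .arg = arg };

        init_completion(&ew.done);
        INIT_WORK_ONSTACK(&ew.work, example_work_fn);
        schedule_work(&ew.work);
        wait_for_completion(&ew.done);
        destroy_work_on_stack(&ew.work);
}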
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 0082fce402a0..85c5a883c6e3 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -2173,6 +2173,7 @@ static int rcu_nocb_kthread(void *arg)
2173 cl++; 2173 cl++;
2174 c++; 2174 c++;
2175 local_bh_enable(); 2175 local_bh_enable();
2176 cond_resched_rcu_qs();
2176 list = next; 2177 list = next;
2177 } 2178 }
2178 trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1); 2179 trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1);
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
index 86782f9a4604..b1f28972872c 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -185,16 +185,17 @@ static int show_rcuexp(struct seq_file *m, void *v)
185 int cpu; 185 int cpu;
186 struct rcu_state *rsp = (struct rcu_state *)m->private; 186 struct rcu_state *rsp = (struct rcu_state *)m->private;
187 struct rcu_data *rdp; 187 struct rcu_data *rdp;
188 unsigned long s1 = 0, s2 = 0, s3 = 0; 188 unsigned long s0 = 0, s1 = 0, s2 = 0, s3 = 0;
189 189
190 for_each_possible_cpu(cpu) { 190 for_each_possible_cpu(cpu) {
191 rdp = per_cpu_ptr(rsp->rda, cpu); 191 rdp = per_cpu_ptr(rsp->rda, cpu);
192 s0 += atomic_long_read(&rdp->exp_workdone0);
192 s1 += atomic_long_read(&rdp->exp_workdone1); 193 s1 += atomic_long_read(&rdp->exp_workdone1);
193 s2 += atomic_long_read(&rdp->exp_workdone2); 194 s2 += atomic_long_read(&rdp->exp_workdone2);
194 s3 += atomic_long_read(&rdp->exp_workdone3); 195 s3 += atomic_long_read(&rdp->exp_workdone3);
195 } 196 }
196 seq_printf(m, "s=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", 197 seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n",
197 rsp->expedited_sequence, s1, s2, s3, 198 rsp->expedited_sequence, s0, s1, s2, s3,
198 atomic_long_read(&rsp->expedited_normal), 199 atomic_long_read(&rsp->expedited_normal),
199 atomic_read(&rsp->expedited_need_qs), 200 atomic_read(&rsp->expedited_need_qs),
200 rsp->expedited_sequence / 2); 201 rsp->expedited_sequence / 2);
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index f0d8322bc3ec..f19271dce0a9 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -46,7 +46,7 @@
46#include <linux/export.h> 46#include <linux/export.h>
47#include <linux/hardirq.h> 47#include <linux/hardirq.h>
48#include <linux/delay.h> 48#include <linux/delay.h>
49#include <linux/module.h> 49#include <linux/moduleparam.h>
50#include <linux/kthread.h> 50#include <linux/kthread.h>
51#include <linux/tick.h> 51#include <linux/tick.h>
52 52
@@ -54,7 +54,6 @@
54 54
55#include "rcu.h" 55#include "rcu.h"
56 56
57MODULE_ALIAS("rcupdate");
58#ifdef MODULE_PARAM_PREFIX 57#ifdef MODULE_PARAM_PREFIX
59#undef MODULE_PARAM_PREFIX 58#undef MODULE_PARAM_PREFIX
60#endif 59#endif
diff --git a/kernel/relay.c b/kernel/relay.c
index d797502140b9..da79a109dbeb 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -214,7 +214,7 @@ static void relay_destroy_buf(struct rchan_buf *buf)
214 __free_page(buf->page_array[i]); 214 __free_page(buf->page_array[i]);
215 relay_free_page_array(buf->page_array); 215 relay_free_page_array(buf->page_array);
216 } 216 }
217 chan->buf[buf->cpu] = NULL; 217 *per_cpu_ptr(chan->buf, buf->cpu) = NULL;
218 kfree(buf->padding); 218 kfree(buf->padding);
219 kfree(buf); 219 kfree(buf);
220 kref_put(&chan->kref, relay_destroy_channel); 220 kref_put(&chan->kref, relay_destroy_channel);
@@ -328,13 +328,15 @@ static struct rchan_callbacks default_channel_callbacks = {
328 328
329/** 329/**
330 * wakeup_readers - wake up readers waiting on a channel 330 * wakeup_readers - wake up readers waiting on a channel
331 * @data: contains the channel buffer 331 * @work: contains the channel buffer
332 * 332 *
333 * This is the timer function used to defer reader waking. 333 * This is the function used to defer reader waking
334 */ 334 */
335static void wakeup_readers(unsigned long data) 335static void wakeup_readers(struct irq_work *work)
336{ 336{
337 struct rchan_buf *buf = (struct rchan_buf *)data; 337 struct rchan_buf *buf;
338
339 buf = container_of(work, struct rchan_buf, wakeup_work);
338 wake_up_interruptible(&buf->read_wait); 340 wake_up_interruptible(&buf->read_wait);
339} 341}
340 342
@@ -352,9 +354,10 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init)
352 if (init) { 354 if (init) {
353 init_waitqueue_head(&buf->read_wait); 355 init_waitqueue_head(&buf->read_wait);
354 kref_init(&buf->kref); 356 kref_init(&buf->kref);
355 setup_timer(&buf->timer, wakeup_readers, (unsigned long)buf); 357 init_irq_work(&buf->wakeup_work, wakeup_readers);
356 } else 358 } else {
357 del_timer_sync(&buf->timer); 359 irq_work_sync(&buf->wakeup_work);
360 }
358 361
359 buf->subbufs_produced = 0; 362 buf->subbufs_produced = 0;
360 buf->subbufs_consumed = 0; 363 buf->subbufs_consumed = 0;
@@ -382,20 +385,21 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init)
382 */ 385 */
383void relay_reset(struct rchan *chan) 386void relay_reset(struct rchan *chan)
384{ 387{
388 struct rchan_buf *buf;
385 unsigned int i; 389 unsigned int i;
386 390
387 if (!chan) 391 if (!chan)
388 return; 392 return;
389 393
390 if (chan->is_global && chan->buf[0]) { 394 if (chan->is_global && (buf = *per_cpu_ptr(chan->buf, 0))) {
391 __relay_reset(chan->buf[0], 0); 395 __relay_reset(buf, 0);
392 return; 396 return;
393 } 397 }
394 398
395 mutex_lock(&relay_channels_mutex); 399 mutex_lock(&relay_channels_mutex);
396 for_each_possible_cpu(i) 400 for_each_possible_cpu(i)
397 if (chan->buf[i]) 401 if ((buf = *per_cpu_ptr(chan->buf, i)))
398 __relay_reset(chan->buf[i], 0); 402 __relay_reset(buf, 0);
399 mutex_unlock(&relay_channels_mutex); 403 mutex_unlock(&relay_channels_mutex);
400} 404}
401EXPORT_SYMBOL_GPL(relay_reset); 405EXPORT_SYMBOL_GPL(relay_reset);
@@ -440,7 +444,7 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu)
440 struct dentry *dentry; 444 struct dentry *dentry;
441 445
442 if (chan->is_global) 446 if (chan->is_global)
443 return chan->buf[0]; 447 return *per_cpu_ptr(chan->buf, 0);
444 448
445 buf = relay_create_buf(chan); 449 buf = relay_create_buf(chan);
446 if (!buf) 450 if (!buf)
@@ -464,7 +468,7 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu)
464 __relay_reset(buf, 1); 468 __relay_reset(buf, 1);
465 469
466 if(chan->is_global) { 470 if(chan->is_global) {
467 chan->buf[0] = buf; 471 *per_cpu_ptr(chan->buf, 0) = buf;
468 buf->cpu = 0; 472 buf->cpu = 0;
469 } 473 }
470 474
@@ -486,7 +490,7 @@ free_buf:
486static void relay_close_buf(struct rchan_buf *buf) 490static void relay_close_buf(struct rchan_buf *buf)
487{ 491{
488 buf->finalized = 1; 492 buf->finalized = 1;
489 del_timer_sync(&buf->timer); 493 irq_work_sync(&buf->wakeup_work);
490 buf->chan->cb->remove_buf_file(buf->dentry); 494 buf->chan->cb->remove_buf_file(buf->dentry);
491 kref_put(&buf->kref, relay_remove_buf); 495 kref_put(&buf->kref, relay_remove_buf);
492} 496}
@@ -512,46 +516,25 @@ static void setup_callbacks(struct rchan *chan,
512 chan->cb = cb; 516 chan->cb = cb;
513} 517}
514 518
515/** 519int relay_prepare_cpu(unsigned int cpu)
516 * relay_hotcpu_callback - CPU hotplug callback
517 * @nb: notifier block
518 * @action: hotplug action to take
519 * @hcpu: CPU number
520 *
521 * Returns the success/failure of the operation. (%NOTIFY_OK, %NOTIFY_BAD)
522 */
523static int relay_hotcpu_callback(struct notifier_block *nb,
524 unsigned long action,
525 void *hcpu)
526{ 520{
527 unsigned int hotcpu = (unsigned long)hcpu;
528 struct rchan *chan; 521 struct rchan *chan;
522 struct rchan_buf *buf;
529 523
530 switch(action) { 524 mutex_lock(&relay_channels_mutex);
531 case CPU_UP_PREPARE: 525 list_for_each_entry(chan, &relay_channels, list) {
532 case CPU_UP_PREPARE_FROZEN: 526 if ((buf = *per_cpu_ptr(chan->buf, cpu)))
533 mutex_lock(&relay_channels_mutex); 527 continue;
534 list_for_each_entry(chan, &relay_channels, list) { 528 buf = relay_open_buf(chan, cpu);
535 if (chan->buf[hotcpu]) 529 if (!buf) {
536 continue; 530 pr_err("relay: cpu %d buffer creation failed\n", cpu);
537 chan->buf[hotcpu] = relay_open_buf(chan, hotcpu); 531 mutex_unlock(&relay_channels_mutex);
538 if(!chan->buf[hotcpu]) { 532 return -ENOMEM;
539 printk(KERN_ERR
540 "relay_hotcpu_callback: cpu %d buffer "
541 "creation failed\n", hotcpu);
542 mutex_unlock(&relay_channels_mutex);
543 return notifier_from_errno(-ENOMEM);
544 }
545 } 533 }
546 mutex_unlock(&relay_channels_mutex); 534 *per_cpu_ptr(chan->buf, cpu) = buf;
547 break;
548 case CPU_DEAD:
549 case CPU_DEAD_FROZEN:
550 /* No need to flush the cpu : will be flushed upon
551 * final relay_flush() call. */
552 break;
553 } 535 }
554 return NOTIFY_OK; 536 mutex_unlock(&relay_channels_mutex);
537 return 0;
555} 538}
556 539
557/** 540/**
@@ -583,6 +566,7 @@ struct rchan *relay_open(const char *base_filename,
583{ 566{
584 unsigned int i; 567 unsigned int i;
585 struct rchan *chan; 568 struct rchan *chan;
569 struct rchan_buf *buf;
586 570
587 if (!(subbuf_size && n_subbufs)) 571 if (!(subbuf_size && n_subbufs))
588 return NULL; 572 return NULL;
@@ -593,6 +577,7 @@ struct rchan *relay_open(const char *base_filename,
593 if (!chan) 577 if (!chan)
594 return NULL; 578 return NULL;
595 579
580 chan->buf = alloc_percpu(struct rchan_buf *);
596 chan->version = RELAYFS_CHANNEL_VERSION; 581 chan->version = RELAYFS_CHANNEL_VERSION;
597 chan->n_subbufs = n_subbufs; 582 chan->n_subbufs = n_subbufs;
598 chan->subbuf_size = subbuf_size; 583 chan->subbuf_size = subbuf_size;
@@ -608,9 +593,10 @@ struct rchan *relay_open(const char *base_filename,
608 593
609 mutex_lock(&relay_channels_mutex); 594 mutex_lock(&relay_channels_mutex);
610 for_each_online_cpu(i) { 595 for_each_online_cpu(i) {
611 chan->buf[i] = relay_open_buf(chan, i); 596 buf = relay_open_buf(chan, i);
612 if (!chan->buf[i]) 597 if (!buf)
613 goto free_bufs; 598 goto free_bufs;
599 *per_cpu_ptr(chan->buf, i) = buf;
614 } 600 }
615 list_add(&chan->list, &relay_channels); 601 list_add(&chan->list, &relay_channels);
616 mutex_unlock(&relay_channels_mutex); 602 mutex_unlock(&relay_channels_mutex);
@@ -619,8 +605,8 @@ struct rchan *relay_open(const char *base_filename,
619 605
620free_bufs: 606free_bufs:
621 for_each_possible_cpu(i) { 607 for_each_possible_cpu(i) {
622 if (chan->buf[i]) 608 if ((buf = *per_cpu_ptr(chan->buf, i)))
623 relay_close_buf(chan->buf[i]); 609 relay_close_buf(buf);
624 } 610 }
625 611
626 kref_put(&chan->kref, relay_destroy_channel); 612 kref_put(&chan->kref, relay_destroy_channel);
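The relay hunks replace the fixed chan->buf[NR_CPUS] pointer array with a true per-CPU pointer from alloc_percpu(), so every former chan->buf[cpu] access becomes *per_cpu_ptr(chan->buf, cpu). Reduced to its essentials (types invented):

#include <linux/percpu.h>
#include <linux/errno.h>

struct example_buf;                             /* stand-in for struct rchan_buf */

struct example_chan {
        struct example_buf * __percpu *buf;     /* one pointer per possible CPU */
};

static int example_chan_init(struct example_chan *chan)
{
        chan->buf = alloc_percpu(struct example_buf *);
        return chan->buf ? 0 : -ENOMEM;
}

static void example_chan_set(struct example_chan *chan, int cpu,
                             struct example_buf *buf)
{
        *per_cpu_ptr(chan->buf, cpu) = buf;     /* was: chan->buf[cpu] = buf */
}

static struct example_buf *example_chan_get(struct example_chan *chan, int cpu)
{
        return *per_cpu_ptr(chan->buf, cpu);    /* was: chan->buf[cpu] */
}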
@@ -666,6 +652,7 @@ int relay_late_setup_files(struct rchan *chan,
666 unsigned int i, curr_cpu; 652 unsigned int i, curr_cpu;
667 unsigned long flags; 653 unsigned long flags;
668 struct dentry *dentry; 654 struct dentry *dentry;
655 struct rchan_buf *buf;
669 struct rchan_percpu_buf_dispatcher disp; 656 struct rchan_percpu_buf_dispatcher disp;
670 657
671 if (!chan || !base_filename) 658 if (!chan || !base_filename)
@@ -684,10 +671,11 @@ int relay_late_setup_files(struct rchan *chan,
684 671
685 if (chan->is_global) { 672 if (chan->is_global) {
686 err = -EINVAL; 673 err = -EINVAL;
687 if (!WARN_ON_ONCE(!chan->buf[0])) { 674 buf = *per_cpu_ptr(chan->buf, 0);
688 dentry = relay_create_buf_file(chan, chan->buf[0], 0); 675 if (!WARN_ON_ONCE(!buf)) {
676 dentry = relay_create_buf_file(chan, buf, 0);
689 if (dentry && !WARN_ON_ONCE(!chan->is_global)) { 677 if (dentry && !WARN_ON_ONCE(!chan->is_global)) {
690 relay_set_buf_dentry(chan->buf[0], dentry); 678 relay_set_buf_dentry(buf, dentry);
691 err = 0; 679 err = 0;
692 } 680 }
693 } 681 }
@@ -702,13 +690,14 @@ int relay_late_setup_files(struct rchan *chan,
702 * on all currently online CPUs. 690 * on all currently online CPUs.
703 */ 691 */
704 for_each_online_cpu(i) { 692 for_each_online_cpu(i) {
705 if (unlikely(!chan->buf[i])) { 693 buf = *per_cpu_ptr(chan->buf, i);
694 if (unlikely(!buf)) {
706 WARN_ONCE(1, KERN_ERR "CPU has no buffer!\n"); 695 WARN_ONCE(1, KERN_ERR "CPU has no buffer!\n");
707 err = -EINVAL; 696 err = -EINVAL;
708 break; 697 break;
709 } 698 }
710 699
711 dentry = relay_create_buf_file(chan, chan->buf[i], i); 700 dentry = relay_create_buf_file(chan, buf, i);
712 if (unlikely(!dentry)) { 701 if (unlikely(!dentry)) {
713 err = -EINVAL; 702 err = -EINVAL;
714 break; 703 break;
@@ -716,10 +705,10 @@ int relay_late_setup_files(struct rchan *chan,
716 705
717 if (curr_cpu == i) { 706 if (curr_cpu == i) {
718 local_irq_save(flags); 707 local_irq_save(flags);
719 relay_set_buf_dentry(chan->buf[i], dentry); 708 relay_set_buf_dentry(buf, dentry);
720 local_irq_restore(flags); 709 local_irq_restore(flags);
721 } else { 710 } else {
722 disp.buf = chan->buf[i]; 711 disp.buf = buf;
723 disp.dentry = dentry; 712 disp.dentry = dentry;
724 smp_mb(); 713 smp_mb();
725 /* relay_channels_mutex must be held, so wait. */ 714 /* relay_channels_mutex must be held, so wait. */
@@ -768,14 +757,15 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
768 buf->early_bytes += buf->chan->subbuf_size - 757 buf->early_bytes += buf->chan->subbuf_size -
769 buf->padding[old_subbuf]; 758 buf->padding[old_subbuf];
770 smp_mb(); 759 smp_mb();
771 if (waitqueue_active(&buf->read_wait)) 760 if (waitqueue_active(&buf->read_wait)) {
772 /* 761 /*
773 * Calling wake_up_interruptible() from here 762 * Calling wake_up_interruptible() from here
774 * will deadlock if we happen to be logging 763 * will deadlock if we happen to be logging
775 * from the scheduler (trying to re-grab 764 * from the scheduler (trying to re-grab
776 * rq->lock), so defer it. 765 * rq->lock), so defer it.
777 */ 766 */
778 mod_timer(&buf->timer, jiffies + 1); 767 irq_work_queue(&buf->wakeup_work);
768 }
779 } 769 }
780 770
781 old = buf->data; 771 old = buf->data;
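The comment in the hunk above is the reason for the irq_work conversion: the logger may already hold rq->lock, so the wakeup must be deferred. Queueing an irq_work runs it from a self-interrupt as soon as interrupts allow, instead of waiting a jiffy for the old timer. The bare pattern, with invented names:

#include <linux/irq_work.h>
#include <linux/wait.h>

struct example_rbuf {
        wait_queue_head_t read_wait;
        struct irq_work wakeup_work;
};

static void example_wakeup(struct irq_work *work)
{
        struct example_rbuf *buf = container_of(work, struct example_rbuf,
                                                wakeup_work);

        wake_up_interruptible(&buf->read_wait);
}

static void example_rbuf_init(struct example_rbuf *buf)
{
        init_waitqueue_head(&buf->read_wait);
        init_irq_work(&buf->wakeup_work, example_wakeup);
}

/* Safe even where an immediate wake_up_interruptible() could deadlock. */
static void example_notify_readers(struct example_rbuf *buf)
{
        irq_work_queue(&buf->wakeup_work);
}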
@@ -822,11 +812,10 @@ void relay_subbufs_consumed(struct rchan *chan,
822 if (!chan) 812 if (!chan)
823 return; 813 return;
824 814
825 if (cpu >= NR_CPUS || !chan->buf[cpu] || 815 buf = *per_cpu_ptr(chan->buf, cpu);
826 subbufs_consumed > chan->n_subbufs) 816 if (cpu >= NR_CPUS || !buf || subbufs_consumed > chan->n_subbufs)
827 return; 817 return;
828 818
829 buf = chan->buf[cpu];
830 if (subbufs_consumed > buf->subbufs_produced - buf->subbufs_consumed) 819 if (subbufs_consumed > buf->subbufs_produced - buf->subbufs_consumed)
831 buf->subbufs_consumed = buf->subbufs_produced; 820 buf->subbufs_consumed = buf->subbufs_produced;
832 else 821 else
@@ -842,18 +831,19 @@ EXPORT_SYMBOL_GPL(relay_subbufs_consumed);
842 */ 831 */
843void relay_close(struct rchan *chan) 832void relay_close(struct rchan *chan)
844{ 833{
834 struct rchan_buf *buf;
845 unsigned int i; 835 unsigned int i;
846 836
847 if (!chan) 837 if (!chan)
848 return; 838 return;
849 839
850 mutex_lock(&relay_channels_mutex); 840 mutex_lock(&relay_channels_mutex);
851 if (chan->is_global && chan->buf[0]) 841 if (chan->is_global && (buf = *per_cpu_ptr(chan->buf, 0)))
852 relay_close_buf(chan->buf[0]); 842 relay_close_buf(buf);
853 else 843 else
854 for_each_possible_cpu(i) 844 for_each_possible_cpu(i)
855 if (chan->buf[i]) 845 if ((buf = *per_cpu_ptr(chan->buf, i)))
856 relay_close_buf(chan->buf[i]); 846 relay_close_buf(buf);
857 847
858 if (chan->last_toobig) 848 if (chan->last_toobig)
859 printk(KERN_WARNING "relay: one or more items not logged " 849 printk(KERN_WARNING "relay: one or more items not logged "
@@ -874,20 +864,21 @@ EXPORT_SYMBOL_GPL(relay_close);
874 */ 864 */
875void relay_flush(struct rchan *chan) 865void relay_flush(struct rchan *chan)
876{ 866{
867 struct rchan_buf *buf;
877 unsigned int i; 868 unsigned int i;
878 869
879 if (!chan) 870 if (!chan)
880 return; 871 return;
881 872
882 if (chan->is_global && chan->buf[0]) { 873 if (chan->is_global && (buf = *per_cpu_ptr(chan->buf, 0))) {
883 relay_switch_subbuf(chan->buf[0], 0); 874 relay_switch_subbuf(buf, 0);
884 return; 875 return;
885 } 876 }
886 877
887 mutex_lock(&relay_channels_mutex); 878 mutex_lock(&relay_channels_mutex);
888 for_each_possible_cpu(i) 879 for_each_possible_cpu(i)
889 if (chan->buf[i]) 880 if ((buf = *per_cpu_ptr(chan->buf, i)))
890 relay_switch_subbuf(chan->buf[i], 0); 881 relay_switch_subbuf(buf, 0);
891 mutex_unlock(&relay_channels_mutex); 882 mutex_unlock(&relay_channels_mutex);
892} 883}
893EXPORT_SYMBOL_GPL(relay_flush); 884EXPORT_SYMBOL_GPL(relay_flush);
@@ -1121,51 +1112,23 @@ static size_t relay_file_read_end_pos(struct rchan_buf *buf,
1121 return end_pos; 1112 return end_pos;
1122} 1113}
1123 1114
1124/* 1115static ssize_t relay_file_read(struct file *filp,
1125 * subbuf_read_actor - read up to one subbuf's worth of data 1116 char __user *buffer,
1126 */ 1117 size_t count,
1127static int subbuf_read_actor(size_t read_start, 1118 loff_t *ppos)
1128 struct rchan_buf *buf,
1129 size_t avail,
1130 read_descriptor_t *desc)
1131{
1132 void *from;
1133 int ret = 0;
1134
1135 from = buf->start + read_start;
1136 ret = avail;
1137 if (copy_to_user(desc->arg.buf, from, avail)) {
1138 desc->error = -EFAULT;
1139 ret = 0;
1140 }
1141 desc->arg.data += ret;
1142 desc->written += ret;
1143 desc->count -= ret;
1144
1145 return ret;
1146}
1147
1148typedef int (*subbuf_actor_t) (size_t read_start,
1149 struct rchan_buf *buf,
1150 size_t avail,
1151 read_descriptor_t *desc);
1152
1153/*
1154 * relay_file_read_subbufs - read count bytes, bridging subbuf boundaries
1155 */
1156static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos,
1157 subbuf_actor_t subbuf_actor,
1158 read_descriptor_t *desc)
1159{ 1119{
1160 struct rchan_buf *buf = filp->private_data; 1120 struct rchan_buf *buf = filp->private_data;
1161 size_t read_start, avail; 1121 size_t read_start, avail;
1122 size_t written = 0;
1162 int ret; 1123 int ret;
1163 1124
1164 if (!desc->count) 1125 if (!count)
1165 return 0; 1126 return 0;
1166 1127
1167 inode_lock(file_inode(filp)); 1128 inode_lock(file_inode(filp));
1168 do { 1129 do {
1130 void *from;
1131
1169 if (!relay_file_read_avail(buf, *ppos)) 1132 if (!relay_file_read_avail(buf, *ppos))
1170 break; 1133 break;
1171 1134
@@ -1174,32 +1137,22 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos,
1174 if (!avail) 1137 if (!avail)
1175 break; 1138 break;
1176 1139
1177 avail = min(desc->count, avail); 1140 avail = min(count, avail);
1178 ret = subbuf_actor(read_start, buf, avail, desc); 1141 from = buf->start + read_start;
1179 if (desc->error < 0) 1142 ret = avail;
1143 if (copy_to_user(buffer, from, avail))
1180 break; 1144 break;
1181 1145
1182 if (ret) { 1146 buffer += ret;
1183 relay_file_read_consume(buf, read_start, ret); 1147 written += ret;
1184 *ppos = relay_file_read_end_pos(buf, read_start, ret); 1148 count -= ret;
1185 }
1186 } while (desc->count && ret);
1187 inode_unlock(file_inode(filp));
1188 1149
1189 return desc->written; 1150 relay_file_read_consume(buf, read_start, ret);
1190} 1151 *ppos = relay_file_read_end_pos(buf, read_start, ret);
1152 } while (count);
1153 inode_unlock(file_inode(filp));
1191 1154
1192static ssize_t relay_file_read(struct file *filp, 1155 return written;
1193 char __user *buffer,
1194 size_t count,
1195 loff_t *ppos)
1196{
1197 read_descriptor_t desc;
1198 desc.written = 0;
1199 desc.count = count;
1200 desc.arg.buf = buffer;
1201 desc.error = 0;
1202 return relay_file_read_subbufs(filp, ppos, subbuf_read_actor, &desc);
1203} 1156}
1204 1157
1205static void relay_consume_bytes(struct rchan_buf *rbuf, int bytes_consumed) 1158static void relay_consume_bytes(struct rchan_buf *rbuf, int bytes_consumed)
@@ -1377,12 +1330,3 @@ const struct file_operations relay_file_operations = {
1377 .splice_read = relay_file_splice_read, 1330 .splice_read = relay_file_splice_read,
1378}; 1331};
1379EXPORT_SYMBOL_GPL(relay_file_operations); 1332EXPORT_SYMBOL_GPL(relay_file_operations);
1380
1381static __init int relay_init(void)
1382{
1383
1384 hotcpu_notifier(relay_hotcpu_callback, 0);
1385 return 0;
1386}
1387
1388early_initcall(relay_init);
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index a5d966cb8891..f1c8fd566246 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -111,10 +111,13 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg)
111{ 111{
112 if (tg != &root_task_group) 112 if (tg != &root_task_group)
113 return false; 113 return false;
114
115 /* 114 /*
116 * We can only assume the task group can't go away on us if 115 * If we race with autogroup_move_group() the caller can use the old
117 * autogroup_move_group() can see us on ->thread_group list. 116 * value of signal->autogroup but in this case sched_move_task() will
117 * be called again before autogroup_kref_put().
118 *
119 * However, there is no way sched_autogroup_exit_task() could tell us
120 * to avoid autogroup->tg, so we abuse PF_EXITING flag for this case.
118 */ 121 */
119 if (p->flags & PF_EXITING) 122 if (p->flags & PF_EXITING)
120 return false; 123 return false;
@@ -122,6 +125,16 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg)
122 return true; 125 return true;
123} 126}
124 127
128void sched_autogroup_exit_task(struct task_struct *p)
129{
130 /*
131 * We are going to call exit_notify() and autogroup_move_group() can't
132 * see this thread after that: we can no longer use signal->autogroup.
133 * See the PF_EXITING check in task_wants_autogroup().
134 */
135 sched_move_task(p);
136}
137
125static void 138static void
126autogroup_move_group(struct task_struct *p, struct autogroup *ag) 139autogroup_move_group(struct task_struct *p, struct autogroup *ag)
127{ 140{
@@ -138,13 +151,20 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
138 } 151 }
139 152
140 p->signal->autogroup = autogroup_kref_get(ag); 153 p->signal->autogroup = autogroup_kref_get(ag);
141 154 /*
142 if (!READ_ONCE(sysctl_sched_autogroup_enabled)) 155 * We can't avoid sched_move_task() after we changed signal->autogroup,
143 goto out; 156 * this process can already run with task_group() == prev->tg or we can
144 157 * race with cgroup code which can read autogroup = prev under rq->lock.
158 * In the latter case for_each_thread() can not miss a migrating thread,
159 * cpu_cgroup_attach() must not be possible after cgroup_exit() and it
160 * can't be removed from thread list, we hold ->siglock.
161 *
162 * If an exiting thread was already removed from thread list we rely on
163 * sched_autogroup_exit_task().
164 */
145 for_each_thread(p, t) 165 for_each_thread(p, t)
146 sched_move_task(t); 166 sched_move_task(t);
147out: 167
148 unlock_task_sighand(p, &flags); 168 unlock_task_sighand(p, &flags);
149 autogroup_kref_put(prev); 169 autogroup_kref_put(prev);
150} 170}
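sched_autogroup_exit_task() above only helps if the exit path calls it before exit_notify() removes the thread from the thread list, which is exactly what its comment relies on. The real call site is in kernel/exit.c and not shown here; the fragment below is only meant to convey the ordering:

/* Hypothetical fragment of the exit path; the ordering is the point. */
static void example_before_exit_notify(struct task_struct *tsk)
{
        /*
         * Move the task back to its own task group while
         * autogroup_move_group() can still find it on the thread list.
         */
        sched_autogroup_exit_task(tsk);
        /* exit_notify(tsk, group_dead) follows in the real exit path. */
}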
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 44817c640e99..154fd689fe02 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -581,6 +581,8 @@ static bool wake_up_full_nohz_cpu(int cpu)
581 * If needed we can still optimize that later with an 581 * If needed we can still optimize that later with an
582 * empty IRQ. 582 * empty IRQ.
583 */ 583 */
584 if (cpu_is_offline(cpu))
585 return true; /* Don't try to wake offline CPUs. */
584 if (tick_nohz_full_cpu(cpu)) { 586 if (tick_nohz_full_cpu(cpu)) {
585 if (cpu != smp_processor_id() || 587 if (cpu != smp_processor_id() ||
586 tick_nohz_tick_stopped()) 588 tick_nohz_tick_stopped())
@@ -591,6 +593,11 @@ static bool wake_up_full_nohz_cpu(int cpu)
591 return false; 593 return false;
592} 594}
593 595
596/*
597 * Wake up the specified CPU. If the CPU is going offline, it is the
598 * caller's responsibility to deal with the lost wakeup, for example,
599 * by hooking into the CPU_DEAD notifier like timers and hrtimers do.
600 */
594void wake_up_nohz_cpu(int cpu) 601void wake_up_nohz_cpu(int cpu)
595{ 602{
596 if (!wake_up_full_nohz_cpu(cpu)) 603 if (!wake_up_full_nohz_cpu(cpu))
@@ -1063,8 +1070,12 @@ static int migration_cpu_stop(void *data)
1063 * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because 1070 * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
1064 * we're holding p->pi_lock. 1071 * we're holding p->pi_lock.
1065 */ 1072 */
1066 if (task_rq(p) == rq && task_on_rq_queued(p)) 1073 if (task_rq(p) == rq) {
1067 rq = __migrate_task(rq, p, arg->dest_cpu); 1074 if (task_on_rq_queued(p))
1075 rq = __migrate_task(rq, p, arg->dest_cpu);
1076 else
1077 p->wake_cpu = arg->dest_cpu;
1078 }
1068 raw_spin_unlock(&rq->lock); 1079 raw_spin_unlock(&rq->lock);
1069 raw_spin_unlock(&p->pi_lock); 1080 raw_spin_unlock(&p->pi_lock);
1070 1081
@@ -1105,10 +1116,10 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
1105 1116
1106 p->sched_class->set_cpus_allowed(p, new_mask); 1117 p->sched_class->set_cpus_allowed(p, new_mask);
1107 1118
1108 if (running)
1109 p->sched_class->set_curr_task(rq);
1110 if (queued) 1119 if (queued)
1111 enqueue_task(rq, p, ENQUEUE_RESTORE); 1120 enqueue_task(rq, p, ENQUEUE_RESTORE);
1121 if (running)
1122 set_curr_task(rq, p);
1112} 1123}
1113 1124
1114/* 1125/*
@@ -1265,7 +1276,7 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)
1265 /* 1276 /*
1266 * Task isn't running anymore; make it appear like we migrated 1277 * Task isn't running anymore; make it appear like we migrated
1267 * it before it went to sleep. This means on wakeup we make the 1278 * it before it went to sleep. This means on wakeup we make the
1268 * previous cpu our targer instead of where it really is. 1279 * previous cpu our target instead of where it really is.
1269 */ 1280 */
1270 p->wake_cpu = cpu; 1281 p->wake_cpu = cpu;
1271 } 1282 }
@@ -1629,23 +1640,25 @@ static inline int __set_cpus_allowed_ptr(struct task_struct *p,
1629static void 1640static void
1630ttwu_stat(struct task_struct *p, int cpu, int wake_flags) 1641ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
1631{ 1642{
1632#ifdef CONFIG_SCHEDSTATS 1643 struct rq *rq;
1633 struct rq *rq = this_rq();
1634 1644
1635#ifdef CONFIG_SMP 1645 if (!schedstat_enabled())
1636 int this_cpu = smp_processor_id(); 1646 return;
1637 1647
1638 if (cpu == this_cpu) { 1648 rq = this_rq();
1639 schedstat_inc(rq, ttwu_local); 1649
1640 schedstat_inc(p, se.statistics.nr_wakeups_local); 1650#ifdef CONFIG_SMP
1651 if (cpu == rq->cpu) {
1652 schedstat_inc(rq->ttwu_local);
1653 schedstat_inc(p->se.statistics.nr_wakeups_local);
1641 } else { 1654 } else {
1642 struct sched_domain *sd; 1655 struct sched_domain *sd;
1643 1656
1644 schedstat_inc(p, se.statistics.nr_wakeups_remote); 1657 schedstat_inc(p->se.statistics.nr_wakeups_remote);
1645 rcu_read_lock(); 1658 rcu_read_lock();
1646 for_each_domain(this_cpu, sd) { 1659 for_each_domain(rq->cpu, sd) {
1647 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { 1660 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
1648 schedstat_inc(sd, ttwu_wake_remote); 1661 schedstat_inc(sd->ttwu_wake_remote);
1649 break; 1662 break;
1650 } 1663 }
1651 } 1664 }
@@ -1653,17 +1666,14 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
1653 } 1666 }
1654 1667
1655 if (wake_flags & WF_MIGRATED) 1668 if (wake_flags & WF_MIGRATED)
1656 schedstat_inc(p, se.statistics.nr_wakeups_migrate); 1669 schedstat_inc(p->se.statistics.nr_wakeups_migrate);
1657
1658#endif /* CONFIG_SMP */ 1670#endif /* CONFIG_SMP */
1659 1671
1660 schedstat_inc(rq, ttwu_count); 1672 schedstat_inc(rq->ttwu_count);
1661 schedstat_inc(p, se.statistics.nr_wakeups); 1673 schedstat_inc(p->se.statistics.nr_wakeups);
1662 1674
1663 if (wake_flags & WF_SYNC) 1675 if (wake_flags & WF_SYNC)
1664 schedstat_inc(p, se.statistics.nr_wakeups_sync); 1676 schedstat_inc(p->se.statistics.nr_wakeups_sync);
1665
1666#endif /* CONFIG_SCHEDSTATS */
1667} 1677}
1668 1678
1669static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) 1679static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
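The rewritten ttwu_stat() presumes the reworked schedstat helpers: schedstat_inc() now takes the counter expression itself and compiles to a no-op unless schedstats are enabled, which is why try_to_wake_up() can call ttwu_stat() unconditionally further down. A sketch of the assumed helper shape (the real definitions live in kernel/sched/stats.h and may differ in detail):

#ifdef CONFIG_SCHEDSTATS
#define schedstat_enabled()     static_branch_unlikely(&sched_schedstats)
#define schedstat_inc(var)      do { if (schedstat_enabled()) { var++; } } while (0)
#define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0)
#else
#define schedstat_enabled()     0
#define schedstat_inc(var)      do { } while (0)
#define schedstat_set(var, val) do { } while (0)
#endif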
@@ -2084,8 +2094,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
2084 2094
2085 ttwu_queue(p, cpu, wake_flags); 2095 ttwu_queue(p, cpu, wake_flags);
2086stat: 2096stat:
2087 if (schedstat_enabled()) 2097 ttwu_stat(p, cpu, wake_flags);
2088 ttwu_stat(p, cpu, wake_flags);
2089out: 2098out:
2090 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 2099 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2091 2100
@@ -2095,6 +2104,7 @@ out:
2095/** 2104/**
2096 * try_to_wake_up_local - try to wake up a local task with rq lock held 2105 * try_to_wake_up_local - try to wake up a local task with rq lock held
2097 * @p: the thread to be awakened 2106 * @p: the thread to be awakened
2107 * @cookie: context's cookie for pinning
2098 * 2108 *
2099 * Put @p on the run-queue if it's not already there. The caller must 2109 * Put @p on the run-queue if it's not already there. The caller must
2100 * ensure that this_rq() is locked, @p is bound to this_rq() and not 2110 * ensure that this_rq() is locked, @p is bound to this_rq() and not
@@ -2133,8 +2143,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie
2133 ttwu_activate(rq, p, ENQUEUE_WAKEUP); 2143 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
2134 2144
2135 ttwu_do_wakeup(rq, p, 0, cookie); 2145 ttwu_do_wakeup(rq, p, 0, cookie);
2136 if (schedstat_enabled()) 2146 ttwu_stat(p, smp_processor_id(), 0);
2137 ttwu_stat(p, smp_processor_id(), 0);
2138out: 2147out:
2139 raw_spin_unlock(&p->pi_lock); 2148 raw_spin_unlock(&p->pi_lock);
2140} 2149}
@@ -2772,6 +2781,10 @@ static struct rq *finish_task_switch(struct task_struct *prev)
2772 * task and put them back on the free list. 2781 * task and put them back on the free list.
2773 */ 2782 */
2774 kprobe_flush_task(prev); 2783 kprobe_flush_task(prev);
2784
2785 /* Task is done with its stack. */
2786 put_task_stack(prev);
2787
2775 put_task_struct(prev); 2788 put_task_struct(prev);
2776 } 2789 }
2777 2790
@@ -3192,6 +3205,9 @@ static inline void preempt_latency_stop(int val) { }
3192 */ 3205 */
3193static noinline void __schedule_bug(struct task_struct *prev) 3206static noinline void __schedule_bug(struct task_struct *prev)
3194{ 3207{
3208 /* Save this before calling printk(), since that will clobber it */
3209 unsigned long preempt_disable_ip = get_preempt_disable_ip(current);
3210
3195 if (oops_in_progress) 3211 if (oops_in_progress)
3196 return; 3212 return;
3197 3213
@@ -3202,13 +3218,12 @@ static noinline void __schedule_bug(struct task_struct *prev)
3202 print_modules(); 3218 print_modules();
3203 if (irqs_disabled()) 3219 if (irqs_disabled())
3204 print_irqtrace_events(prev); 3220 print_irqtrace_events(prev);
3205#ifdef CONFIG_DEBUG_PREEMPT 3221 if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
3206 if (in_atomic_preempt_off()) { 3222 && in_atomic_preempt_off()) {
3207 pr_err("Preemption disabled at:"); 3223 pr_err("Preemption disabled at:");
3208 print_ip_sym(current->preempt_disable_ip); 3224 print_ip_sym(preempt_disable_ip);
3209 pr_cont("\n"); 3225 pr_cont("\n");
3210 } 3226 }
3211#endif
3212 if (panic_on_warn) 3227 if (panic_on_warn)
3213 panic("scheduling while atomic\n"); 3228 panic("scheduling while atomic\n");
3214 3229
@@ -3234,7 +3249,7 @@ static inline void schedule_debug(struct task_struct *prev)
3234 3249
3235 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 3250 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3236 3251
3237 schedstat_inc(this_rq(), sched_count); 3252 schedstat_inc(this_rq()->sched_count);
3238} 3253}
3239 3254
3240/* 3255/*
@@ -3327,17 +3342,6 @@ static void __sched notrace __schedule(bool preempt)
3327 rq = cpu_rq(cpu); 3342 rq = cpu_rq(cpu);
3328 prev = rq->curr; 3343 prev = rq->curr;
3329 3344
3330 /*
3331 * do_exit() calls schedule() with preemption disabled as an exception;
3332 * however we must fix that up, otherwise the next task will see an
3333 * inconsistent (higher) preempt count.
3334 *
3335 * It also avoids the below schedule_debug() test from complaining
3336 * about this.
3337 */
3338 if (unlikely(prev->state == TASK_DEAD))
3339 preempt_enable_no_resched_notrace();
3340
3341 schedule_debug(prev); 3345 schedule_debug(prev);
3342 3346
3343 if (sched_feat(HRTICK)) 3347 if (sched_feat(HRTICK))
@@ -3403,7 +3407,33 @@ static void __sched notrace __schedule(bool preempt)
3403 3407
3404 balance_callback(rq); 3408 balance_callback(rq);
3405} 3409}
3406STACK_FRAME_NON_STANDARD(__schedule); /* switch_to() */ 3410
3411void __noreturn do_task_dead(void)
3412{
3413 /*
3414 * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
3415 * when the following two conditions become true.
3416 * - There is race condition of mmap_sem (It is acquired by
3417 * exit_mm()), and
 3418	 *   - SMI occurs before setting TASK_RUNNING.
3419 * (or hypervisor of virtual machine switches to other guest)
3420 * As a result, we may become TASK_RUNNING after becoming TASK_DEAD
3421 *
3422 * To avoid it, we have to wait for releasing tsk->pi_lock which
3423 * is held by try_to_wake_up()
3424 */
3425 smp_mb();
3426 raw_spin_unlock_wait(&current->pi_lock);
3427
3428 /* causes final put_task_struct in finish_task_switch(). */
3429 __set_current_state(TASK_DEAD);
3430 current->flags |= PF_NOFREEZE; /* tell freezer to ignore us */
3431 __schedule(false);
3432 BUG();
3433 /* Avoid "noreturn function does return". */
3434 for (;;)
3435 cpu_relax(); /* For when BUG is null */
3436}
3407 3437
3408static inline void sched_submit_work(struct task_struct *tsk) 3438static inline void sched_submit_work(struct task_struct *tsk)
3409{ 3439{
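do_task_dead() is the new tail of the exit path: instead of __schedule() special-casing dead tasks (the preempt-count fixup deleted in the previous hunk), the dying task waits out any concurrent try_to_wake_up(), marks itself TASK_DEAD and schedules away for the last time. The final teardown then runs on the next task; a sketch of the consuming branch in finish_task_switch(), which is also why the hunk at the top of this file pairs put_task_stack(prev) with put_task_struct(prev) (sketch, not quoted from the patch):

    if (prev_state == TASK_DEAD) {
            /* prev can never run again: drop its stack, then its last reference */
            put_task_stack(prev);
            put_task_struct(prev);
    }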
@@ -3687,10 +3717,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
3687 3717
3688 p->prio = prio; 3718 p->prio = prio;
3689 3719
3690 if (running)
3691 p->sched_class->set_curr_task(rq);
3692 if (queued) 3720 if (queued)
3693 enqueue_task(rq, p, queue_flag); 3721 enqueue_task(rq, p, queue_flag);
3722 if (running)
3723 set_curr_task(rq, p);
3694 3724
3695 check_class_changed(rq, p, prev_class, oldprio); 3725 check_class_changed(rq, p, prev_class, oldprio);
3696out_unlock: 3726out_unlock:
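This is the first occurrence of a reordering repeated throughout the file (set_user_nice(), __sched_setscheduler(), sched_setnuma(), sched_move_task()): a running task is now restored via set_curr_task() only after it has been enqueued again. The recurring shape, with set_curr_task(rq, p) assumed to be a thin wrapper around p->sched_class->set_curr_task(rq) introduced alongside this diff:

    queued  = task_on_rq_queued(p);
    running = task_current(rq, p);

    if (queued)
            dequeue_task(rq, p, DEQUEUE_SAVE);
    if (running)
            put_prev_task(rq, p);

    /* ... change prio / nice / group / preferred node ... */

    if (queued)
            enqueue_task(rq, p, ENQUEUE_RESTORE);
    if (running)
            set_curr_task(rq, p);   /* assumed wrapper: p->sched_class->set_curr_task(rq) */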
@@ -3704,7 +3734,8 @@ out_unlock:
3704 3734
3705void set_user_nice(struct task_struct *p, long nice) 3735void set_user_nice(struct task_struct *p, long nice)
3706{ 3736{
3707 int old_prio, delta, queued; 3737 bool queued, running;
3738 int old_prio, delta;
3708 struct rq_flags rf; 3739 struct rq_flags rf;
3709 struct rq *rq; 3740 struct rq *rq;
3710 3741
@@ -3726,8 +3757,11 @@ void set_user_nice(struct task_struct *p, long nice)
3726 goto out_unlock; 3757 goto out_unlock;
3727 } 3758 }
3728 queued = task_on_rq_queued(p); 3759 queued = task_on_rq_queued(p);
3760 running = task_current(rq, p);
3729 if (queued) 3761 if (queued)
3730 dequeue_task(rq, p, DEQUEUE_SAVE); 3762 dequeue_task(rq, p, DEQUEUE_SAVE);
3763 if (running)
3764 put_prev_task(rq, p);
3731 3765
3732 p->static_prio = NICE_TO_PRIO(nice); 3766 p->static_prio = NICE_TO_PRIO(nice);
3733 set_load_weight(p); 3767 set_load_weight(p);
@@ -3744,6 +3778,8 @@ void set_user_nice(struct task_struct *p, long nice)
3744 if (delta < 0 || (delta > 0 && task_running(rq, p))) 3778 if (delta < 0 || (delta > 0 && task_running(rq, p)))
3745 resched_curr(rq); 3779 resched_curr(rq);
3746 } 3780 }
3781 if (running)
3782 set_curr_task(rq, p);
3747out_unlock: 3783out_unlock:
3748 task_rq_unlock(rq, p, &rf); 3784 task_rq_unlock(rq, p, &rf);
3749} 3785}
@@ -4243,8 +4279,6 @@ change:
4243 prev_class = p->sched_class; 4279 prev_class = p->sched_class;
4244 __setscheduler(rq, p, attr, pi); 4280 __setscheduler(rq, p, attr, pi);
4245 4281
4246 if (running)
4247 p->sched_class->set_curr_task(rq);
4248 if (queued) { 4282 if (queued) {
4249 /* 4283 /*
4250 * We enqueue to tail when the priority of a task is 4284 * We enqueue to tail when the priority of a task is
@@ -4255,6 +4289,8 @@ change:
4255 4289
4256 enqueue_task(rq, p, queue_flags); 4290 enqueue_task(rq, p, queue_flags);
4257 } 4291 }
4292 if (running)
4293 set_curr_task(rq, p);
4258 4294
4259 check_class_changed(rq, p, prev_class, oldprio); 4295 check_class_changed(rq, p, prev_class, oldprio);
4260 preempt_disable(); /* avoid rq from going away on us */ 4296 preempt_disable(); /* avoid rq from going away on us */
@@ -4846,7 +4882,7 @@ SYSCALL_DEFINE0(sched_yield)
4846{ 4882{
4847 struct rq *rq = this_rq_lock(); 4883 struct rq *rq = this_rq_lock();
4848 4884
4849 schedstat_inc(rq, yld_count); 4885 schedstat_inc(rq->yld_count);
4850 current->sched_class->yield_task(rq); 4886 current->sched_class->yield_task(rq);
4851 4887
4852 /* 4888 /*
@@ -4863,6 +4899,7 @@ SYSCALL_DEFINE0(sched_yield)
4863 return 0; 4899 return 0;
4864} 4900}
4865 4901
4902#ifndef CONFIG_PREEMPT
4866int __sched _cond_resched(void) 4903int __sched _cond_resched(void)
4867{ 4904{
4868 if (should_resched(0)) { 4905 if (should_resched(0)) {
@@ -4872,6 +4909,7 @@ int __sched _cond_resched(void)
4872 return 0; 4909 return 0;
4873} 4910}
4874EXPORT_SYMBOL(_cond_resched); 4911EXPORT_SYMBOL(_cond_resched);
4912#endif
4875 4913
4876/* 4914/*
4877 * __cond_resched_lock() - if a reschedule is pending, drop the given lock, 4915 * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
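_cond_resched() is now only built for !CONFIG_PREEMPT kernels, since a fully preemptible kernel never needs explicit reschedule points. The header side presumably pairs with this roughly as follows (assumption about include/linux/sched.h, not shown in this diff):

    #ifndef CONFIG_PREEMPT
    extern int _cond_resched(void);
    #else
    static inline int _cond_resched(void) { return 0; }  /* no-op when fully preemptible */
    #endif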
@@ -4997,7 +5035,7 @@ again:
4997 5035
4998 yielded = curr->sched_class->yield_to_task(rq, p, preempt); 5036 yielded = curr->sched_class->yield_to_task(rq, p, preempt);
4999 if (yielded) { 5037 if (yielded) {
5000 schedstat_inc(rq, yld_count); 5038 schedstat_inc(rq->yld_count);
5001 /* 5039 /*
5002 * Make p's CPU reschedule; pick_next_entity takes care of 5040 * Make p's CPU reschedule; pick_next_entity takes care of
5003 * fairness. 5041 * fairness.
@@ -5154,21 +5192,14 @@ void sched_show_task(struct task_struct *p)
5154 int ppid; 5192 int ppid;
5155 unsigned long state = p->state; 5193 unsigned long state = p->state;
5156 5194
5195 if (!try_get_task_stack(p))
5196 return;
5157 if (state) 5197 if (state)
5158 state = __ffs(state) + 1; 5198 state = __ffs(state) + 1;
5159 printk(KERN_INFO "%-15.15s %c", p->comm, 5199 printk(KERN_INFO "%-15.15s %c", p->comm,
5160 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); 5200 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
5161#if BITS_PER_LONG == 32
5162 if (state == TASK_RUNNING)
5163 printk(KERN_CONT " running ");
5164 else
5165 printk(KERN_CONT " %08lx ", thread_saved_pc(p));
5166#else
5167 if (state == TASK_RUNNING) 5201 if (state == TASK_RUNNING)
5168 printk(KERN_CONT " running task "); 5202 printk(KERN_CONT " running task ");
5169 else
5170 printk(KERN_CONT " %016lx ", thread_saved_pc(p));
5171#endif
5172#ifdef CONFIG_DEBUG_STACK_USAGE 5203#ifdef CONFIG_DEBUG_STACK_USAGE
5173 free = stack_not_used(p); 5204 free = stack_not_used(p);
5174#endif 5205#endif
@@ -5183,6 +5214,7 @@ void sched_show_task(struct task_struct *p)
5183 5214
5184 print_worker_info(KERN_INFO, p); 5215 print_worker_info(KERN_INFO, p);
5185 show_stack(p, NULL); 5216 show_stack(p, NULL);
5217 put_task_stack(p);
5186} 5218}
5187 5219
5188void show_state_filter(unsigned long state_filter) 5220void show_state_filter(unsigned long state_filter)
@@ -5417,10 +5449,10 @@ void sched_setnuma(struct task_struct *p, int nid)
5417 5449
5418 p->numa_preferred_nid = nid; 5450 p->numa_preferred_nid = nid;
5419 5451
5420 if (running)
5421 p->sched_class->set_curr_task(rq);
5422 if (queued) 5452 if (queued)
5423 enqueue_task(rq, p, ENQUEUE_RESTORE); 5453 enqueue_task(rq, p, ENQUEUE_RESTORE);
5454 if (running)
5455 set_curr_task(rq, p);
5424 task_rq_unlock(rq, p, &rf); 5456 task_rq_unlock(rq, p, &rf);
5425} 5457}
5426#endif /* CONFIG_NUMA_BALANCING */ 5458#endif /* CONFIG_NUMA_BALANCING */
@@ -5717,6 +5749,8 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
5717 } 5749 }
5718} 5750}
5719#else /* !CONFIG_SCHED_DEBUG */ 5751#else /* !CONFIG_SCHED_DEBUG */
5752
5753# define sched_debug_enabled 0
5720# define sched_domain_debug(sd, cpu) do { } while (0) 5754# define sched_domain_debug(sd, cpu) do { } while (0)
5721static inline bool sched_debug(void) 5755static inline bool sched_debug(void)
5722{ 5756{
@@ -5735,6 +5769,7 @@ static int sd_degenerate(struct sched_domain *sd)
5735 SD_BALANCE_FORK | 5769 SD_BALANCE_FORK |
5736 SD_BALANCE_EXEC | 5770 SD_BALANCE_EXEC |
5737 SD_SHARE_CPUCAPACITY | 5771 SD_SHARE_CPUCAPACITY |
5772 SD_ASYM_CPUCAPACITY |
5738 SD_SHARE_PKG_RESOURCES | 5773 SD_SHARE_PKG_RESOURCES |
5739 SD_SHARE_POWERDOMAIN)) { 5774 SD_SHARE_POWERDOMAIN)) {
5740 if (sd->groups != sd->groups->next) 5775 if (sd->groups != sd->groups->next)
@@ -5765,6 +5800,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
5765 SD_BALANCE_NEWIDLE | 5800 SD_BALANCE_NEWIDLE |
5766 SD_BALANCE_FORK | 5801 SD_BALANCE_FORK |
5767 SD_BALANCE_EXEC | 5802 SD_BALANCE_EXEC |
5803 SD_ASYM_CPUCAPACITY |
5768 SD_SHARE_CPUCAPACITY | 5804 SD_SHARE_CPUCAPACITY |
5769 SD_SHARE_PKG_RESOURCES | 5805 SD_SHARE_PKG_RESOURCES |
5770 SD_PREFER_SIBLING | 5806 SD_PREFER_SIBLING |
@@ -5909,10 +5945,8 @@ static void free_sched_groups(struct sched_group *sg, int free_sgc)
5909 } while (sg != first); 5945 } while (sg != first);
5910} 5946}
5911 5947
5912static void free_sched_domain(struct rcu_head *rcu) 5948static void destroy_sched_domain(struct sched_domain *sd)
5913{ 5949{
5914 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
5915
5916 /* 5950 /*
5917 * If its an overlapping domain it has private groups, iterate and 5951 * If its an overlapping domain it has private groups, iterate and
5918 * nuke them all. 5952 * nuke them all.
@@ -5923,18 +5957,26 @@ static void free_sched_domain(struct rcu_head *rcu)
5923 kfree(sd->groups->sgc); 5957 kfree(sd->groups->sgc);
5924 kfree(sd->groups); 5958 kfree(sd->groups);
5925 } 5959 }
5960 if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
5961 kfree(sd->shared);
5926 kfree(sd); 5962 kfree(sd);
5927} 5963}
5928 5964
5929static void destroy_sched_domain(struct sched_domain *sd, int cpu) 5965static void destroy_sched_domains_rcu(struct rcu_head *rcu)
5930{ 5966{
5931 call_rcu(&sd->rcu, free_sched_domain); 5967 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
5968
5969 while (sd) {
5970 struct sched_domain *parent = sd->parent;
5971 destroy_sched_domain(sd);
5972 sd = parent;
5973 }
5932} 5974}
5933 5975
5934static void destroy_sched_domains(struct sched_domain *sd, int cpu) 5976static void destroy_sched_domains(struct sched_domain *sd)
5935{ 5977{
5936 for (; sd; sd = sd->parent) 5978 if (sd)
5937 destroy_sched_domain(sd, cpu); 5979 call_rcu(&sd->rcu, destroy_sched_domains_rcu);
5938} 5980}
5939 5981
5940/* 5982/*
@@ -5949,14 +5991,14 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
5949DEFINE_PER_CPU(struct sched_domain *, sd_llc); 5991DEFINE_PER_CPU(struct sched_domain *, sd_llc);
5950DEFINE_PER_CPU(int, sd_llc_size); 5992DEFINE_PER_CPU(int, sd_llc_size);
5951DEFINE_PER_CPU(int, sd_llc_id); 5993DEFINE_PER_CPU(int, sd_llc_id);
5994DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
5952DEFINE_PER_CPU(struct sched_domain *, sd_numa); 5995DEFINE_PER_CPU(struct sched_domain *, sd_numa);
5953DEFINE_PER_CPU(struct sched_domain *, sd_busy);
5954DEFINE_PER_CPU(struct sched_domain *, sd_asym); 5996DEFINE_PER_CPU(struct sched_domain *, sd_asym);
5955 5997
5956static void update_top_cache_domain(int cpu) 5998static void update_top_cache_domain(int cpu)
5957{ 5999{
6000 struct sched_domain_shared *sds = NULL;
5958 struct sched_domain *sd; 6001 struct sched_domain *sd;
5959 struct sched_domain *busy_sd = NULL;
5960 int id = cpu; 6002 int id = cpu;
5961 int size = 1; 6003 int size = 1;
5962 6004
@@ -5964,13 +6006,13 @@ static void update_top_cache_domain(int cpu)
5964 if (sd) { 6006 if (sd) {
5965 id = cpumask_first(sched_domain_span(sd)); 6007 id = cpumask_first(sched_domain_span(sd));
5966 size = cpumask_weight(sched_domain_span(sd)); 6008 size = cpumask_weight(sched_domain_span(sd));
5967 busy_sd = sd->parent; /* sd_busy */ 6009 sds = sd->shared;
5968 } 6010 }
5969 rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd);
5970 6011
5971 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); 6012 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
5972 per_cpu(sd_llc_size, cpu) = size; 6013 per_cpu(sd_llc_size, cpu) = size;
5973 per_cpu(sd_llc_id, cpu) = id; 6014 per_cpu(sd_llc_id, cpu) = id;
6015 rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
5974 6016
5975 sd = lowest_flag_domain(cpu, SD_NUMA); 6017 sd = lowest_flag_domain(cpu, SD_NUMA);
5976 rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); 6018 rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
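sd_busy and the old per-group busy counter give way to a sched_domain_shared object published through the new sd_llc_shared per-cpu pointer, so all CPUs under one last-level cache reference a single shared structure. Its assumed layout, inferred from the uses in this diff (reference counting in sd_init() and destroy_sched_domain(), nr_busy_cpus initialization moved into sd_init()); the struct itself is declared outside this file:

    struct sched_domain_shared {
            atomic_t        ref;            /* how many sched_domains point at this instance */
            atomic_t        nr_busy_cpus;   /* replaces sched_group_capacity::nr_busy_cpus   */
    };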
@@ -6006,7 +6048,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6006 */ 6048 */
6007 if (parent->flags & SD_PREFER_SIBLING) 6049 if (parent->flags & SD_PREFER_SIBLING)
6008 tmp->flags |= SD_PREFER_SIBLING; 6050 tmp->flags |= SD_PREFER_SIBLING;
6009 destroy_sched_domain(parent, cpu); 6051 destroy_sched_domain(parent);
6010 } else 6052 } else
6011 tmp = tmp->parent; 6053 tmp = tmp->parent;
6012 } 6054 }
@@ -6014,7 +6056,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6014 if (sd && sd_degenerate(sd)) { 6056 if (sd && sd_degenerate(sd)) {
6015 tmp = sd; 6057 tmp = sd;
6016 sd = sd->parent; 6058 sd = sd->parent;
6017 destroy_sched_domain(tmp, cpu); 6059 destroy_sched_domain(tmp);
6018 if (sd) 6060 if (sd)
6019 sd->child = NULL; 6061 sd->child = NULL;
6020 } 6062 }
@@ -6024,7 +6066,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6024 rq_attach_root(rq, rd); 6066 rq_attach_root(rq, rd);
6025 tmp = rq->sd; 6067 tmp = rq->sd;
6026 rcu_assign_pointer(rq->sd, sd); 6068 rcu_assign_pointer(rq->sd, sd);
6027 destroy_sched_domains(tmp, cpu); 6069 destroy_sched_domains(tmp);
6028 6070
6029 update_top_cache_domain(cpu); 6071 update_top_cache_domain(cpu);
6030} 6072}
@@ -6267,7 +6309,6 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
6267 return; 6309 return;
6268 6310
6269 update_group_capacity(sd, cpu); 6311 update_group_capacity(sd, cpu);
6270 atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight);
6271} 6312}
6272 6313
6273/* 6314/*
@@ -6355,6 +6396,9 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
6355 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); 6396 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
6356 *per_cpu_ptr(sdd->sd, cpu) = NULL; 6397 *per_cpu_ptr(sdd->sd, cpu) = NULL;
6357 6398
6399 if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref))
6400 *per_cpu_ptr(sdd->sds, cpu) = NULL;
6401
6358 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) 6402 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
6359 *per_cpu_ptr(sdd->sg, cpu) = NULL; 6403 *per_cpu_ptr(sdd->sg, cpu) = NULL;
6360 6404
@@ -6374,26 +6418,37 @@ static int sched_domains_curr_level;
6374/* 6418/*
6375 * SD_flags allowed in topology descriptions. 6419 * SD_flags allowed in topology descriptions.
6376 * 6420 *
6377 * SD_SHARE_CPUCAPACITY - describes SMT topologies 6421 * These flags are purely descriptive of the topology and do not prescribe
6378 * SD_SHARE_PKG_RESOURCES - describes shared caches 6422 * behaviour. Behaviour is artificial and mapped in the below sd_init()
6379 * SD_NUMA - describes NUMA topologies 6423 * function:
6380 * SD_SHARE_POWERDOMAIN - describes shared power domain 6424 *
6425 * SD_SHARE_CPUCAPACITY - describes SMT topologies
6426 * SD_SHARE_PKG_RESOURCES - describes shared caches
6427 * SD_NUMA - describes NUMA topologies
6428 * SD_SHARE_POWERDOMAIN - describes shared power domain
6429 * SD_ASYM_CPUCAPACITY - describes mixed capacity topologies
6381 * 6430 *
 6382 * Odd one out: 6431 * Odd one out, which besides describing the topology has a quirk also
6383 * SD_ASYM_PACKING - describes SMT quirks 6432 * prescribes the desired behaviour that goes along with it:
6433 *
6434 * SD_ASYM_PACKING - describes SMT quirks
6384 */ 6435 */
6385#define TOPOLOGY_SD_FLAGS \ 6436#define TOPOLOGY_SD_FLAGS \
6386 (SD_SHARE_CPUCAPACITY | \ 6437 (SD_SHARE_CPUCAPACITY | \
6387 SD_SHARE_PKG_RESOURCES | \ 6438 SD_SHARE_PKG_RESOURCES | \
6388 SD_NUMA | \ 6439 SD_NUMA | \
6389 SD_ASYM_PACKING | \ 6440 SD_ASYM_PACKING | \
6441 SD_ASYM_CPUCAPACITY | \
6390 SD_SHARE_POWERDOMAIN) 6442 SD_SHARE_POWERDOMAIN)
6391 6443
6392static struct sched_domain * 6444static struct sched_domain *
6393sd_init(struct sched_domain_topology_level *tl, int cpu) 6445sd_init(struct sched_domain_topology_level *tl,
6446 const struct cpumask *cpu_map,
6447 struct sched_domain *child, int cpu)
6394{ 6448{
6395 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); 6449 struct sd_data *sdd = &tl->data;
6396 int sd_weight, sd_flags = 0; 6450 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
6451 int sd_id, sd_weight, sd_flags = 0;
6397 6452
6398#ifdef CONFIG_NUMA 6453#ifdef CONFIG_NUMA
6399 /* 6454 /*
@@ -6442,15 +6497,26 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
6442 .smt_gain = 0, 6497 .smt_gain = 0,
6443 .max_newidle_lb_cost = 0, 6498 .max_newidle_lb_cost = 0,
6444 .next_decay_max_lb_cost = jiffies, 6499 .next_decay_max_lb_cost = jiffies,
6500 .child = child,
6445#ifdef CONFIG_SCHED_DEBUG 6501#ifdef CONFIG_SCHED_DEBUG
6446 .name = tl->name, 6502 .name = tl->name,
6447#endif 6503#endif
6448 }; 6504 };
6449 6505
6506 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
6507 sd_id = cpumask_first(sched_domain_span(sd));
6508
6450 /* 6509 /*
6451 * Convert topological properties into behaviour. 6510 * Convert topological properties into behaviour.
6452 */ 6511 */
6453 6512
6513 if (sd->flags & SD_ASYM_CPUCAPACITY) {
6514 struct sched_domain *t = sd;
6515
6516 for_each_lower_domain(t)
6517 t->flags |= SD_BALANCE_WAKE;
6518 }
6519
6454 if (sd->flags & SD_SHARE_CPUCAPACITY) { 6520 if (sd->flags & SD_SHARE_CPUCAPACITY) {
6455 sd->flags |= SD_PREFER_SIBLING; 6521 sd->flags |= SD_PREFER_SIBLING;
6456 sd->imbalance_pct = 110; 6522 sd->imbalance_pct = 110;
@@ -6482,7 +6548,17 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
6482 sd->idle_idx = 1; 6548 sd->idle_idx = 1;
6483 } 6549 }
6484 6550
6485 sd->private = &tl->data; 6551 /*
6552 * For all levels sharing cache; connect a sched_domain_shared
6553 * instance.
6554 */
6555 if (sd->flags & SD_SHARE_PKG_RESOURCES) {
6556 sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
6557 atomic_inc(&sd->shared->ref);
6558 atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
6559 }
6560
6561 sd->private = sdd;
6486 6562
6487 return sd; 6563 return sd;
6488} 6564}
@@ -6509,6 +6585,9 @@ static struct sched_domain_topology_level *sched_domain_topology =
6509 6585
6510void set_sched_topology(struct sched_domain_topology_level *tl) 6586void set_sched_topology(struct sched_domain_topology_level *tl)
6511{ 6587{
6588 if (WARN_ON_ONCE(sched_smp_initialized))
6589 return;
6590
6512 sched_domain_topology = tl; 6591 sched_domain_topology = tl;
6513} 6592}
6514 6593
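set_sched_topology() now refuses, with a one-time warning, to swap the topology table once SMP scheduling is initialized, and SD_ASYM_CPUCAPACITY joins the flags an architecture's table may return; sd_init() above turns that flag into SD_BALANCE_WAKE on every lower level so wakeups can pick a capacity-appropriate CPU. A hypothetical table an asymmetric (big.LITTLE-style) platform might register early in boot; cpu_asym_flags and my_topology are invented names for illustration, the other identifiers are the usual topology helpers:

    static int cpu_asym_flags(void)
    {
            return SD_ASYM_CPUCAPACITY;     /* the level spanning both CPU types */
    }

    static struct sched_domain_topology_level my_topology[] = {
            { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC)  },
            { cpu_cpu_mask,       cpu_asym_flags, SD_INIT_NAME(DIE) },
            { NULL, },
    };

    /* must run before sched_init_smp(), or the new WARN_ON_ONCE() fires */
    set_sched_topology(my_topology);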
@@ -6789,6 +6868,10 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
6789 if (!sdd->sd) 6868 if (!sdd->sd)
6790 return -ENOMEM; 6869 return -ENOMEM;
6791 6870
6871 sdd->sds = alloc_percpu(struct sched_domain_shared *);
6872 if (!sdd->sds)
6873 return -ENOMEM;
6874
6792 sdd->sg = alloc_percpu(struct sched_group *); 6875 sdd->sg = alloc_percpu(struct sched_group *);
6793 if (!sdd->sg) 6876 if (!sdd->sg)
6794 return -ENOMEM; 6877 return -ENOMEM;
@@ -6799,6 +6882,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
6799 6882
6800 for_each_cpu(j, cpu_map) { 6883 for_each_cpu(j, cpu_map) {
6801 struct sched_domain *sd; 6884 struct sched_domain *sd;
6885 struct sched_domain_shared *sds;
6802 struct sched_group *sg; 6886 struct sched_group *sg;
6803 struct sched_group_capacity *sgc; 6887 struct sched_group_capacity *sgc;
6804 6888
@@ -6809,6 +6893,13 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
6809 6893
6810 *per_cpu_ptr(sdd->sd, j) = sd; 6894 *per_cpu_ptr(sdd->sd, j) = sd;
6811 6895
6896 sds = kzalloc_node(sizeof(struct sched_domain_shared),
6897 GFP_KERNEL, cpu_to_node(j));
6898 if (!sds)
6899 return -ENOMEM;
6900
6901 *per_cpu_ptr(sdd->sds, j) = sds;
6902
6812 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 6903 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
6813 GFP_KERNEL, cpu_to_node(j)); 6904 GFP_KERNEL, cpu_to_node(j));
6814 if (!sg) 6905 if (!sg)
@@ -6848,6 +6939,8 @@ static void __sdt_free(const struct cpumask *cpu_map)
6848 kfree(*per_cpu_ptr(sdd->sd, j)); 6939 kfree(*per_cpu_ptr(sdd->sd, j));
6849 } 6940 }
6850 6941
6942 if (sdd->sds)
6943 kfree(*per_cpu_ptr(sdd->sds, j));
6851 if (sdd->sg) 6944 if (sdd->sg)
6852 kfree(*per_cpu_ptr(sdd->sg, j)); 6945 kfree(*per_cpu_ptr(sdd->sg, j));
6853 if (sdd->sgc) 6946 if (sdd->sgc)
@@ -6855,6 +6948,8 @@ static void __sdt_free(const struct cpumask *cpu_map)
6855 } 6948 }
6856 free_percpu(sdd->sd); 6949 free_percpu(sdd->sd);
6857 sdd->sd = NULL; 6950 sdd->sd = NULL;
6951 free_percpu(sdd->sds);
6952 sdd->sds = NULL;
6858 free_percpu(sdd->sg); 6953 free_percpu(sdd->sg);
6859 sdd->sg = NULL; 6954 sdd->sg = NULL;
6860 free_percpu(sdd->sgc); 6955 free_percpu(sdd->sgc);
@@ -6866,16 +6961,12 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6866 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 6961 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
6867 struct sched_domain *child, int cpu) 6962 struct sched_domain *child, int cpu)
6868{ 6963{
6869 struct sched_domain *sd = sd_init(tl, cpu); 6964 struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu);
6870 if (!sd)
6871 return child;
6872 6965
6873 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
6874 if (child) { 6966 if (child) {
6875 sd->level = child->level + 1; 6967 sd->level = child->level + 1;
6876 sched_domain_level_max = max(sched_domain_level_max, sd->level); 6968 sched_domain_level_max = max(sched_domain_level_max, sd->level);
6877 child->parent = sd; 6969 child->parent = sd;
6878 sd->child = child;
6879 6970
6880 if (!cpumask_subset(sched_domain_span(child), 6971 if (!cpumask_subset(sched_domain_span(child),
6881 sched_domain_span(sd))) { 6972 sched_domain_span(sd))) {
@@ -6906,6 +6997,7 @@ static int build_sched_domains(const struct cpumask *cpu_map,
6906 enum s_alloc alloc_state; 6997 enum s_alloc alloc_state;
6907 struct sched_domain *sd; 6998 struct sched_domain *sd;
6908 struct s_data d; 6999 struct s_data d;
7000 struct rq *rq = NULL;
6909 int i, ret = -ENOMEM; 7001 int i, ret = -ENOMEM;
6910 7002
6911 alloc_state = __visit_domain_allocation_hell(&d, cpu_map); 7003 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
@@ -6956,11 +7048,22 @@ static int build_sched_domains(const struct cpumask *cpu_map,
6956 /* Attach the domains */ 7048 /* Attach the domains */
6957 rcu_read_lock(); 7049 rcu_read_lock();
6958 for_each_cpu(i, cpu_map) { 7050 for_each_cpu(i, cpu_map) {
7051 rq = cpu_rq(i);
6959 sd = *per_cpu_ptr(d.sd, i); 7052 sd = *per_cpu_ptr(d.sd, i);
7053
7054 /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
7055 if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
7056 WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
7057
6960 cpu_attach_domain(sd, d.rd, i); 7058 cpu_attach_domain(sd, d.rd, i);
6961 } 7059 }
6962 rcu_read_unlock(); 7060 rcu_read_unlock();
6963 7061
7062 if (rq && sched_debug_enabled) {
7063 pr_info("span: %*pbl (max cpu_capacity = %lu)\n",
7064 cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
7065 }
7066
6964 ret = 0; 7067 ret = 0;
6965error: 7068error:
6966 __free_domain_allocs(&d, alloc_state, cpu_map); 7069 __free_domain_allocs(&d, alloc_state, cpu_map);
@@ -7319,6 +7422,22 @@ int sched_cpu_dying(unsigned int cpu)
7319} 7422}
7320#endif 7423#endif
7321 7424
7425#ifdef CONFIG_SCHED_SMT
7426DEFINE_STATIC_KEY_FALSE(sched_smt_present);
7427
7428static void sched_init_smt(void)
7429{
7430 /*
7431 * We've enumerated all CPUs and will assume that if any CPU
7432 * has SMT siblings, CPU0 will too.
7433 */
7434 if (cpumask_weight(cpu_smt_mask(0)) > 1)
7435 static_branch_enable(&sched_smt_present);
7436}
7437#else
7438static inline void sched_init_smt(void) { }
7439#endif
7440
7322void __init sched_init_smp(void) 7441void __init sched_init_smp(void)
7323{ 7442{
7324 cpumask_var_t non_isolated_cpus; 7443 cpumask_var_t non_isolated_cpus;
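sched_smt_present is a static branch flipped once, after all CPUs have been enumerated, so SMT-only paths cost nothing on machines without siblings. The assumed consumer pattern elsewhere in the scheduler (for example an idle-core scan in the fair class):

    if (static_branch_likely(&sched_smt_present)) {
            /* e.g. walk cpu_smt_mask(target) looking for a fully idle core */
    }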
@@ -7348,6 +7467,9 @@ void __init sched_init_smp(void)
7348 7467
7349 init_sched_rt_class(); 7468 init_sched_rt_class();
7350 init_sched_dl_class(); 7469 init_sched_dl_class();
7470
7471 sched_init_smt();
7472
7351 sched_smp_initialized = true; 7473 sched_smp_initialized = true;
7352} 7474}
7353 7475
@@ -7385,12 +7507,29 @@ static struct kmem_cache *task_group_cache __read_mostly;
7385#endif 7507#endif
7386 7508
7387DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); 7509DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
7510DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
7511
7512#define WAIT_TABLE_BITS 8
7513#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS)
7514static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned;
7515
7516wait_queue_head_t *bit_waitqueue(void *word, int bit)
7517{
7518 const int shift = BITS_PER_LONG == 32 ? 5 : 6;
7519 unsigned long val = (unsigned long)word << shift | bit;
7520
7521 return bit_wait_table + hash_long(val, WAIT_TABLE_BITS);
7522}
7523EXPORT_SYMBOL(bit_waitqueue);
7388 7524
7389void __init sched_init(void) 7525void __init sched_init(void)
7390{ 7526{
7391 int i, j; 7527 int i, j;
7392 unsigned long alloc_size = 0, ptr; 7528 unsigned long alloc_size = 0, ptr;
7393 7529
7530 for (i = 0; i < WAIT_TABLE_SIZE; i++)
7531 init_waitqueue_head(bit_wait_table + i);
7532
7394#ifdef CONFIG_FAIR_GROUP_SCHED 7533#ifdef CONFIG_FAIR_GROUP_SCHED
7395 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 7534 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
7396#endif 7535#endif
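bit_waitqueue() and its 256-entry table of waitqueue heads move into the scheduler here. The bit number is always smaller than BITS_PER_LONG, so it fits into the 5 (32-bit) or 6 (64-bit) low bits opened up by the shift, making the hashed value distinct for each (word, bit) pair before hash_long() folds it into a table index. Usage sketch with a hypothetical flags word:

    static unsigned long my_flags;          /* hypothetical bitmask some code sleeps on */

    static void example_waiter_lookup(void)
    {
            /* the same head wait_on_bit()/wake_up_bit() would use for bit 3 */
            wait_queue_head_t *wq = bit_waitqueue(&my_flags, 3);

            wake_up(wq);    /* crude: wakes every waiter hashed onto this head */
    }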
@@ -7421,6 +7560,8 @@ void __init sched_init(void)
7421 for_each_possible_cpu(i) { 7560 for_each_possible_cpu(i) {
7422 per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node( 7561 per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
7423 cpumask_size(), GFP_KERNEL, cpu_to_node(i)); 7562 cpumask_size(), GFP_KERNEL, cpu_to_node(i));
7563 per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node(
7564 cpumask_size(), GFP_KERNEL, cpu_to_node(i));
7424 } 7565 }
7425#endif /* CONFIG_CPUMASK_OFFSTACK */ 7566#endif /* CONFIG_CPUMASK_OFFSTACK */
7426 7567
@@ -7523,10 +7664,6 @@ void __init sched_init(void)
7523 7664
7524 set_load_weight(&init_task); 7665 set_load_weight(&init_task);
7525 7666
7526#ifdef CONFIG_PREEMPT_NOTIFIERS
7527 INIT_HLIST_HEAD(&init_task.preempt_notifiers);
7528#endif
7529
7530 /* 7667 /*
7531 * The boot idle thread does lazy MMU switching as well: 7668 * The boot idle thread does lazy MMU switching as well:
7532 */ 7669 */
@@ -7534,11 +7671,6 @@ void __init sched_init(void)
7534 enter_lazy_tlb(&init_mm, current); 7671 enter_lazy_tlb(&init_mm, current);
7535 7672
7536 /* 7673 /*
7537 * During early bootup we pretend to be a normal task:
7538 */
7539 current->sched_class = &fair_sched_class;
7540
7541 /*
7542 * Make us the idle thread. Technically, schedule() should not be 7674 * Make us the idle thread. Technically, schedule() should not be
7543 * called from this thread, however somewhere below it might be, 7675 * called from this thread, however somewhere below it might be,
7544 * but because we are the idle thread, we just pick up running again 7676 * but because we are the idle thread, we just pick up running again
@@ -7592,6 +7724,7 @@ EXPORT_SYMBOL(__might_sleep);
7592void ___might_sleep(const char *file, int line, int preempt_offset) 7724void ___might_sleep(const char *file, int line, int preempt_offset)
7593{ 7725{
7594 static unsigned long prev_jiffy; /* ratelimiting */ 7726 static unsigned long prev_jiffy; /* ratelimiting */
7727 unsigned long preempt_disable_ip;
7595 7728
7596 rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ 7729 rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
7597 if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && 7730 if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
@@ -7602,6 +7735,9 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
7602 return; 7735 return;
7603 prev_jiffy = jiffies; 7736 prev_jiffy = jiffies;
7604 7737
7738 /* Save this before calling printk(), since that will clobber it */
7739 preempt_disable_ip = get_preempt_disable_ip(current);
7740
7605 printk(KERN_ERR 7741 printk(KERN_ERR
7606 "BUG: sleeping function called from invalid context at %s:%d\n", 7742 "BUG: sleeping function called from invalid context at %s:%d\n",
7607 file, line); 7743 file, line);
@@ -7616,14 +7752,14 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
7616 debug_show_held_locks(current); 7752 debug_show_held_locks(current);
7617 if (irqs_disabled()) 7753 if (irqs_disabled())
7618 print_irqtrace_events(current); 7754 print_irqtrace_events(current);
7619#ifdef CONFIG_DEBUG_PREEMPT 7755 if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
7620 if (!preempt_count_equals(preempt_offset)) { 7756 && !preempt_count_equals(preempt_offset)) {
7621 pr_err("Preemption disabled at:"); 7757 pr_err("Preemption disabled at:");
7622 print_ip_sym(current->preempt_disable_ip); 7758 print_ip_sym(preempt_disable_ip);
7623 pr_cont("\n"); 7759 pr_cont("\n");
7624 } 7760 }
7625#endif
7626 dump_stack(); 7761 dump_stack();
7762 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
7627} 7763}
7628EXPORT_SYMBOL(___might_sleep); 7764EXPORT_SYMBOL(___might_sleep);
7629#endif 7765#endif
@@ -7644,12 +7780,10 @@ void normalize_rt_tasks(void)
7644 if (p->flags & PF_KTHREAD) 7780 if (p->flags & PF_KTHREAD)
7645 continue; 7781 continue;
7646 7782
7647 p->se.exec_start = 0; 7783 p->se.exec_start = 0;
7648#ifdef CONFIG_SCHEDSTATS 7784 schedstat_set(p->se.statistics.wait_start, 0);
7649 p->se.statistics.wait_start = 0; 7785 schedstat_set(p->se.statistics.sleep_start, 0);
7650 p->se.statistics.sleep_start = 0; 7786 schedstat_set(p->se.statistics.block_start, 0);
7651 p->se.statistics.block_start = 0;
7652#endif
7653 7787
7654 if (!dl_task(p) && !rt_task(p)) { 7788 if (!dl_task(p) && !rt_task(p)) {
7655 /* 7789 /*
@@ -7710,7 +7844,7 @@ struct task_struct *curr_task(int cpu)
7710 * 7844 *
7711 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 7845 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
7712 */ 7846 */
7713void set_curr_task(int cpu, struct task_struct *p) 7847void ia64_set_curr_task(int cpu, struct task_struct *p)
7714{ 7848{
7715 cpu_curr(cpu) = p; 7849 cpu_curr(cpu) = p;
7716} 7850}
@@ -7841,10 +7975,10 @@ void sched_move_task(struct task_struct *tsk)
7841 7975
7842 sched_change_group(tsk, TASK_MOVE_GROUP); 7976 sched_change_group(tsk, TASK_MOVE_GROUP);
7843 7977
7844 if (unlikely(running))
7845 tsk->sched_class->set_curr_task(rq);
7846 if (queued) 7978 if (queued)
7847 enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE); 7979 enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE);
7980 if (unlikely(running))
7981 set_curr_task(rq, tsk);
7848 7982
7849 task_rq_unlock(rq, tsk, &rf); 7983 task_rq_unlock(rq, tsk, &rf);
7850} 7984}
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index d4184498c9f5..e73119013c53 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -31,56 +31,81 @@ static inline int right_child(int i)
31 return (i << 1) + 2; 31 return (i << 1) + 2;
32} 32}
33 33
34static void cpudl_exchange(struct cpudl *cp, int a, int b) 34static void cpudl_heapify_down(struct cpudl *cp, int idx)
35{ 35{
36 int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu; 36 int l, r, largest;
37 37
38 swap(cp->elements[a].cpu, cp->elements[b].cpu); 38 int orig_cpu = cp->elements[idx].cpu;
39 swap(cp->elements[a].dl , cp->elements[b].dl ); 39 u64 orig_dl = cp->elements[idx].dl;
40 40
41 swap(cp->elements[cpu_a].idx, cp->elements[cpu_b].idx); 41 if (left_child(idx) >= cp->size)
42} 42 return;
43
44static void cpudl_heapify(struct cpudl *cp, int idx)
45{
46 int l, r, largest;
47 43
48 /* adapted from lib/prio_heap.c */ 44 /* adapted from lib/prio_heap.c */
49 while(1) { 45 while(1) {
46 u64 largest_dl;
50 l = left_child(idx); 47 l = left_child(idx);
51 r = right_child(idx); 48 r = right_child(idx);
52 largest = idx; 49 largest = idx;
50 largest_dl = orig_dl;
53 51
54 if ((l < cp->size) && dl_time_before(cp->elements[idx].dl, 52 if ((l < cp->size) && dl_time_before(orig_dl,
55 cp->elements[l].dl)) 53 cp->elements[l].dl)) {
56 largest = l; 54 largest = l;
57 if ((r < cp->size) && dl_time_before(cp->elements[largest].dl, 55 largest_dl = cp->elements[l].dl;
58 cp->elements[r].dl)) 56 }
57 if ((r < cp->size) && dl_time_before(largest_dl,
58 cp->elements[r].dl))
59 largest = r; 59 largest = r;
60
60 if (largest == idx) 61 if (largest == idx)
61 break; 62 break;
62 63
63 /* Push idx down the heap one level and bump one up */ 64 /* pull largest child onto idx */
64 cpudl_exchange(cp, largest, idx); 65 cp->elements[idx].cpu = cp->elements[largest].cpu;
66 cp->elements[idx].dl = cp->elements[largest].dl;
67 cp->elements[cp->elements[idx].cpu].idx = idx;
65 idx = largest; 68 idx = largest;
66 } 69 }
70 /* actual push down of saved original values orig_* */
71 cp->elements[idx].cpu = orig_cpu;
72 cp->elements[idx].dl = orig_dl;
73 cp->elements[cp->elements[idx].cpu].idx = idx;
67} 74}
68 75
69static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl) 76static void cpudl_heapify_up(struct cpudl *cp, int idx)
70{ 77{
71 WARN_ON(idx == IDX_INVALID || !cpu_present(idx)); 78 int p;
72 79
73 if (dl_time_before(new_dl, cp->elements[idx].dl)) { 80 int orig_cpu = cp->elements[idx].cpu;
74 cp->elements[idx].dl = new_dl; 81 u64 orig_dl = cp->elements[idx].dl;
75 cpudl_heapify(cp, idx); 82
76 } else { 83 if (idx == 0)
77 cp->elements[idx].dl = new_dl; 84 return;
78 while (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl, 85
79 cp->elements[idx].dl)) { 86 do {
80 cpudl_exchange(cp, idx, parent(idx)); 87 p = parent(idx);
81 idx = parent(idx); 88 if (dl_time_before(orig_dl, cp->elements[p].dl))
82 } 89 break;
83 } 90 /* pull parent onto idx */
91 cp->elements[idx].cpu = cp->elements[p].cpu;
92 cp->elements[idx].dl = cp->elements[p].dl;
93 cp->elements[cp->elements[idx].cpu].idx = idx;
94 idx = p;
95 } while (idx != 0);
96 /* actual push up of saved original values orig_* */
97 cp->elements[idx].cpu = orig_cpu;
98 cp->elements[idx].dl = orig_dl;
99 cp->elements[cp->elements[idx].cpu].idx = idx;
100}
101
102static void cpudl_heapify(struct cpudl *cp, int idx)
103{
104 if (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl,
105 cp->elements[idx].dl))
106 cpudl_heapify_up(cp, idx);
107 else
108 cpudl_heapify_down(cp, idx);
84} 109}
85 110
86static inline int cpudl_maximum(struct cpudl *cp) 111static inline int cpudl_maximum(struct cpudl *cp)
@@ -120,16 +145,15 @@ out:
120} 145}
121 146
122/* 147/*
123 * cpudl_set - update the cpudl max-heap 148 * cpudl_clear - remove a cpu from the cpudl max-heap
124 * @cp: the cpudl max-heap context 149 * @cp: the cpudl max-heap context
125 * @cpu: the target cpu 150 * @cpu: the target cpu
126 * @dl: the new earliest deadline for this cpu
127 * 151 *
128 * Notes: assumes cpu_rq(cpu)->lock is locked 152 * Notes: assumes cpu_rq(cpu)->lock is locked
129 * 153 *
130 * Returns: (void) 154 * Returns: (void)
131 */ 155 */
132void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) 156void cpudl_clear(struct cpudl *cp, int cpu)
133{ 157{
134 int old_idx, new_cpu; 158 int old_idx, new_cpu;
135 unsigned long flags; 159 unsigned long flags;
@@ -137,47 +161,60 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
137 WARN_ON(!cpu_present(cpu)); 161 WARN_ON(!cpu_present(cpu));
138 162
139 raw_spin_lock_irqsave(&cp->lock, flags); 163 raw_spin_lock_irqsave(&cp->lock, flags);
164
140 old_idx = cp->elements[cpu].idx; 165 old_idx = cp->elements[cpu].idx;
141 if (!is_valid) { 166 if (old_idx == IDX_INVALID) {
142 /* remove item */ 167 /*
143 if (old_idx == IDX_INVALID) { 168 * Nothing to remove if old_idx was invalid.
144 /* 169 * This could happen if a rq_offline_dl is
145 * Nothing to remove if old_idx was invalid. 170 * called for a CPU without -dl tasks running.
146 * This could happen if a rq_offline_dl is 171 */
147 * called for a CPU without -dl tasks running. 172 } else {
148 */
149 goto out;
150 }
151 new_cpu = cp->elements[cp->size - 1].cpu; 173 new_cpu = cp->elements[cp->size - 1].cpu;
152 cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl; 174 cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl;
153 cp->elements[old_idx].cpu = new_cpu; 175 cp->elements[old_idx].cpu = new_cpu;
154 cp->size--; 176 cp->size--;
155 cp->elements[new_cpu].idx = old_idx; 177 cp->elements[new_cpu].idx = old_idx;
156 cp->elements[cpu].idx = IDX_INVALID; 178 cp->elements[cpu].idx = IDX_INVALID;
157 while (old_idx > 0 && dl_time_before( 179 cpudl_heapify(cp, old_idx);
158 cp->elements[parent(old_idx)].dl,
159 cp->elements[old_idx].dl)) {
160 cpudl_exchange(cp, old_idx, parent(old_idx));
161 old_idx = parent(old_idx);
162 }
163 cpumask_set_cpu(cpu, cp->free_cpus);
164 cpudl_heapify(cp, old_idx);
165 180
166 goto out; 181 cpumask_set_cpu(cpu, cp->free_cpus);
167 } 182 }
183 raw_spin_unlock_irqrestore(&cp->lock, flags);
184}
185
186/*
187 * cpudl_set - update the cpudl max-heap
188 * @cp: the cpudl max-heap context
189 * @cpu: the target cpu
190 * @dl: the new earliest deadline for this cpu
191 *
192 * Notes: assumes cpu_rq(cpu)->lock is locked
193 *
194 * Returns: (void)
195 */
196void cpudl_set(struct cpudl *cp, int cpu, u64 dl)
197{
198 int old_idx;
199 unsigned long flags;
168 200
201 WARN_ON(!cpu_present(cpu));
202
203 raw_spin_lock_irqsave(&cp->lock, flags);
204
205 old_idx = cp->elements[cpu].idx;
169 if (old_idx == IDX_INVALID) { 206 if (old_idx == IDX_INVALID) {
170 cp->size++; 207 int new_idx = cp->size++;
171 cp->elements[cp->size - 1].dl = dl; 208 cp->elements[new_idx].dl = dl;
172 cp->elements[cp->size - 1].cpu = cpu; 209 cp->elements[new_idx].cpu = cpu;
173 cp->elements[cpu].idx = cp->size - 1; 210 cp->elements[cpu].idx = new_idx;
174 cpudl_change_key(cp, cp->size - 1, dl); 211 cpudl_heapify_up(cp, new_idx);
175 cpumask_clear_cpu(cpu, cp->free_cpus); 212 cpumask_clear_cpu(cpu, cp->free_cpus);
176 } else { 213 } else {
177 cpudl_change_key(cp, old_idx, dl); 214 cp->elements[old_idx].dl = dl;
215 cpudl_heapify(cp, old_idx);
178 } 216 }
179 217
180out:
181 raw_spin_unlock_irqrestore(&cp->lock, flags); 218 raw_spin_unlock_irqrestore(&cp->lock, flags);
182} 219}
183 220
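The cpudeadline rewrite does two things: it splits the old is_valid parameter into separate cpudl_set()/cpudl_clear() entry points, and it replaces swap-based sifting with "hole" sifting, where the element being moved is saved once, children (or parents) are pulled into the hole, and the saved element is written exactly once at the end while the cpu-to-index map is kept in step. A self-contained sketch of the same sift-down idea on a plain max-heap of u64 keys (generic illustration, not the kernel structure with its cpu/idx bookkeeping):

    static void sift_down(u64 *key, int size, int idx)
    {
            u64 orig = key[idx];                    /* element being pushed down */

            for (;;) {
                    int l = 2 * idx + 1, r = 2 * idx + 2, largest = idx;
                    u64 largest_key = orig;

                    if (l < size && key[l] > largest_key) {
                            largest = l;
                            largest_key = key[l];
                    }
                    if (r < size && key[r] > largest_key)
                            largest = r;
                    if (largest == idx)
                            break;

                    key[idx] = key[largest];        /* pull the larger child into the hole */
                    idx = largest;
            }
            key[idx] = orig;                        /* single store of the saved element */
    }

After the split the callers become symmetric: cpudl_set(cp, cpu, dl) when a CPU gains a -dl task or its earliest deadline changes, cpudl_clear(cp, cpu) when it no longer has one.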
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index fcbdf83fed7e..f7da8c55bba0 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -23,7 +23,8 @@ struct cpudl {
23#ifdef CONFIG_SMP 23#ifdef CONFIG_SMP
24int cpudl_find(struct cpudl *cp, struct task_struct *p, 24int cpudl_find(struct cpudl *cp, struct task_struct *p,
25 struct cpumask *later_mask); 25 struct cpumask *later_mask);
26void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid); 26void cpudl_set(struct cpudl *cp, int cpu, u64 dl);
27void cpudl_clear(struct cpudl *cp, int cpu);
27int cpudl_init(struct cpudl *cp); 28int cpudl_init(struct cpudl *cp);
28void cpudl_set_freecpu(struct cpudl *cp, int cpu); 29void cpudl_set_freecpu(struct cpudl *cp, int cpu);
29void cpudl_clear_freecpu(struct cpudl *cp, int cpu); 30void cpudl_clear_freecpu(struct cpudl *cp, int cpu);
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c
index 1141954e73b4..dbc51442ecbc 100644
--- a/kernel/sched/cpufreq.c
+++ b/kernel/sched/cpufreq.c
@@ -33,7 +33,7 @@ DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
33 */ 33 */
34void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data, 34void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data,
35 void (*func)(struct update_util_data *data, u64 time, 35 void (*func)(struct update_util_data *data, u64 time,
36 unsigned long util, unsigned long max)) 36 unsigned int flags))
37{ 37{
38 if (WARN_ON(!data || !func)) 38 if (WARN_ON(!data || !func))
39 return; 39 return;
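The update hook no longer receives a (util, max) pair; it only gets a flags word, and the governor pulls utilization itself (see sugov_get_util() in the next file). Call sites in the scheduler are therefore expected to look roughly like the following; the SCHED_CPUFREQ_* names are taken from their uses in this diff, the exact call sites are an assumption:

    cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);  /* CFS enqueue of a task waking from I/O */
    cpufreq_update_util(rq, SCHED_CPUFREQ_RT);      /* RT class needs the CPU                */
    cpufreq_update_util(rq, SCHED_CPUFREQ_DL);      /* deadline class needs the CPU          */
    cpufreq_update_util(rq, 0);                     /* ordinary CFS utilization update       */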
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index a84641b222c1..69e06898997d 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -12,7 +12,6 @@
12#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 12#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
13 13
14#include <linux/cpufreq.h> 14#include <linux/cpufreq.h>
15#include <linux/module.h>
16#include <linux/slab.h> 15#include <linux/slab.h>
17#include <trace/events/power.h> 16#include <trace/events/power.h>
18 17
@@ -48,11 +47,14 @@ struct sugov_cpu {
48 struct sugov_policy *sg_policy; 47 struct sugov_policy *sg_policy;
49 48
50 unsigned int cached_raw_freq; 49 unsigned int cached_raw_freq;
50 unsigned long iowait_boost;
51 unsigned long iowait_boost_max;
52 u64 last_update;
51 53
52 /* The fields below are only needed when sharing a policy. */ 54 /* The fields below are only needed when sharing a policy. */
53 unsigned long util; 55 unsigned long util;
54 unsigned long max; 56 unsigned long max;
55 u64 last_update; 57 unsigned int flags;
56}; 58};
57 59
58static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu); 60static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu);
@@ -144,24 +146,75 @@ static unsigned int get_next_freq(struct sugov_cpu *sg_cpu, unsigned long util,
144 return cpufreq_driver_resolve_freq(policy, freq); 146 return cpufreq_driver_resolve_freq(policy, freq);
145} 147}
146 148
149static void sugov_get_util(unsigned long *util, unsigned long *max)
150{
151 struct rq *rq = this_rq();
152 unsigned long cfs_max;
153
154 cfs_max = arch_scale_cpu_capacity(NULL, smp_processor_id());
155
156 *util = min(rq->cfs.avg.util_avg, cfs_max);
157 *max = cfs_max;
158}
159
160static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
161 unsigned int flags)
162{
163 if (flags & SCHED_CPUFREQ_IOWAIT) {
164 sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
165 } else if (sg_cpu->iowait_boost) {
166 s64 delta_ns = time - sg_cpu->last_update;
167
 168 /* Clear iowait_boost if the CPU appears to have been idle. */
169 if (delta_ns > TICK_NSEC)
170 sg_cpu->iowait_boost = 0;
171 }
172}
173
174static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
175 unsigned long *max)
176{
177 unsigned long boost_util = sg_cpu->iowait_boost;
178 unsigned long boost_max = sg_cpu->iowait_boost_max;
179
180 if (!boost_util)
181 return;
182
183 if (*util * boost_max < *max * boost_util) {
184 *util = boost_util;
185 *max = boost_max;
186 }
187 sg_cpu->iowait_boost >>= 1;
188}
189
147static void sugov_update_single(struct update_util_data *hook, u64 time, 190static void sugov_update_single(struct update_util_data *hook, u64 time,
148 unsigned long util, unsigned long max) 191 unsigned int flags)
149{ 192{
150 struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); 193 struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
151 struct sugov_policy *sg_policy = sg_cpu->sg_policy; 194 struct sugov_policy *sg_policy = sg_cpu->sg_policy;
152 struct cpufreq_policy *policy = sg_policy->policy; 195 struct cpufreq_policy *policy = sg_policy->policy;
196 unsigned long util, max;
153 unsigned int next_f; 197 unsigned int next_f;
154 198
199 sugov_set_iowait_boost(sg_cpu, time, flags);
200 sg_cpu->last_update = time;
201
155 if (!sugov_should_update_freq(sg_policy, time)) 202 if (!sugov_should_update_freq(sg_policy, time))
156 return; 203 return;
157 204
158 next_f = util == ULONG_MAX ? policy->cpuinfo.max_freq : 205 if (flags & SCHED_CPUFREQ_RT_DL) {
159 get_next_freq(sg_cpu, util, max); 206 next_f = policy->cpuinfo.max_freq;
207 } else {
208 sugov_get_util(&util, &max);
209 sugov_iowait_boost(sg_cpu, &util, &max);
210 next_f = get_next_freq(sg_cpu, util, max);
211 }
160 sugov_update_commit(sg_policy, time, next_f); 212 sugov_update_commit(sg_policy, time, next_f);
161} 213}
162 214
163static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, 215static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu,
164 unsigned long util, unsigned long max) 216 unsigned long util, unsigned long max,
217 unsigned int flags)
165{ 218{
166 struct sugov_policy *sg_policy = sg_cpu->sg_policy; 219 struct sugov_policy *sg_policy = sg_cpu->sg_policy;
167 struct cpufreq_policy *policy = sg_policy->policy; 220 struct cpufreq_policy *policy = sg_policy->policy;
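The iowait boost added above acts as a decaying floor on the requested frequency: an update flagged SCHED_CPUFREQ_IOWAIT sets the boost to the policy's maximum frequency, each subsequent update keeps whichever of boost/boost_max and util/max is the larger ratio and then halves the boost, and a CPU idle for more than a tick loses the boost entirely. A worked decay with hypothetical numbers:

    /* iowait_boost_max = 1600000 kHz, real utilization stuck at 25% of max:
     * update 1: boost 1600000 -> floor is 100% of max, boost halves to 800000
     * update 2: boost  800000 -> floor is  50% of max, boost halves to 400000
     * update 3: boost  400000 -> 25% no longer beats real util, halves to 200000
     * ...and so on, halving on every update until it reaches 0 or a new
     * SCHED_CPUFREQ_IOWAIT update resets it to iowait_boost_max.
     */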
@@ -169,9 +222,11 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu,
169 u64 last_freq_update_time = sg_policy->last_freq_update_time; 222 u64 last_freq_update_time = sg_policy->last_freq_update_time;
170 unsigned int j; 223 unsigned int j;
171 224
172 if (util == ULONG_MAX) 225 if (flags & SCHED_CPUFREQ_RT_DL)
173 return max_f; 226 return max_f;
174 227
228 sugov_iowait_boost(sg_cpu, &util, &max);
229
175 for_each_cpu(j, policy->cpus) { 230 for_each_cpu(j, policy->cpus) {
176 struct sugov_cpu *j_sg_cpu; 231 struct sugov_cpu *j_sg_cpu;
177 unsigned long j_util, j_max; 232 unsigned long j_util, j_max;
@@ -186,41 +241,50 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu,
186 * frequency update and the time elapsed between the last update 241 * frequency update and the time elapsed between the last update
187 * of the CPU utilization and the last frequency update is long 242 * of the CPU utilization and the last frequency update is long
188 * enough, don't take the CPU into account as it probably is 243 * enough, don't take the CPU into account as it probably is
189 * idle now. 244 * idle now (and clear iowait_boost for it).
190 */ 245 */
191 delta_ns = last_freq_update_time - j_sg_cpu->last_update; 246 delta_ns = last_freq_update_time - j_sg_cpu->last_update;
192 if (delta_ns > TICK_NSEC) 247 if (delta_ns > TICK_NSEC) {
248 j_sg_cpu->iowait_boost = 0;
193 continue; 249 continue;
194 250 }
195 j_util = j_sg_cpu->util; 251 if (j_sg_cpu->flags & SCHED_CPUFREQ_RT_DL)
196 if (j_util == ULONG_MAX)
197 return max_f; 252 return max_f;
198 253
254 j_util = j_sg_cpu->util;
199 j_max = j_sg_cpu->max; 255 j_max = j_sg_cpu->max;
200 if (j_util * max > j_max * util) { 256 if (j_util * max > j_max * util) {
201 util = j_util; 257 util = j_util;
202 max = j_max; 258 max = j_max;
203 } 259 }
260
261 sugov_iowait_boost(j_sg_cpu, &util, &max);
204 } 262 }
205 263
206 return get_next_freq(sg_cpu, util, max); 264 return get_next_freq(sg_cpu, util, max);
207} 265}
208 266
209static void sugov_update_shared(struct update_util_data *hook, u64 time, 267static void sugov_update_shared(struct update_util_data *hook, u64 time,
210 unsigned long util, unsigned long max) 268 unsigned int flags)
211{ 269{
212 struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); 270 struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
213 struct sugov_policy *sg_policy = sg_cpu->sg_policy; 271 struct sugov_policy *sg_policy = sg_cpu->sg_policy;
272 unsigned long util, max;
214 unsigned int next_f; 273 unsigned int next_f;
215 274
275 sugov_get_util(&util, &max);
276
216 raw_spin_lock(&sg_policy->update_lock); 277 raw_spin_lock(&sg_policy->update_lock);
217 278
218 sg_cpu->util = util; 279 sg_cpu->util = util;
219 sg_cpu->max = max; 280 sg_cpu->max = max;
281 sg_cpu->flags = flags;
282
283 sugov_set_iowait_boost(sg_cpu, time, flags);
220 sg_cpu->last_update = time; 284 sg_cpu->last_update = time;
221 285
222 if (sugov_should_update_freq(sg_policy, time)) { 286 if (sugov_should_update_freq(sg_policy, time)) {
223 next_f = sugov_next_freq_shared(sg_cpu, util, max); 287 next_f = sugov_next_freq_shared(sg_cpu, util, max, flags);
224 sugov_update_commit(sg_policy, time, next_f); 288 sugov_update_commit(sg_policy, time, next_f);
225 } 289 }
226 290
@@ -444,10 +508,13 @@ static int sugov_start(struct cpufreq_policy *policy)
444 508
445 sg_cpu->sg_policy = sg_policy; 509 sg_cpu->sg_policy = sg_policy;
446 if (policy_is_shared(policy)) { 510 if (policy_is_shared(policy)) {
447 sg_cpu->util = ULONG_MAX; 511 sg_cpu->util = 0;
448 sg_cpu->max = 0; 512 sg_cpu->max = 0;
513 sg_cpu->flags = SCHED_CPUFREQ_RT;
449 sg_cpu->last_update = 0; 514 sg_cpu->last_update = 0;
450 sg_cpu->cached_raw_freq = 0; 515 sg_cpu->cached_raw_freq = 0;
516 sg_cpu->iowait_boost = 0;
517 sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
451 cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, 518 cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
452 sugov_update_shared); 519 sugov_update_shared);
453 } else { 520 } else {
@@ -495,28 +562,15 @@ static struct cpufreq_governor schedutil_gov = {
495 .limits = sugov_limits, 562 .limits = sugov_limits,
496}; 563};
497 564
498static int __init sugov_module_init(void)
499{
500 return cpufreq_register_governor(&schedutil_gov);
501}
502
503static void __exit sugov_module_exit(void)
504{
505 cpufreq_unregister_governor(&schedutil_gov);
506}
507
508MODULE_AUTHOR("Rafael J. Wysocki <rafael.j.wysocki@intel.com>");
509MODULE_DESCRIPTION("Utilization-based CPU frequency selection");
510MODULE_LICENSE("GPL");
511
512#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL 565#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
513struct cpufreq_governor *cpufreq_default_governor(void) 566struct cpufreq_governor *cpufreq_default_governor(void)
514{ 567{
515 return &schedutil_gov; 568 return &schedutil_gov;
516} 569}
517
518fs_initcall(sugov_module_init);
519#else
520module_init(sugov_module_init);
521#endif 570#endif
522module_exit(sugov_module_exit); 571
572static int __init sugov_register(void)
573{
574 return cpufreq_register_governor(&schedutil_gov);
575}
576fs_initcall(sugov_register);
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index a846cf89eb96..5ebee3164e64 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -23,10 +23,8 @@
23 * task when irq is in progress while we read rq->clock. That is a worthy 23 * task when irq is in progress while we read rq->clock. That is a worthy
24 * compromise in place of having locks on each irq in account_system_time. 24 * compromise in place of having locks on each irq in account_system_time.
25 */ 25 */
26DEFINE_PER_CPU(u64, cpu_hardirq_time); 26DEFINE_PER_CPU(struct irqtime, cpu_irqtime);
27DEFINE_PER_CPU(u64, cpu_softirq_time);
28 27
29static DEFINE_PER_CPU(u64, irq_start_time);
30static int sched_clock_irqtime; 28static int sched_clock_irqtime;
31 29
32void enable_sched_clock_irqtime(void) 30void enable_sched_clock_irqtime(void)
@@ -39,16 +37,13 @@ void disable_sched_clock_irqtime(void)
39 sched_clock_irqtime = 0; 37 sched_clock_irqtime = 0;
40} 38}
41 39
42#ifndef CONFIG_64BIT
43DEFINE_PER_CPU(seqcount_t, irq_time_seq);
44#endif /* CONFIG_64BIT */
45
46/* 40/*
47 * Called before incrementing preempt_count on {soft,}irq_enter 41 * Called before incrementing preempt_count on {soft,}irq_enter
48 * and before decrementing preempt_count on {soft,}irq_exit. 42 * and before decrementing preempt_count on {soft,}irq_exit.
49 */ 43 */
50void irqtime_account_irq(struct task_struct *curr) 44void irqtime_account_irq(struct task_struct *curr)
51{ 45{
46 struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
52 s64 delta; 47 s64 delta;
53 int cpu; 48 int cpu;
54 49
@@ -56,10 +51,10 @@ void irqtime_account_irq(struct task_struct *curr)
56 return; 51 return;
57 52
58 cpu = smp_processor_id(); 53 cpu = smp_processor_id();
59 delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); 54 delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
60 __this_cpu_add(irq_start_time, delta); 55 irqtime->irq_start_time += delta;
61 56
62 irq_time_write_begin(); 57 u64_stats_update_begin(&irqtime->sync);
63 /* 58 /*
64 * We do not account for softirq time from ksoftirqd here. 59 * We do not account for softirq time from ksoftirqd here.
65 * We want to continue accounting softirq time to ksoftirqd thread 60 * We want to continue accounting softirq time to ksoftirqd thread
@@ -67,42 +62,36 @@ void irqtime_account_irq(struct task_struct *curr)
67 * that do not consume any time, but still wants to run. 62 * that do not consume any time, but still wants to run.
68 */ 63 */
69 if (hardirq_count()) 64 if (hardirq_count())
70 __this_cpu_add(cpu_hardirq_time, delta); 65 irqtime->hardirq_time += delta;
71 else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) 66 else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
72 __this_cpu_add(cpu_softirq_time, delta); 67 irqtime->softirq_time += delta;
73 68
74 irq_time_write_end(); 69 u64_stats_update_end(&irqtime->sync);
75} 70}
76EXPORT_SYMBOL_GPL(irqtime_account_irq); 71EXPORT_SYMBOL_GPL(irqtime_account_irq);
77 72
78static cputime_t irqtime_account_hi_update(cputime_t maxtime) 73static cputime_t irqtime_account_update(u64 irqtime, int idx, cputime_t maxtime)
79{ 74{
80 u64 *cpustat = kcpustat_this_cpu->cpustat; 75 u64 *cpustat = kcpustat_this_cpu->cpustat;
81 unsigned long flags;
82 cputime_t irq_cputime; 76 cputime_t irq_cputime;
83 77
84 local_irq_save(flags); 78 irq_cputime = nsecs_to_cputime64(irqtime) - cpustat[idx];
85 irq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_hardirq_time)) -
86 cpustat[CPUTIME_IRQ];
87 irq_cputime = min(irq_cputime, maxtime); 79 irq_cputime = min(irq_cputime, maxtime);
88 cpustat[CPUTIME_IRQ] += irq_cputime; 80 cpustat[idx] += irq_cputime;
89 local_irq_restore(flags); 81
90 return irq_cputime; 82 return irq_cputime;
91} 83}
92 84
93static cputime_t irqtime_account_si_update(cputime_t maxtime) 85static cputime_t irqtime_account_hi_update(cputime_t maxtime)
94{ 86{
95 u64 *cpustat = kcpustat_this_cpu->cpustat; 87 return irqtime_account_update(__this_cpu_read(cpu_irqtime.hardirq_time),
96 unsigned long flags; 88 CPUTIME_IRQ, maxtime);
97 cputime_t softirq_cputime; 89}
98 90
99 local_irq_save(flags); 91static cputime_t irqtime_account_si_update(cputime_t maxtime)
100 softirq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_softirq_time)) - 92{
101 cpustat[CPUTIME_SOFTIRQ]; 93 return irqtime_account_update(__this_cpu_read(cpu_irqtime.softirq_time),
102 softirq_cputime = min(softirq_cputime, maxtime); 94 CPUTIME_SOFTIRQ, maxtime);
103 cpustat[CPUTIME_SOFTIRQ] += softirq_cputime;
104 local_irq_restore(flags);
105 return softirq_cputime;
106} 95}
107 96
108#else /* CONFIG_IRQ_TIME_ACCOUNTING */ 97#else /* CONFIG_IRQ_TIME_ACCOUNTING */
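The three loose per-cpu variables (hardirq time, softirq time, start timestamp) and the hand-rolled irq_time_seq collapse into one per-cpu struct irqtime protected by u64_stats_sync, which is a real seqcount only on 32-bit and free on 64-bit. Assumed layout and reader, matching the writer shown above; the struct and helper live outside this file, so this is a sketch rather than a quote:

    struct irqtime {
            u64                     hardirq_time;
            u64                     softirq_time;
            u64                     irq_start_time;
            struct u64_stats_sync   sync;
    };

    DECLARE_PER_CPU(struct irqtime, cpu_irqtime);

    /* reader sketch: retry until a consistent snapshot is seen (no-op on 64-bit) */
    static inline u64 irq_time_read(int cpu)
    {
            struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
            unsigned int seq;
            u64 total;

            do {
                    seq = u64_stats_fetch_begin(&irqtime->sync);
                    total = irqtime->softirq_time + irqtime->hardirq_time;
            } while (u64_stats_fetch_retry(&irqtime->sync, seq));

            return total;
    }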
@@ -295,6 +284,9 @@ static inline cputime_t account_other_time(cputime_t max)
295{ 284{
296 cputime_t accounted; 285 cputime_t accounted;
297 286
287 /* Shall be converted to a lockdep-enabled lightweight check */
288 WARN_ON_ONCE(!irqs_disabled());
289
298 accounted = steal_account_process_time(max); 290 accounted = steal_account_process_time(max);
299 291
300 if (accounted < max) 292 if (accounted < max)
@@ -306,6 +298,26 @@ static inline cputime_t account_other_time(cputime_t max)
306 return accounted; 298 return accounted;
307} 299}
308 300
301#ifdef CONFIG_64BIT
302static inline u64 read_sum_exec_runtime(struct task_struct *t)
303{
304 return t->se.sum_exec_runtime;
305}
306#else
307static u64 read_sum_exec_runtime(struct task_struct *t)
308{
309 u64 ns;
310 struct rq_flags rf;
311 struct rq *rq;
312
313 rq = task_rq_lock(t, &rf);
314 ns = t->se.sum_exec_runtime;
315 task_rq_unlock(rq, t, &rf);
316
317 return ns;
318}
319#endif
320
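The 64-bit/32-bit split exists because sum_exec_runtime is a u64 the scheduler keeps updating: a 64-bit load of it is single-copy atomic, but a 32-bit reader performing two loads could see a torn value, so that case takes the task's rq lock (the old task_sched_runtime() call did that unconditionally, and also updated the clocks). Illustrative hazard, with made-up values:

    /* writer moves the counter from 0x00000001FFFFFFFF to 0x0000000200000000;
     * a lockless 32-bit reader interleaving its two loads could observe
     * 0x00000002FFFFFFFF, i.e. roughly four seconds of phantom runtime. */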
309/* 321/*
310 * Accumulate raw cputime values of dead tasks (sig->[us]time) and live 322 * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
311 * tasks (sum on group iteration) belonging to @tsk's group. 323 * tasks (sum on group iteration) belonging to @tsk's group.
@@ -318,6 +330,17 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
318 unsigned int seq, nextseq; 330 unsigned int seq, nextseq;
319 unsigned long flags; 331 unsigned long flags;
320 332
333 /*
334 * Update current task runtime to account pending time since last
335 * scheduler action or thread_group_cputime() call. This thread group
336 * might have other running tasks on different CPUs, but updating
 337 * their runtime can affect syscall performance, so we skip accounting
338 * those pending times and rely only on values updated on tick or
339 * other scheduler action.
340 */
341 if (same_thread_group(current, tsk))
342 (void) task_sched_runtime(current);
343
321 rcu_read_lock(); 344 rcu_read_lock();
322 /* Attempt a lockless read on the first round. */ 345 /* Attempt a lockless read on the first round. */
323 nextseq = 0; 346 nextseq = 0;
@@ -332,7 +355,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
332 task_cputime(t, &utime, &stime); 355 task_cputime(t, &utime, &stime);
333 times->utime += utime; 356 times->utime += utime;
334 times->stime += stime; 357 times->stime += stime;
335 times->sum_exec_runtime += task_sched_runtime(t); 358 times->sum_exec_runtime += read_sum_exec_runtime(t);
336 } 359 }
337 /* If lockless access failed, take the lock. */ 360 /* If lockless access failed, take the lock. */
338 nextseq = 1; 361 nextseq = 1;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 1ce8867283dc..37e2449186c4 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -243,10 +243,8 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq);
243static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p) 243static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p)
244{ 244{
245 struct rq *later_rq = NULL; 245 struct rq *later_rq = NULL;
246 bool fallback = false;
247 246
248 later_rq = find_lock_later_rq(p, rq); 247 later_rq = find_lock_later_rq(p, rq);
249
250 if (!later_rq) { 248 if (!later_rq) {
251 int cpu; 249 int cpu;
252 250
@@ -254,7 +252,6 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
254 * If we cannot preempt any rq, fall back to pick any 252 * If we cannot preempt any rq, fall back to pick any
255 * online cpu. 253 * online cpu.
256 */ 254 */
257 fallback = true;
258 cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p)); 255 cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p));
259 if (cpu >= nr_cpu_ids) { 256 if (cpu >= nr_cpu_ids) {
260 /* 257 /*
@@ -274,16 +271,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
274 double_lock_balance(rq, later_rq); 271 double_lock_balance(rq, later_rq);
275 } 272 }
276 273
277 /*
278 * By now the task is replenished and enqueued; migrate it.
279 */
280 deactivate_task(rq, p, 0);
281 set_task_cpu(p, later_rq->cpu); 274 set_task_cpu(p, later_rq->cpu);
282 activate_task(later_rq, p, 0);
283
284 if (!fallback)
285 resched_curr(later_rq);
286
287 double_unlock_balance(later_rq, rq); 275 double_unlock_balance(later_rq, rq);
288 276
289 return later_rq; 277 return later_rq;
@@ -346,12 +334,12 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
346 * one, and to (try to!) reconcile itself with its own scheduling 334 * one, and to (try to!) reconcile itself with its own scheduling
347 * parameters. 335 * parameters.
348 */ 336 */
349static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se, 337static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
350 struct sched_dl_entity *pi_se)
351{ 338{
352 struct dl_rq *dl_rq = dl_rq_of_se(dl_se); 339 struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
353 struct rq *rq = rq_of_dl_rq(dl_rq); 340 struct rq *rq = rq_of_dl_rq(dl_rq);
354 341
342 WARN_ON(dl_se->dl_boosted);
355 WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline)); 343 WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline));
356 344
357 /* 345 /*
@@ -367,8 +355,8 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
367 * future; in fact, we must consider execution overheads (time 355 * future; in fact, we must consider execution overheads (time
368 * spent on hardirq context, etc.). 356 * spent on hardirq context, etc.).
369 */ 357 */
370 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; 358 dl_se->deadline = rq_clock(rq) + dl_se->dl_deadline;
371 dl_se->runtime = pi_se->dl_runtime; 359 dl_se->runtime = dl_se->dl_runtime;
372} 360}
373 361
374/* 362/*
@@ -641,29 +629,31 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
641 goto unlock; 629 goto unlock;
642 } 630 }
643 631
644 enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
645 if (dl_task(rq->curr))
646 check_preempt_curr_dl(rq, p, 0);
647 else
648 resched_curr(rq);
649
650#ifdef CONFIG_SMP 632#ifdef CONFIG_SMP
651 /*
652 * Perform balancing operations here; after the replenishments. We
653 * cannot drop rq->lock before this, otherwise the assertion in
654 * start_dl_timer() about not missing updates is not true.
655 *
656 * If we find that the rq the task was on is no longer available, we
657 * need to select a new rq.
658 *
659 * XXX figure out if select_task_rq_dl() deals with offline cpus.
660 */
661 if (unlikely(!rq->online)) { 633 if (unlikely(!rq->online)) {
634 /*
635 * If the runqueue is no longer available, migrate the
636 * task elsewhere. This necessarily changes rq.
637 */
662 lockdep_unpin_lock(&rq->lock, rf.cookie); 638 lockdep_unpin_lock(&rq->lock, rf.cookie);
663 rq = dl_task_offline_migration(rq, p); 639 rq = dl_task_offline_migration(rq, p);
664 rf.cookie = lockdep_pin_lock(&rq->lock); 640 rf.cookie = lockdep_pin_lock(&rq->lock);
641
642 /*
643 * Now that the task has been migrated to the new RQ and we
644 * have that locked, proceed as normal and enqueue the task
645 * there.
646 */
665 } 647 }
648#endif
649
650 enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
651 if (dl_task(rq->curr))
652 check_preempt_curr_dl(rq, p, 0);
653 else
654 resched_curr(rq);
666 655
656#ifdef CONFIG_SMP
667 /* 657 /*
668 * Queueing this task back might have overloaded rq, check if we need 658 * Queueing this task back might have overloaded rq, check if we need
669 * to kick someone away. 659 * to kick someone away.
@@ -735,9 +725,8 @@ static void update_curr_dl(struct rq *rq)
735 return; 725 return;
736 } 726 }
737 727
738 /* kick cpufreq (see the comment in linux/cpufreq.h). */ 728 /* kick cpufreq (see the comment in kernel/sched/sched.h). */
739 if (cpu_of(rq) == smp_processor_id()) 729 cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_DL);
740 cpufreq_trigger_update(rq_clock(rq));
741 730
742 schedstat_set(curr->se.statistics.exec_max, 731 schedstat_set(curr->se.statistics.exec_max,
743 max(curr->se.statistics.exec_max, delta_exec)); 732 max(curr->se.statistics.exec_max, delta_exec));
@@ -798,7 +787,7 @@ static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
798 if (dl_rq->earliest_dl.curr == 0 || 787 if (dl_rq->earliest_dl.curr == 0 ||
799 dl_time_before(deadline, dl_rq->earliest_dl.curr)) { 788 dl_time_before(deadline, dl_rq->earliest_dl.curr)) {
800 dl_rq->earliest_dl.curr = deadline; 789 dl_rq->earliest_dl.curr = deadline;
801 cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1); 790 cpudl_set(&rq->rd->cpudl, rq->cpu, deadline);
802 } 791 }
803} 792}
804 793
@@ -813,14 +802,14 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
813 if (!dl_rq->dl_nr_running) { 802 if (!dl_rq->dl_nr_running) {
814 dl_rq->earliest_dl.curr = 0; 803 dl_rq->earliest_dl.curr = 0;
815 dl_rq->earliest_dl.next = 0; 804 dl_rq->earliest_dl.next = 0;
816 cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); 805 cpudl_clear(&rq->rd->cpudl, rq->cpu);
817 } else { 806 } else {
818 struct rb_node *leftmost = dl_rq->rb_leftmost; 807 struct rb_node *leftmost = dl_rq->rb_leftmost;
819 struct sched_dl_entity *entry; 808 struct sched_dl_entity *entry;
820 809
821 entry = rb_entry(leftmost, struct sched_dl_entity, rb_node); 810 entry = rb_entry(leftmost, struct sched_dl_entity, rb_node);
822 dl_rq->earliest_dl.curr = entry->deadline; 811 dl_rq->earliest_dl.curr = entry->deadline;
823 cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline, 1); 812 cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline);
824 } 813 }
825} 814}
826 815
@@ -1671,7 +1660,7 @@ static void rq_online_dl(struct rq *rq)
1671 1660
1672 cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu); 1661 cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu);
1673 if (rq->dl.dl_nr_running > 0) 1662 if (rq->dl.dl_nr_running > 0)
1674 cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1); 1663 cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr);
1675} 1664}
1676 1665
1677/* Assumes rq->lock is held */ 1666/* Assumes rq->lock is held */
@@ -1680,7 +1669,7 @@ static void rq_offline_dl(struct rq *rq)
1680 if (rq->dl.overloaded) 1669 if (rq->dl.overloaded)
1681 dl_clear_overload(rq); 1670 dl_clear_overload(rq);
1682 1671
1683 cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); 1672 cpudl_clear(&rq->rd->cpudl, rq->cpu);
1684 cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu); 1673 cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu);
1685} 1674}
1686 1675
@@ -1723,10 +1712,20 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
1723 */ 1712 */
1724static void switched_to_dl(struct rq *rq, struct task_struct *p) 1713static void switched_to_dl(struct rq *rq, struct task_struct *p)
1725{ 1714{
1715
1716 /* If p is not queued we will update its parameters at next wakeup. */
1717 if (!task_on_rq_queued(p))
1718 return;
1719
1720 /*
1721 * If p is boosted we already updated its params in
1722 * rt_mutex_setprio()->enqueue_task(..., ENQUEUE_REPLENISH),
1723 * p's deadline being now already after rq_clock(rq).
1724 */
1726 if (dl_time_before(p->dl.deadline, rq_clock(rq))) 1725 if (dl_time_before(p->dl.deadline, rq_clock(rq)))
1727 setup_new_dl_entity(&p->dl, &p->dl); 1726 setup_new_dl_entity(&p->dl);
1728 1727
1729 if (task_on_rq_queued(p) && rq->curr != p) { 1728 if (rq->curr != p) {
1730#ifdef CONFIG_SMP 1729#ifdef CONFIG_SMP
1731 if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded) 1730 if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded)
1732 queue_push_tasks(rq); 1731 queue_push_tasks(rq);
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 2a0a9995256d..fa178b62ea79 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -369,8 +369,12 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
369 369
370#define P(F) \ 370#define P(F) \
371 SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) 371 SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
372#define P_SCHEDSTAT(F) \
373 SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F))
372#define PN(F) \ 374#define PN(F) \
373 SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) 375 SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
376#define PN_SCHEDSTAT(F) \
377 SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F)))
374 378
375 if (!se) 379 if (!se)
376 return; 380 return;
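The new P_SCHEDSTAT()/PN_SCHEDSTAT() helpers lean on schedstat_val(), which this series defines so that the field access disappears entirely when schedstats are compiled out; that is what allows the explicit #ifdef CONFIG_SCHEDSTATS blocks below to be dropped. A simplified stand-alone sketch of that compile-away pattern (my own macro names, not the kernel's exact definitions):

#include <stdio.h>

#define MY_SCHEDSTATS 1			/* mimic CONFIG_SCHEDSTATS */

#if MY_SCHEDSTATS
#define my_schedstat_val(var)	(var)
#define my_schedstat_inc(var)	((var)++)
#else
/* the argument is never evaluated, so the field need not even exist */
#define my_schedstat_val(var)	0
#define my_schedstat_inc(var)	do { } while (0)
#endif

struct stats { unsigned long wait_count; };

int main(void)
{
	struct stats s = { 0 };

	my_schedstat_inc(s.wait_count);
	printf("wait_count: %lu\n", (unsigned long)my_schedstat_val(s.wait_count));
	return 0;
}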
@@ -378,26 +382,27 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
378 PN(se->exec_start); 382 PN(se->exec_start);
379 PN(se->vruntime); 383 PN(se->vruntime);
380 PN(se->sum_exec_runtime); 384 PN(se->sum_exec_runtime);
381#ifdef CONFIG_SCHEDSTATS
382 if (schedstat_enabled()) { 385 if (schedstat_enabled()) {
383 PN(se->statistics.wait_start); 386 PN_SCHEDSTAT(se->statistics.wait_start);
384 PN(se->statistics.sleep_start); 387 PN_SCHEDSTAT(se->statistics.sleep_start);
385 PN(se->statistics.block_start); 388 PN_SCHEDSTAT(se->statistics.block_start);
386 PN(se->statistics.sleep_max); 389 PN_SCHEDSTAT(se->statistics.sleep_max);
387 PN(se->statistics.block_max); 390 PN_SCHEDSTAT(se->statistics.block_max);
388 PN(se->statistics.exec_max); 391 PN_SCHEDSTAT(se->statistics.exec_max);
389 PN(se->statistics.slice_max); 392 PN_SCHEDSTAT(se->statistics.slice_max);
390 PN(se->statistics.wait_max); 393 PN_SCHEDSTAT(se->statistics.wait_max);
391 PN(se->statistics.wait_sum); 394 PN_SCHEDSTAT(se->statistics.wait_sum);
392 P(se->statistics.wait_count); 395 P_SCHEDSTAT(se->statistics.wait_count);
393 } 396 }
394#endif
395 P(se->load.weight); 397 P(se->load.weight);
396#ifdef CONFIG_SMP 398#ifdef CONFIG_SMP
397 P(se->avg.load_avg); 399 P(se->avg.load_avg);
398 P(se->avg.util_avg); 400 P(se->avg.util_avg);
399#endif 401#endif
402
403#undef PN_SCHEDSTAT
400#undef PN 404#undef PN
405#undef P_SCHEDSTAT
401#undef P 406#undef P
402} 407}
403#endif 408#endif
@@ -410,7 +415,8 @@ static char *task_group_path(struct task_group *tg)
410 if (autogroup_path(tg, group_path, PATH_MAX)) 415 if (autogroup_path(tg, group_path, PATH_MAX))
411 return group_path; 416 return group_path;
412 417
413 return cgroup_path(tg->css.cgroup, group_path, PATH_MAX); 418 cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
419 return group_path;
414} 420}
415#endif 421#endif
416 422
@@ -429,9 +435,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
429 p->prio); 435 p->prio);
430 436
431 SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", 437 SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
432 SPLIT_NS(schedstat_val(p, se.statistics.wait_sum)), 438 SPLIT_NS(schedstat_val_or_zero(p->se.statistics.wait_sum)),
433 SPLIT_NS(p->se.sum_exec_runtime), 439 SPLIT_NS(p->se.sum_exec_runtime),
434 SPLIT_NS(schedstat_val(p, se.statistics.sum_sleep_runtime))); 440 SPLIT_NS(schedstat_val_or_zero(p->se.statistics.sum_sleep_runtime)));
435 441
436#ifdef CONFIG_NUMA_BALANCING 442#ifdef CONFIG_NUMA_BALANCING
437 SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); 443 SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p));
@@ -626,9 +632,7 @@ do { \
626#undef P64 632#undef P64
627#endif 633#endif
628 634
629#ifdef CONFIG_SCHEDSTATS 635#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, schedstat_val(rq->n));
630#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
631
632 if (schedstat_enabled()) { 636 if (schedstat_enabled()) {
633 P(yld_count); 637 P(yld_count);
634 P(sched_count); 638 P(sched_count);
@@ -636,9 +640,8 @@ do { \
636 P(ttwu_count); 640 P(ttwu_count);
637 P(ttwu_local); 641 P(ttwu_local);
638 } 642 }
639
640#undef P 643#undef P
641#endif 644
642 spin_lock_irqsave(&sched_debug_lock, flags); 645 spin_lock_irqsave(&sched_debug_lock, flags);
643 print_cfs_stats(m, cpu); 646 print_cfs_stats(m, cpu);
644 print_rt_stats(m, cpu); 647 print_rt_stats(m, cpu);
@@ -868,10 +871,14 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
868 SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) 871 SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
869#define P(F) \ 872#define P(F) \
870 SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) 873 SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
874#define P_SCHEDSTAT(F) \
875 SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)schedstat_val(p->F))
871#define __PN(F) \ 876#define __PN(F) \
872 SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) 877 SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
873#define PN(F) \ 878#define PN(F) \
874 SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) 879 SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
880#define PN_SCHEDSTAT(F) \
881 SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(p->F)))
875 882
876 PN(se.exec_start); 883 PN(se.exec_start);
877 PN(se.vruntime); 884 PN(se.vruntime);
@@ -881,37 +888,36 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
881 888
882 P(se.nr_migrations); 889 P(se.nr_migrations);
883 890
884#ifdef CONFIG_SCHEDSTATS
885 if (schedstat_enabled()) { 891 if (schedstat_enabled()) {
886 u64 avg_atom, avg_per_cpu; 892 u64 avg_atom, avg_per_cpu;
887 893
888 PN(se.statistics.sum_sleep_runtime); 894 PN_SCHEDSTAT(se.statistics.sum_sleep_runtime);
889 PN(se.statistics.wait_start); 895 PN_SCHEDSTAT(se.statistics.wait_start);
890 PN(se.statistics.sleep_start); 896 PN_SCHEDSTAT(se.statistics.sleep_start);
891 PN(se.statistics.block_start); 897 PN_SCHEDSTAT(se.statistics.block_start);
892 PN(se.statistics.sleep_max); 898 PN_SCHEDSTAT(se.statistics.sleep_max);
893 PN(se.statistics.block_max); 899 PN_SCHEDSTAT(se.statistics.block_max);
894 PN(se.statistics.exec_max); 900 PN_SCHEDSTAT(se.statistics.exec_max);
895 PN(se.statistics.slice_max); 901 PN_SCHEDSTAT(se.statistics.slice_max);
896 PN(se.statistics.wait_max); 902 PN_SCHEDSTAT(se.statistics.wait_max);
897 PN(se.statistics.wait_sum); 903 PN_SCHEDSTAT(se.statistics.wait_sum);
898 P(se.statistics.wait_count); 904 P_SCHEDSTAT(se.statistics.wait_count);
899 PN(se.statistics.iowait_sum); 905 PN_SCHEDSTAT(se.statistics.iowait_sum);
900 P(se.statistics.iowait_count); 906 P_SCHEDSTAT(se.statistics.iowait_count);
901 P(se.statistics.nr_migrations_cold); 907 P_SCHEDSTAT(se.statistics.nr_migrations_cold);
902 P(se.statistics.nr_failed_migrations_affine); 908 P_SCHEDSTAT(se.statistics.nr_failed_migrations_affine);
903 P(se.statistics.nr_failed_migrations_running); 909 P_SCHEDSTAT(se.statistics.nr_failed_migrations_running);
904 P(se.statistics.nr_failed_migrations_hot); 910 P_SCHEDSTAT(se.statistics.nr_failed_migrations_hot);
905 P(se.statistics.nr_forced_migrations); 911 P_SCHEDSTAT(se.statistics.nr_forced_migrations);
906 P(se.statistics.nr_wakeups); 912 P_SCHEDSTAT(se.statistics.nr_wakeups);
907 P(se.statistics.nr_wakeups_sync); 913 P_SCHEDSTAT(se.statistics.nr_wakeups_sync);
908 P(se.statistics.nr_wakeups_migrate); 914 P_SCHEDSTAT(se.statistics.nr_wakeups_migrate);
909 P(se.statistics.nr_wakeups_local); 915 P_SCHEDSTAT(se.statistics.nr_wakeups_local);
910 P(se.statistics.nr_wakeups_remote); 916 P_SCHEDSTAT(se.statistics.nr_wakeups_remote);
911 P(se.statistics.nr_wakeups_affine); 917 P_SCHEDSTAT(se.statistics.nr_wakeups_affine);
912 P(se.statistics.nr_wakeups_affine_attempts); 918 P_SCHEDSTAT(se.statistics.nr_wakeups_affine_attempts);
913 P(se.statistics.nr_wakeups_passive); 919 P_SCHEDSTAT(se.statistics.nr_wakeups_passive);
914 P(se.statistics.nr_wakeups_idle); 920 P_SCHEDSTAT(se.statistics.nr_wakeups_idle);
915 921
916 avg_atom = p->se.sum_exec_runtime; 922 avg_atom = p->se.sum_exec_runtime;
917 if (nr_switches) 923 if (nr_switches)
@@ -930,7 +936,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
930 __PN(avg_atom); 936 __PN(avg_atom);
931 __PN(avg_per_cpu); 937 __PN(avg_per_cpu);
932 } 938 }
933#endif 939
934 __P(nr_switches); 940 __P(nr_switches);
935 SEQ_printf(m, "%-45s:%21Ld\n", 941 SEQ_printf(m, "%-45s:%21Ld\n",
936 "nr_voluntary_switches", (long long)p->nvcsw); 942 "nr_voluntary_switches", (long long)p->nvcsw);
@@ -947,8 +953,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
947#endif 953#endif
948 P(policy); 954 P(policy);
949 P(prio); 955 P(prio);
956#undef PN_SCHEDSTAT
950#undef PN 957#undef PN
951#undef __PN 958#undef __PN
959#undef P_SCHEDSTAT
952#undef P 960#undef P
953#undef __P 961#undef __P
954 962
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 039de34f1521..c242944f5cbd 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -114,6 +114,12 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
114unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; 114unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
115#endif 115#endif
116 116
117/*
118 * The margin used when comparing utilization with CPU capacity:
119 * util * 1024 < capacity * margin
120 */
121unsigned int capacity_margin = 1280; /* ~20% */
122
117static inline void update_load_add(struct load_weight *lw, unsigned long inc) 123static inline void update_load_add(struct load_weight *lw, unsigned long inc)
118{ 124{
119 lw->weight += inc; 125 lw->weight += inc;
@@ -256,9 +262,7 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
256 262
257static inline struct task_struct *task_of(struct sched_entity *se) 263static inline struct task_struct *task_of(struct sched_entity *se)
258{ 264{
259#ifdef CONFIG_SCHED_DEBUG 265 SCHED_WARN_ON(!entity_is_task(se));
260 WARN_ON_ONCE(!entity_is_task(se));
261#endif
262 return container_of(se, struct task_struct, se); 266 return container_of(se, struct task_struct, se);
263} 267}
264 268
@@ -456,17 +460,23 @@ static inline int entity_before(struct sched_entity *a,
456 460
457static void update_min_vruntime(struct cfs_rq *cfs_rq) 461static void update_min_vruntime(struct cfs_rq *cfs_rq)
458{ 462{
463 struct sched_entity *curr = cfs_rq->curr;
464
459 u64 vruntime = cfs_rq->min_vruntime; 465 u64 vruntime = cfs_rq->min_vruntime;
460 466
461 if (cfs_rq->curr) 467 if (curr) {
462 vruntime = cfs_rq->curr->vruntime; 468 if (curr->on_rq)
469 vruntime = curr->vruntime;
470 else
471 curr = NULL;
472 }
463 473
464 if (cfs_rq->rb_leftmost) { 474 if (cfs_rq->rb_leftmost) {
465 struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost, 475 struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
466 struct sched_entity, 476 struct sched_entity,
467 run_node); 477 run_node);
468 478
469 if (!cfs_rq->curr) 479 if (!curr)
470 vruntime = se->vruntime; 480 vruntime = se->vruntime;
471 else 481 else
472 vruntime = min_vruntime(vruntime, se->vruntime); 482 vruntime = min_vruntime(vruntime, se->vruntime);
@@ -656,7 +666,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
656} 666}
657 667
658#ifdef CONFIG_SMP 668#ifdef CONFIG_SMP
659static int select_idle_sibling(struct task_struct *p, int cpu); 669static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
660static unsigned long task_h_load(struct task_struct *p); 670static unsigned long task_h_load(struct task_struct *p);
661 671
662/* 672/*
@@ -680,7 +690,14 @@ void init_entity_runnable_average(struct sched_entity *se)
680 * will definitely be updated (after enqueue). 690 * will definitely be updated (after enqueue).
681 */ 691 */
682 sa->period_contrib = 1023; 692 sa->period_contrib = 1023;
683 sa->load_avg = scale_load_down(se->load.weight); 693 /*
694 * Tasks are initialized with full load to be seen as heavy tasks until
695 * they get a chance to stabilize to their real load level.
696 * Group entities are initialized with zero load to reflect the fact that
697 * nothing has been attached to the task group yet.
698 */
699 if (entity_is_task(se))
700 sa->load_avg = scale_load_down(se->load.weight);
684 sa->load_sum = sa->load_avg * LOAD_AVG_MAX; 701 sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
685 /* 702 /*
686 * At this point, util_avg won't be used in select_task_rq_fair anyway 703 * At this point, util_avg won't be used in select_task_rq_fair anyway
@@ -726,7 +743,6 @@ void post_init_entity_util_avg(struct sched_entity *se)
726 struct sched_avg *sa = &se->avg; 743 struct sched_avg *sa = &se->avg;
727 long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; 744 long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
728 u64 now = cfs_rq_clock_task(cfs_rq); 745 u64 now = cfs_rq_clock_task(cfs_rq);
729 int tg_update;
730 746
731 if (cap > 0) { 747 if (cap > 0) {
732 if (cfs_rq->avg.util_avg != 0) { 748 if (cfs_rq->avg.util_avg != 0) {
@@ -759,10 +775,9 @@ void post_init_entity_util_avg(struct sched_entity *se)
759 } 775 }
760 } 776 }
761 777
762 tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); 778 update_cfs_rq_load_avg(now, cfs_rq, false);
763 attach_entity_load_avg(cfs_rq, se); 779 attach_entity_load_avg(cfs_rq, se);
764 if (tg_update) 780 update_tg_load_avg(cfs_rq, false);
765 update_tg_load_avg(cfs_rq, false);
766} 781}
767 782
768#else /* !CONFIG_SMP */ 783#else /* !CONFIG_SMP */
@@ -799,7 +814,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
799 max(delta_exec, curr->statistics.exec_max)); 814 max(delta_exec, curr->statistics.exec_max));
800 815
801 curr->sum_exec_runtime += delta_exec; 816 curr->sum_exec_runtime += delta_exec;
802 schedstat_add(cfs_rq, exec_clock, delta_exec); 817 schedstat_add(cfs_rq->exec_clock, delta_exec);
803 818
804 curr->vruntime += calc_delta_fair(delta_exec, curr); 819 curr->vruntime += calc_delta_fair(delta_exec, curr);
805 update_min_vruntime(cfs_rq); 820 update_min_vruntime(cfs_rq);
@@ -820,26 +835,34 @@ static void update_curr_fair(struct rq *rq)
820 update_curr(cfs_rq_of(&rq->curr->se)); 835 update_curr(cfs_rq_of(&rq->curr->se));
821} 836}
822 837
823#ifdef CONFIG_SCHEDSTATS
824static inline void 838static inline void
825update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) 839update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
826{ 840{
827 u64 wait_start = rq_clock(rq_of(cfs_rq)); 841 u64 wait_start, prev_wait_start;
842
843 if (!schedstat_enabled())
844 return;
845
846 wait_start = rq_clock(rq_of(cfs_rq));
847 prev_wait_start = schedstat_val(se->statistics.wait_start);
828 848
829 if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) && 849 if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
830 likely(wait_start > se->statistics.wait_start)) 850 likely(wait_start > prev_wait_start))
831 wait_start -= se->statistics.wait_start; 851 wait_start -= prev_wait_start;
832 852
833 se->statistics.wait_start = wait_start; 853 schedstat_set(se->statistics.wait_start, wait_start);
834} 854}
835 855
836static void 856static inline void
837update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) 857update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
838{ 858{
839 struct task_struct *p; 859 struct task_struct *p;
840 u64 delta; 860 u64 delta;
841 861
842 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start; 862 if (!schedstat_enabled())
863 return;
864
865 delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);
843 866
844 if (entity_is_task(se)) { 867 if (entity_is_task(se)) {
845 p = task_of(se); 868 p = task_of(se);
@@ -849,35 +872,114 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
849 * time stamp can be adjusted to accumulate wait time 872 * time stamp can be adjusted to accumulate wait time
850 * prior to migration. 873 * prior to migration.
851 */ 874 */
852 se->statistics.wait_start = delta; 875 schedstat_set(se->statistics.wait_start, delta);
853 return; 876 return;
854 } 877 }
855 trace_sched_stat_wait(p, delta); 878 trace_sched_stat_wait(p, delta);
856 } 879 }
857 880
858 se->statistics.wait_max = max(se->statistics.wait_max, delta); 881 schedstat_set(se->statistics.wait_max,
859 se->statistics.wait_count++; 882 max(schedstat_val(se->statistics.wait_max), delta));
860 se->statistics.wait_sum += delta; 883 schedstat_inc(se->statistics.wait_count);
861 se->statistics.wait_start = 0; 884 schedstat_add(se->statistics.wait_sum, delta);
885 schedstat_set(se->statistics.wait_start, 0);
886}
887
888static inline void
889update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
890{
891 struct task_struct *tsk = NULL;
892 u64 sleep_start, block_start;
893
894 if (!schedstat_enabled())
895 return;
896
897 sleep_start = schedstat_val(se->statistics.sleep_start);
898 block_start = schedstat_val(se->statistics.block_start);
899
900 if (entity_is_task(se))
901 tsk = task_of(se);
902
903 if (sleep_start) {
904 u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start;
905
906 if ((s64)delta < 0)
907 delta = 0;
908
909 if (unlikely(delta > schedstat_val(se->statistics.sleep_max)))
910 schedstat_set(se->statistics.sleep_max, delta);
911
912 schedstat_set(se->statistics.sleep_start, 0);
913 schedstat_add(se->statistics.sum_sleep_runtime, delta);
914
915 if (tsk) {
916 account_scheduler_latency(tsk, delta >> 10, 1);
917 trace_sched_stat_sleep(tsk, delta);
918 }
919 }
920 if (block_start) {
921 u64 delta = rq_clock(rq_of(cfs_rq)) - block_start;
922
923 if ((s64)delta < 0)
924 delta = 0;
925
926 if (unlikely(delta > schedstat_val(se->statistics.block_max)))
927 schedstat_set(se->statistics.block_max, delta);
928
929 schedstat_set(se->statistics.block_start, 0);
930 schedstat_add(se->statistics.sum_sleep_runtime, delta);
931
932 if (tsk) {
933 if (tsk->in_iowait) {
934 schedstat_add(se->statistics.iowait_sum, delta);
935 schedstat_inc(se->statistics.iowait_count);
936 trace_sched_stat_iowait(tsk, delta);
937 }
938
939 trace_sched_stat_blocked(tsk, delta);
940
941 /*
942 * Blocking time is in units of nanosecs, so shift by
943 * 20 to get a milliseconds-range estimation of the
944 * amount of time that the task spent sleeping:
945 */
946 if (unlikely(prof_on == SLEEP_PROFILING)) {
947 profile_hits(SLEEP_PROFILING,
948 (void *)get_wchan(tsk),
949 delta >> 20);
950 }
951 account_scheduler_latency(tsk, delta >> 10, 0);
952 }
953 }
862} 954}
863 955
864/* 956/*
865 * Task is being enqueued - update stats: 957 * Task is being enqueued - update stats:
866 */ 958 */
867static inline void 959static inline void
868update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) 960update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
869{ 961{
962 if (!schedstat_enabled())
963 return;
964
870 /* 965 /*
871 * Are we enqueueing a waiting task? (for current tasks 966 * Are we enqueueing a waiting task? (for current tasks
872 * a dequeue/enqueue event is a NOP) 967 * a dequeue/enqueue event is a NOP)
873 */ 968 */
874 if (se != cfs_rq->curr) 969 if (se != cfs_rq->curr)
875 update_stats_wait_start(cfs_rq, se); 970 update_stats_wait_start(cfs_rq, se);
971
972 if (flags & ENQUEUE_WAKEUP)
973 update_stats_enqueue_sleeper(cfs_rq, se);
876} 974}
877 975
878static inline void 976static inline void
879update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 977update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
880{ 978{
979
980 if (!schedstat_enabled())
981 return;
982
881 /* 983 /*
882 * Mark the end of the wait period if dequeueing a 984 * Mark the end of the wait period if dequeueing a
883 * waiting task: 985 * waiting task:
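The shifts in update_stats_enqueue_sleeper() above are cheap approximate unit conversions: delta >> 10 divides the nanosecond delta by 1024 (roughly microseconds, the value handed to account_scheduler_latency()), and delta >> 20 divides by 1048576 (roughly milliseconds, good enough for the SLEEP_PROFILING buckets, as the comment in the hunk notes). A tiny stand-alone check of how close the approximation is:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t delta_ns = 3500000000ull;	/* a 3.5 second sleep, in ns */

	printf("us: %llu exact, %llu via >>10\n",
	       (unsigned long long)(delta_ns / 1000),
	       (unsigned long long)(delta_ns >> 10));
	printf("ms: %llu exact, %llu via >>20\n",
	       (unsigned long long)(delta_ns / 1000000),
	       (unsigned long long)(delta_ns >> 20));
	return 0;
}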
@@ -885,40 +987,18 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
885 if (se != cfs_rq->curr) 987 if (se != cfs_rq->curr)
886 update_stats_wait_end(cfs_rq, se); 988 update_stats_wait_end(cfs_rq, se);
887 989
888 if (flags & DEQUEUE_SLEEP) { 990 if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
889 if (entity_is_task(se)) { 991 struct task_struct *tsk = task_of(se);
890 struct task_struct *tsk = task_of(se);
891 992
892 if (tsk->state & TASK_INTERRUPTIBLE) 993 if (tsk->state & TASK_INTERRUPTIBLE)
893 se->statistics.sleep_start = rq_clock(rq_of(cfs_rq)); 994 schedstat_set(se->statistics.sleep_start,
894 if (tsk->state & TASK_UNINTERRUPTIBLE) 995 rq_clock(rq_of(cfs_rq)));
895 se->statistics.block_start = rq_clock(rq_of(cfs_rq)); 996 if (tsk->state & TASK_UNINTERRUPTIBLE)
896 } 997 schedstat_set(se->statistics.block_start,
998 rq_clock(rq_of(cfs_rq)));
897 } 999 }
898
899}
900#else
901static inline void
902update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
903{
904} 1000}
905 1001
906static inline void
907update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
908{
909}
910
911static inline void
912update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
913{
914}
915
916static inline void
917update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
918{
919}
920#endif
921
922/* 1002/*
923 * We are picking a new current task - update its stats: 1003 * We are picking a new current task - update its stats:
924 */ 1004 */
@@ -1513,8 +1593,16 @@ balance:
1513 * One idle CPU per node is evaluated for a task numa move. 1593 * One idle CPU per node is evaluated for a task numa move.
1514 * Call select_idle_sibling to maybe find a better one. 1594 * Call select_idle_sibling to maybe find a better one.
1515 */ 1595 */
1516 if (!cur) 1596 if (!cur) {
1517 env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); 1597 /*
1598 * select_idle_sibling() uses a per-cpu cpumask that
1599 * can be used from IRQ context.
1600 */
1601 local_irq_disable();
1602 env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
1603 env->dst_cpu);
1604 local_irq_enable();
1605 }
1518 1606
1519assign: 1607assign:
1520 task_numa_assign(env, cur, imp); 1608 task_numa_assign(env, cur, imp);
@@ -2292,7 +2380,7 @@ void task_numa_work(struct callback_head *work)
2292 unsigned long nr_pte_updates = 0; 2380 unsigned long nr_pte_updates = 0;
2293 long pages, virtpages; 2381 long pages, virtpages;
2294 2382
2295 WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); 2383 SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
2296 2384
2297 work->next = work; /* protect against double add */ 2385 work->next = work; /* protect against double add */
2298 /* 2386 /*
@@ -2803,9 +2891,21 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
2803} 2891}
2804 2892
2805#ifdef CONFIG_FAIR_GROUP_SCHED 2893#ifdef CONFIG_FAIR_GROUP_SCHED
2806/* 2894/**
2807 * Updating tg's load_avg is necessary before update_cfs_share (which is done) 2895 * update_tg_load_avg - update the tg's load avg
2808 * and effective_load (which is not done because it is too costly). 2896 * @cfs_rq: the cfs_rq whose avg changed
2897 * @force: update regardless of how small the difference
2898 *
2899 * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
2900 * However, because tg->load_avg is a global value there are performance
2901 * considerations.
2902 *
2903 * In order to avoid having to look at the other cfs_rq's, we use a
2904 * differential update where we store the last value we propagated. This in
2905 * turn allows skipping updates if the differential is 'small'.
2906 *
2907 * Updating tg's load_avg is necessary before update_cfs_share() (which is
2908 * done) and effective_load() (which is not done because it is too costly).
2809 */ 2909 */
2810static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) 2910static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
2811{ 2911{
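The kernel-doc above describes a differential update: each cfs_rq remembers the last value it contributed to the shared tg->load_avg and only touches that global (and therefore cache-contended) counter once its own average has drifted far enough from that contribution. A minimal sketch of the pattern, with illustrative field names and a one-sixty-fourth-of-the-last-contribution cutoff of the same flavor as the kernel's:

#include <stdint.h>

struct group {
	int64_t load_avg;		/* shared across CPUs, expensive to bounce */
};

struct queue {
	struct group *tg;
	int64_t avg_load;		/* this queue's current load average */
	int64_t tg_load_avg_contrib;	/* last value propagated to tg->load_avg */
};

/* propagate only when the change is big enough to matter */
static void update_group_load_avg(struct queue *q, int force)
{
	int64_t delta = q->avg_load - q->tg_load_avg_contrib;
	int64_t cutoff = q->tg_load_avg_contrib / 64;

	if (force || delta > cutoff || delta < -cutoff) {
		q->tg->load_avg += delta;	/* an atomic add in the kernel */
		q->tg_load_avg_contrib = q->avg_load;
	}
}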
@@ -2875,12 +2975,7 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
2875 2975
2876static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) 2976static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
2877{ 2977{
2878 struct rq *rq = rq_of(cfs_rq); 2978 if (&this_rq()->cfs == cfs_rq) {
2879 int cpu = cpu_of(rq);
2880
2881 if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) {
2882 unsigned long max = rq->cpu_capacity_orig;
2883
2884 /* 2979 /*
2885 * There are a few boundary cases this might miss but it should 2980 * There are a few boundary cases this might miss but it should
2886 * get called often enough that that should (hopefully) not be 2981 * get called often enough that that should (hopefully) not be
@@ -2897,8 +2992,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
2897 * 2992 *
2898 * See cpu_util(). 2993 * See cpu_util().
2899 */ 2994 */
2900 cpufreq_update_util(rq_clock(rq), 2995 cpufreq_update_util(rq_of(cfs_rq), 0);
2901 min(cfs_rq->avg.util_avg, max), max);
2902 } 2996 }
2903} 2997}
2904 2998
@@ -2931,10 +3025,10 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
2931 * 3025 *
2932 * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. 3026 * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
2933 * 3027 *
2934 * Returns true if the load decayed or we removed utilization. It is expected 3028 * Returns true if the load decayed or we removed load.
2935 * that one calls update_tg_load_avg() on this condition, but after you've 3029 *
2936 * modified the cfs_rq avg (attach/detach), such that we propagate the new 3030 * Since both these conditions indicate a changed cfs_rq->avg.load we should
2937 * avg up. 3031 * call update_tg_load_avg() when this function returns true.
2938 */ 3032 */
2939static inline int 3033static inline int
2940update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) 3034update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
@@ -3159,10 +3253,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
3159 3253
3160static inline void update_load_avg(struct sched_entity *se, int not_used) 3254static inline void update_load_avg(struct sched_entity *se, int not_used)
3161{ 3255{
3162 struct cfs_rq *cfs_rq = cfs_rq_of(se); 3256 cpufreq_update_util(rq_of(cfs_rq_of(se)), 0);
3163 struct rq *rq = rq_of(cfs_rq);
3164
3165 cpufreq_trigger_update(rq_clock(rq));
3166} 3257}
3167 3258
3168static inline void 3259static inline void
@@ -3183,68 +3274,6 @@ static inline int idle_balance(struct rq *rq)
3183 3274
3184#endif /* CONFIG_SMP */ 3275#endif /* CONFIG_SMP */
3185 3276
3186static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
3187{
3188#ifdef CONFIG_SCHEDSTATS
3189 struct task_struct *tsk = NULL;
3190
3191 if (entity_is_task(se))
3192 tsk = task_of(se);
3193
3194 if (se->statistics.sleep_start) {
3195 u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;
3196
3197 if ((s64)delta < 0)
3198 delta = 0;
3199
3200 if (unlikely(delta > se->statistics.sleep_max))
3201 se->statistics.sleep_max = delta;
3202
3203 se->statistics.sleep_start = 0;
3204 se->statistics.sum_sleep_runtime += delta;
3205
3206 if (tsk) {
3207 account_scheduler_latency(tsk, delta >> 10, 1);
3208 trace_sched_stat_sleep(tsk, delta);
3209 }
3210 }
3211 if (se->statistics.block_start) {
3212 u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;
3213
3214 if ((s64)delta < 0)
3215 delta = 0;
3216
3217 if (unlikely(delta > se->statistics.block_max))
3218 se->statistics.block_max = delta;
3219
3220 se->statistics.block_start = 0;
3221 se->statistics.sum_sleep_runtime += delta;
3222
3223 if (tsk) {
3224 if (tsk->in_iowait) {
3225 se->statistics.iowait_sum += delta;
3226 se->statistics.iowait_count++;
3227 trace_sched_stat_iowait(tsk, delta);
3228 }
3229
3230 trace_sched_stat_blocked(tsk, delta);
3231
3232 /*
3233 * Blocking time is in units of nanosecs, so shift by
3234 * 20 to get a milliseconds-range estimation of the
3235 * amount of time that the task spent sleeping:
3236 */
3237 if (unlikely(prof_on == SLEEP_PROFILING)) {
3238 profile_hits(SLEEP_PROFILING,
3239 (void *)get_wchan(tsk),
3240 delta >> 20);
3241 }
3242 account_scheduler_latency(tsk, delta >> 10, 0);
3243 }
3244 }
3245#endif
3246}
3247
3248static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) 3277static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
3249{ 3278{
3250#ifdef CONFIG_SCHED_DEBUG 3279#ifdef CONFIG_SCHED_DEBUG
@@ -3254,7 +3283,7 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
3254 d = -d; 3283 d = -d;
3255 3284
3256 if (d > 3*sysctl_sched_latency) 3285 if (d > 3*sysctl_sched_latency)
3257 schedstat_inc(cfs_rq, nr_spread_over); 3286 schedstat_inc(cfs_rq->nr_spread_over);
3258#endif 3287#endif
3259} 3288}
3260 3289
@@ -3371,17 +3400,12 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3371 account_entity_enqueue(cfs_rq, se); 3400 account_entity_enqueue(cfs_rq, se);
3372 update_cfs_shares(cfs_rq); 3401 update_cfs_shares(cfs_rq);
3373 3402
3374 if (flags & ENQUEUE_WAKEUP) { 3403 if (flags & ENQUEUE_WAKEUP)
3375 place_entity(cfs_rq, se, 0); 3404 place_entity(cfs_rq, se, 0);
3376 if (schedstat_enabled())
3377 enqueue_sleeper(cfs_rq, se);
3378 }
3379 3405
3380 check_schedstat_required(); 3406 check_schedstat_required();
3381 if (schedstat_enabled()) { 3407 update_stats_enqueue(cfs_rq, se, flags);
3382 update_stats_enqueue(cfs_rq, se); 3408 check_spread(cfs_rq, se);
3383 check_spread(cfs_rq, se);
3384 }
3385 if (!curr) 3409 if (!curr)
3386 __enqueue_entity(cfs_rq, se); 3410 __enqueue_entity(cfs_rq, se);
3387 se->on_rq = 1; 3411 se->on_rq = 1;
@@ -3448,8 +3472,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3448 update_curr(cfs_rq); 3472 update_curr(cfs_rq);
3449 dequeue_entity_load_avg(cfs_rq, se); 3473 dequeue_entity_load_avg(cfs_rq, se);
3450 3474
3451 if (schedstat_enabled()) 3475 update_stats_dequeue(cfs_rq, se, flags);
3452 update_stats_dequeue(cfs_rq, se, flags);
3453 3476
3454 clear_buddies(cfs_rq, se); 3477 clear_buddies(cfs_rq, se);
3455 3478
@@ -3459,9 +3482,10 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3459 account_entity_dequeue(cfs_rq, se); 3482 account_entity_dequeue(cfs_rq, se);
3460 3483
3461 /* 3484 /*
3462 * Normalize the entity after updating the min_vruntime because the 3485 * Normalize after update_curr(); which will also have moved
3463 * update can refer to the ->curr item and we need to reflect this 3486 * min_vruntime if @se is the one holding it back. But before doing
3464 * movement in our normalized position. 3487 * update_min_vruntime() again, which will discount @se's position and
3488 * can move min_vruntime forward still more.
3465 */ 3489 */
3466 if (!(flags & DEQUEUE_SLEEP)) 3490 if (!(flags & DEQUEUE_SLEEP))
3467 se->vruntime -= cfs_rq->min_vruntime; 3491 se->vruntime -= cfs_rq->min_vruntime;
@@ -3469,8 +3493,16 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3469 /* return excess runtime on last dequeue */ 3493 /* return excess runtime on last dequeue */
3470 return_cfs_rq_runtime(cfs_rq); 3494 return_cfs_rq_runtime(cfs_rq);
3471 3495
3472 update_min_vruntime(cfs_rq);
3473 update_cfs_shares(cfs_rq); 3496 update_cfs_shares(cfs_rq);
3497
3498 /*
3499 * Now advance min_vruntime if @se was the entity holding it back,
3500 * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
3501 * put back on, and if we advance min_vruntime, we'll be placed back
3502 * further than we started -- ie. we'll be penalized.
3503 */
3504 if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
3505 update_min_vruntime(cfs_rq);
3474} 3506}
3475 3507
3476/* 3508/*
@@ -3523,25 +3555,25 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
3523 * a CPU. So account for the time it spent waiting on the 3555 * a CPU. So account for the time it spent waiting on the
3524 * runqueue. 3556 * runqueue.
3525 */ 3557 */
3526 if (schedstat_enabled()) 3558 update_stats_wait_end(cfs_rq, se);
3527 update_stats_wait_end(cfs_rq, se);
3528 __dequeue_entity(cfs_rq, se); 3559 __dequeue_entity(cfs_rq, se);
3529 update_load_avg(se, 1); 3560 update_load_avg(se, 1);
3530 } 3561 }
3531 3562
3532 update_stats_curr_start(cfs_rq, se); 3563 update_stats_curr_start(cfs_rq, se);
3533 cfs_rq->curr = se; 3564 cfs_rq->curr = se;
3534#ifdef CONFIG_SCHEDSTATS 3565
3535 /* 3566 /*
3536 * Track our maximum slice length, if the CPU's load is at 3567 * Track our maximum slice length, if the CPU's load is at
3537 * least twice that of our own weight (i.e. dont track it 3568 * least twice that of our own weight (i.e. dont track it
3538 * when there are only lesser-weight tasks around): 3569 * when there are only lesser-weight tasks around):
3539 */ 3570 */
3540 if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { 3571 if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
3541 se->statistics.slice_max = max(se->statistics.slice_max, 3572 schedstat_set(se->statistics.slice_max,
3542 se->sum_exec_runtime - se->prev_sum_exec_runtime); 3573 max((u64)schedstat_val(se->statistics.slice_max),
3574 se->sum_exec_runtime - se->prev_sum_exec_runtime));
3543 } 3575 }
3544#endif 3576
3545 se->prev_sum_exec_runtime = se->sum_exec_runtime; 3577 se->prev_sum_exec_runtime = se->sum_exec_runtime;
3546} 3578}
3547 3579
@@ -3620,13 +3652,10 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
3620 /* throttle cfs_rqs exceeding runtime */ 3652 /* throttle cfs_rqs exceeding runtime */
3621 check_cfs_rq_runtime(cfs_rq); 3653 check_cfs_rq_runtime(cfs_rq);
3622 3654
3623 if (schedstat_enabled()) { 3655 check_spread(cfs_rq, prev);
3624 check_spread(cfs_rq, prev);
3625 if (prev->on_rq)
3626 update_stats_wait_start(cfs_rq, prev);
3627 }
3628 3656
3629 if (prev->on_rq) { 3657 if (prev->on_rq) {
3658 update_stats_wait_start(cfs_rq, prev);
3630 /* Put 'current' back into the tree. */ 3659 /* Put 'current' back into the tree. */
3631 __enqueue_entity(cfs_rq, prev); 3660 __enqueue_entity(cfs_rq, prev);
3632 /* in !on_rq case, update occurred at dequeue */ 3661 /* in !on_rq case, update occurred at dequeue */
@@ -4456,9 +4485,9 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
4456 struct sched_entity *se = &p->se; 4485 struct sched_entity *se = &p->se;
4457 struct cfs_rq *cfs_rq = cfs_rq_of(se); 4486 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4458 4487
4459 WARN_ON(task_rq(p) != rq); 4488 SCHED_WARN_ON(task_rq(p) != rq);
4460 4489
4461 if (cfs_rq->nr_running > 1) { 4490 if (rq->cfs.h_nr_running > 1) {
4462 u64 slice = sched_slice(cfs_rq, se); 4491 u64 slice = sched_slice(cfs_rq, se);
4463 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; 4492 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
4464 s64 delta = slice - ran; 4493 s64 delta = slice - ran;
@@ -4509,6 +4538,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
4509 struct cfs_rq *cfs_rq; 4538 struct cfs_rq *cfs_rq;
4510 struct sched_entity *se = &p->se; 4539 struct sched_entity *se = &p->se;
4511 4540
4541 /*
4542 * If in_iowait is set, the code below may not trigger any cpufreq
4543 * utilization updates, so do it here explicitly with the IOWAIT flag
4544 * passed.
4545 */
4546 if (p->in_iowait)
4547 cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT);
4548
4512 for_each_sched_entity(se) { 4549 for_each_sched_entity(se) {
4513 if (se->on_rq) 4550 if (se->on_rq)
4514 break; 4551 break;
@@ -4605,6 +4642,11 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
4605} 4642}
4606 4643
4607#ifdef CONFIG_SMP 4644#ifdef CONFIG_SMP
4645
4646/* Working cpumask for: load_balance, load_balance_newidle. */
4647DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
4648DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
4649
4608#ifdef CONFIG_NO_HZ_COMMON 4650#ifdef CONFIG_NO_HZ_COMMON
4609/* 4651/*
4610 * per rq 'load' arrray crap; XXX kill this. 4652 * per rq 'load' arrray crap; XXX kill this.
@@ -5006,9 +5048,9 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
5006 * wl = S * s'_i; see (2) 5048 * wl = S * s'_i; see (2)
5007 */ 5049 */
5008 if (W > 0 && w < W) 5050 if (W > 0 && w < W)
5009 wl = (w * (long)tg->shares) / W; 5051 wl = (w * (long)scale_load_down(tg->shares)) / W;
5010 else 5052 else
5011 wl = tg->shares; 5053 wl = scale_load_down(tg->shares);
5012 5054
5013 /* 5055 /*
5014 * Per the above, wl is the new se->load.weight value; since 5056 * Per the above, wl is the new se->load.weight value; since
@@ -5091,18 +5133,18 @@ static int wake_wide(struct task_struct *p)
5091 return 1; 5133 return 1;
5092} 5134}
5093 5135
5094static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) 5136static int wake_affine(struct sched_domain *sd, struct task_struct *p,
5137 int prev_cpu, int sync)
5095{ 5138{
5096 s64 this_load, load; 5139 s64 this_load, load;
5097 s64 this_eff_load, prev_eff_load; 5140 s64 this_eff_load, prev_eff_load;
5098 int idx, this_cpu, prev_cpu; 5141 int idx, this_cpu;
5099 struct task_group *tg; 5142 struct task_group *tg;
5100 unsigned long weight; 5143 unsigned long weight;
5101 int balanced; 5144 int balanced;
5102 5145
5103 idx = sd->wake_idx; 5146 idx = sd->wake_idx;
5104 this_cpu = smp_processor_id(); 5147 this_cpu = smp_processor_id();
5105 prev_cpu = task_cpu(p);
5106 load = source_load(prev_cpu, idx); 5148 load = source_load(prev_cpu, idx);
5107 this_load = target_load(this_cpu, idx); 5149 this_load = target_load(this_cpu, idx);
5108 5150
@@ -5146,13 +5188,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
5146 5188
5147 balanced = this_eff_load <= prev_eff_load; 5189 balanced = this_eff_load <= prev_eff_load;
5148 5190
5149 schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts); 5191 schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
5150 5192
5151 if (!balanced) 5193 if (!balanced)
5152 return 0; 5194 return 0;
5153 5195
5154 schedstat_inc(sd, ttwu_move_affine); 5196 schedstat_inc(sd->ttwu_move_affine);
5155 schedstat_inc(p, se.statistics.nr_wakeups_affine); 5197 schedstat_inc(p->se.statistics.nr_wakeups_affine);
5156 5198
5157 return 1; 5199 return 1;
5158} 5200}
@@ -5228,6 +5270,10 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
5228 int shallowest_idle_cpu = -1; 5270 int shallowest_idle_cpu = -1;
5229 int i; 5271 int i;
5230 5272
5273 /* Check if we have any choice: */
5274 if (group->group_weight == 1)
5275 return cpumask_first(sched_group_cpus(group));
5276
5231 /* Traverse only the allowed CPUs */ 5277 /* Traverse only the allowed CPUs */
5232 for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { 5278 for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
5233 if (idle_cpu(i)) { 5279 if (idle_cpu(i)) {
@@ -5265,64 +5311,242 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
5265} 5311}
5266 5312
5267/* 5313/*
5268 * Try and locate an idle CPU in the sched_domain. 5314 * Implement a for_each_cpu() variant that starts the scan at a given cpu
5315 * (@start), and wraps around.
5316 *
5317 * This is used to scan for idle CPUs; such that not all CPUs looking for an
5318 * idle CPU find the same CPU. The down-side is that tasks tend to cycle
5319 * through the LLC domain.
5320 *
5321 * tbench in particular was found to be sensitive to this.
5322 */
5323
5324static int cpumask_next_wrap(int n, const struct cpumask *mask, int start, int *wrapped)
5325{
5326 int next;
5327
5328again:
5329 next = find_next_bit(cpumask_bits(mask), nr_cpumask_bits, n+1);
5330
5331 if (*wrapped) {
5332 if (next >= start)
5333 return nr_cpumask_bits;
5334 } else {
5335 if (next >= nr_cpumask_bits) {
5336 *wrapped = 1;
5337 n = -1;
5338 goto again;
5339 }
5340 }
5341
5342 return next;
5343}
5344
5345#define for_each_cpu_wrap(cpu, mask, start, wrap) \
5346 for ((wrap) = 0, (cpu) = (start)-1; \
5347 (cpu) = cpumask_next_wrap((cpu), (mask), (start), &(wrap)), \
5348 (cpu) < nr_cpumask_bits; )
5349
5350#ifdef CONFIG_SCHED_SMT
5351
5352static inline void set_idle_cores(int cpu, int val)
5353{
5354 struct sched_domain_shared *sds;
5355
5356 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
5357 if (sds)
5358 WRITE_ONCE(sds->has_idle_cores, val);
5359}
5360
5361static inline bool test_idle_cores(int cpu, bool def)
5362{
5363 struct sched_domain_shared *sds;
5364
5365 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
5366 if (sds)
5367 return READ_ONCE(sds->has_idle_cores);
5368
5369 return def;
5370}
5371
5372/*
5373 * Scans the local SMT mask to see if the entire core is idle, and records this
5374 * information in sd_llc_shared->has_idle_cores.
5375 *
5376 * Since SMT siblings share all cache levels, inspecting this limited remote
5377 * state should be fairly cheap.
5378 */
5379void __update_idle_core(struct rq *rq)
5380{
5381 int core = cpu_of(rq);
5382 int cpu;
5383
5384 rcu_read_lock();
5385 if (test_idle_cores(core, true))
5386 goto unlock;
5387
5388 for_each_cpu(cpu, cpu_smt_mask(core)) {
5389 if (cpu == core)
5390 continue;
5391
5392 if (!idle_cpu(cpu))
5393 goto unlock;
5394 }
5395
5396 set_idle_cores(core, 1);
5397unlock:
5398 rcu_read_unlock();
5399}
5400
5401/*
5402 * Scan the entire LLC domain for idle cores; this dynamically switches off if
5403 * there are no idle cores left in the system; tracked through
5404 * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
5405 */
5406static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
5407{
5408 struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
5409 int core, cpu, wrap;
5410
5411 if (!static_branch_likely(&sched_smt_present))
5412 return -1;
5413
5414 if (!test_idle_cores(target, false))
5415 return -1;
5416
5417 cpumask_and(cpus, sched_domain_span(sd), tsk_cpus_allowed(p));
5418
5419 for_each_cpu_wrap(core, cpus, target, wrap) {
5420 bool idle = true;
5421
5422 for_each_cpu(cpu, cpu_smt_mask(core)) {
5423 cpumask_clear_cpu(cpu, cpus);
5424 if (!idle_cpu(cpu))
5425 idle = false;
5426 }
5427
5428 if (idle)
5429 return core;
5430 }
5431
5432 /*
5433 * Failed to find an idle core; stop looking for one.
5434 */
5435 set_idle_cores(target, 0);
5436
5437 return -1;
5438}
5439
5440/*
5441 * Scan the local SMT mask for idle CPUs.
5442 */
5443static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
5444{
5445 int cpu;
5446
5447 if (!static_branch_likely(&sched_smt_present))
5448 return -1;
5449
5450 for_each_cpu(cpu, cpu_smt_mask(target)) {
5451 if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
5452 continue;
5453 if (idle_cpu(cpu))
5454 return cpu;
5455 }
5456
5457 return -1;
5458}
5459
5460#else /* CONFIG_SCHED_SMT */
5461
5462static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
5463{
5464 return -1;
5465}
5466
5467static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
5468{
5469 return -1;
5470}
5471
5472#endif /* CONFIG_SCHED_SMT */
5473
5474/*
5475 * Scan the LLC domain for idle CPUs; this is dynamically regulated by
5476 * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
5477 * average idle time for this rq (as found in rq->avg_idle).
5478 */
5479static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
5480{
5481 struct sched_domain *this_sd;
5482 u64 avg_cost, avg_idle = this_rq()->avg_idle;
5483 u64 time, cost;
5484 s64 delta;
5485 int cpu, wrap;
5486
5487 this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
5488 if (!this_sd)
5489 return -1;
5490
5491 avg_cost = this_sd->avg_scan_cost;
5492
5493 /*
5494 * Due to large variance we need a large fuzz factor; hackbench in
5495 * particular is sensitive here.
5496 */
5497 if ((avg_idle / 512) < avg_cost)
5498 return -1;
5499
5500 time = local_clock();
5501
5502 for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) {
5503 if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
5504 continue;
5505 if (idle_cpu(cpu))
5506 break;
5507 }
5508
5509 time = local_clock() - time;
5510 cost = this_sd->avg_scan_cost;
5511 delta = (s64)(time - cost) / 8;
5512 this_sd->avg_scan_cost += delta;
5513
5514 return cpu;
5515}
5516
5517/*
5518 * Try and locate an idle core/thread in the LLC cache domain.
5269 */ 5519 */
5270static int select_idle_sibling(struct task_struct *p, int target) 5520static int select_idle_sibling(struct task_struct *p, int prev, int target)
5271{ 5521{
5272 struct sched_domain *sd; 5522 struct sched_domain *sd;
5273 struct sched_group *sg; 5523 int i;
5274 int i = task_cpu(p);
5275 5524
5276 if (idle_cpu(target)) 5525 if (idle_cpu(target))
5277 return target; 5526 return target;
5278 5527
5279 /* 5528 /*
5280 * If the prevous cpu is cache affine and idle, don't be stupid. 5529 * If the previous cpu is cache affine and idle, don't be stupid.
5281 */ 5530 */
5282 if (i != target && cpus_share_cache(i, target) && idle_cpu(i)) 5531 if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
5283 return i; 5532 return prev;
5284 5533
5285 /*
5286 * Otherwise, iterate the domains and find an eligible idle cpu.
5287 *
5288 * A completely idle sched group at higher domains is more
5289 * desirable than an idle group at a lower level, because lower
5290 * domains have smaller groups and usually share hardware
5291 * resources which causes tasks to contend on them, e.g. x86
5292 * hyperthread siblings in the lowest domain (SMT) can contend
5293 * on the shared cpu pipeline.
5294 *
5295 * However, while we prefer idle groups at higher domains
5296 * finding an idle cpu at the lowest domain is still better than
5297 * returning 'target', which we've already established, isn't
5298 * idle.
5299 */
5300 sd = rcu_dereference(per_cpu(sd_llc, target)); 5534 sd = rcu_dereference(per_cpu(sd_llc, target));
5301 for_each_lower_domain(sd) { 5535 if (!sd)
5302 sg = sd->groups; 5536 return target;
5303 do { 5537
5304 if (!cpumask_intersects(sched_group_cpus(sg), 5538 i = select_idle_core(p, sd, target);
5305 tsk_cpus_allowed(p))) 5539 if ((unsigned)i < nr_cpumask_bits)
5306 goto next; 5540 return i;
5307 5541
5308 /* Ensure the entire group is idle */ 5542 i = select_idle_cpu(p, sd, target);
5309 for_each_cpu(i, sched_group_cpus(sg)) { 5543 if ((unsigned)i < nr_cpumask_bits)
5310 if (i == target || !idle_cpu(i)) 5544 return i;
5311 goto next; 5545
5312 } 5546 i = select_idle_smt(p, sd, target);
5547 if ((unsigned)i < nr_cpumask_bits)
5548 return i;
5313 5549
5314 /*
5315 * It doesn't matter which cpu we pick, the
5316 * whole group is idle.
5317 */
5318 target = cpumask_first_and(sched_group_cpus(sg),
5319 tsk_cpus_allowed(p));
5320 goto done;
5321next:
5322 sg = sg->next;
5323 } while (sg != sd->groups);
5324 }
5325done:
5326 return target; 5550 return target;
5327} 5551}
5328 5552
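Two details of the reworked selection path above are easy to miss. Each helper returns -1 when it finds nothing, and the (unsigned)i < nr_cpumask_bits test catches both -1 and an off-the-end result in a single compare. And for_each_cpu_wrap() starts every scan at the target CPU and wraps around, so concurrent wakers spread out over the LLC instead of all piling onto the lowest-numbered idle CPU. A small stand-alone sketch of such a wrapping scan over a plain array (illustrative only, not the kernel's cpumask API):

#include <stdio.h>

#define NR_CPUS 8

/* return the first idle CPU at or after 'start', wrapping once; -1 if none */
static int find_idle_wrap(const int idle[NR_CPUS], int start)
{
	for (int off = 0; off < NR_CPUS; off++) {
		int cpu = (start + off) % NR_CPUS;

		if (idle[cpu])
			return cpu;
	}
	return -1;
}

int main(void)
{
	int idle[NR_CPUS] = { 1, 0, 1, 0, 0, 0, 1, 0 };

	/* scans starting at different CPUs pick different idle CPUs */
	printf("%d %d %d\n",
	       find_idle_wrap(idle, 1),		/* 2 */
	       find_idle_wrap(idle, 3),		/* 6 */
	       find_idle_wrap(idle, 7));	/* 0 */
	return 0;
}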
@@ -5360,6 +5584,32 @@ static int cpu_util(int cpu)
5360 return (util >= capacity) ? capacity : util; 5584 return (util >= capacity) ? capacity : util;
5361} 5585}
5362 5586
5587static inline int task_util(struct task_struct *p)
5588{
5589 return p->se.avg.util_avg;
5590}
5591
5592/*
5593 * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
5594 * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
5595 *
5596 * In that case WAKE_AFFINE doesn't make sense and we'll let
5597 * BALANCE_WAKE sort things out.
5598 */
5599static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
5600{
5601 long min_cap, max_cap;
5602
5603 min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
5604 max_cap = cpu_rq(cpu)->rd->max_cpu_capacity;
5605
5606 /* Minimum capacity is close to max, no need to abort wake_affine */
5607 if (max_cap - min_cap < max_cap >> 3)
5608 return 0;
5609
5610 return min_cap * 1024 < task_util(p) * capacity_margin;
5611}
5612
5363/* 5613/*
5364 * select_task_rq_fair: Select target runqueue for the waking task in domains 5614 * select_task_rq_fair: Select target runqueue for the waking task in domains
5365 * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, 5615 * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
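Putting numbers on wake_cap(): with capacity_margin = 1280 the final test reads min_cap * 1024 < util * 1280, so the wake-affine fast path is abandoned once the task's utilization exceeds 1024/1280 = 80% of the smaller of the two capacities; the earlier test short-circuits this on (near-)symmetric systems whose capacities differ by less than one eighth of the maximum. A stand-alone sketch of the decision; note that it takes the maximum over the two CPUs, whereas the kernel consults the root domain's max_cpu_capacity:

#include <stdbool.h>
#include <stdio.h>

#define CAPACITY_MARGIN 1280	/* same ~20% margin as in the hunk above */

/* true if the wake-affine fast path should be skipped */
static bool wake_cap(long util, long cap_cpu, long cap_prev)
{
	long min_cap = cap_cpu < cap_prev ? cap_cpu : cap_prev;
	long max_cap = cap_cpu < cap_prev ? cap_prev : cap_cpu;

	/* capacities close enough: symmetric case, keep wake_affine */
	if (max_cap - min_cap < (max_cap >> 3))
		return false;

	/* asymmetric: skip wake_affine if util exceeds ~80% of min_cap */
	return min_cap * 1024 < util * CAPACITY_MARGIN;
}

int main(void)
{
	/* big.LITTLE-style capacities: 1024 vs 430 */
	printf("%d\n", wake_cap(300, 1024, 430));	/* 0: fits the little CPU */
	printf("%d\n", wake_cap(400, 1024, 430));	/* 1: 400 > 0.8 * 430 = 344 */
	return 0;
}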
@@ -5383,7 +5633,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
5383 5633
5384 if (sd_flag & SD_BALANCE_WAKE) { 5634 if (sd_flag & SD_BALANCE_WAKE) {
5385 record_wakee(p); 5635 record_wakee(p);
5386 want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); 5636 want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
5637 && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
5387 } 5638 }
5388 5639
5389 rcu_read_lock(); 5640 rcu_read_lock();
@@ -5409,13 +5660,13 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
5409 5660
5410 if (affine_sd) { 5661 if (affine_sd) {
5411 sd = NULL; /* Prefer wake_affine over balance flags */ 5662 sd = NULL; /* Prefer wake_affine over balance flags */
5412 if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) 5663 if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync))
5413 new_cpu = cpu; 5664 new_cpu = cpu;
5414 } 5665 }
5415 5666
5416 if (!sd) { 5667 if (!sd) {
5417 if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ 5668 if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
5418 new_cpu = select_idle_sibling(p, new_cpu); 5669 new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
5419 5670
5420 } else while (sd) { 5671 } else while (sd) {
5421 struct sched_group *group; 5672 struct sched_group *group;
@@ -5939,7 +6190,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
5939 * 6190 *
5940 * The adjacency matrix of the resulting graph is given by: 6191 * The adjacency matrix of the resulting graph is given by:
5941 * 6192 *
5942 * log_2 n 6193 * log_2 n
5943 * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6) 6194 * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6)
5944 * k = 0 6195 * k = 0
5945 * 6196 *
@@ -5985,7 +6236,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
5985 * 6236 *
5986 * [XXX write more on how we solve this.. _after_ merging pjt's patches that 6237 * [XXX write more on how we solve this.. _after_ merging pjt's patches that
5987 * rewrite all of this once again.] 6238 * rewrite all of this once again.]
5988 */ 6239 */
5989 6240
5990static unsigned long __read_mostly max_load_balance_interval = HZ/10; 6241static unsigned long __read_mostly max_load_balance_interval = HZ/10;
5991 6242
@@ -6133,7 +6384,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
6133 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { 6384 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
6134 int cpu; 6385 int cpu;
6135 6386
6136 schedstat_inc(p, se.statistics.nr_failed_migrations_affine); 6387 schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
6137 6388
6138 env->flags |= LBF_SOME_PINNED; 6389 env->flags |= LBF_SOME_PINNED;
6139 6390
@@ -6164,7 +6415,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
6164 env->flags &= ~LBF_ALL_PINNED; 6415 env->flags &= ~LBF_ALL_PINNED;
6165 6416
6166 if (task_running(env->src_rq, p)) { 6417 if (task_running(env->src_rq, p)) {
6167 schedstat_inc(p, se.statistics.nr_failed_migrations_running); 6418 schedstat_inc(p->se.statistics.nr_failed_migrations_running);
6168 return 0; 6419 return 0;
6169 } 6420 }
6170 6421
@@ -6181,13 +6432,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
6181 if (tsk_cache_hot <= 0 || 6432 if (tsk_cache_hot <= 0 ||
6182 env->sd->nr_balance_failed > env->sd->cache_nice_tries) { 6433 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
6183 if (tsk_cache_hot == 1) { 6434 if (tsk_cache_hot == 1) {
6184 schedstat_inc(env->sd, lb_hot_gained[env->idle]); 6435 schedstat_inc(env->sd->lb_hot_gained[env->idle]);
6185 schedstat_inc(p, se.statistics.nr_forced_migrations); 6436 schedstat_inc(p->se.statistics.nr_forced_migrations);
6186 } 6437 }
6187 return 1; 6438 return 1;
6188 } 6439 }
6189 6440
6190 schedstat_inc(p, se.statistics.nr_failed_migrations_hot); 6441 schedstat_inc(p->se.statistics.nr_failed_migrations_hot);
6191 return 0; 6442 return 0;
6192} 6443}
6193 6444
@@ -6227,7 +6478,7 @@ static struct task_struct *detach_one_task(struct lb_env *env)
6227 * so we can safely collect stats here rather than 6478 * so we can safely collect stats here rather than
6228 * inside detach_tasks(). 6479 * inside detach_tasks().
6229 */ 6480 */
6230 schedstat_inc(env->sd, lb_gained[env->idle]); 6481 schedstat_inc(env->sd->lb_gained[env->idle]);
6231 return p; 6482 return p;
6232 } 6483 }
6233 return NULL; 6484 return NULL;
@@ -6319,7 +6570,7 @@ next:
6319 * so we can safely collect detach_one_task() stats here rather 6570 * so we can safely collect detach_one_task() stats here rather
6320 * than inside detach_one_task(). 6571 * than inside detach_one_task().
6321 */ 6572 */
6322 schedstat_add(env->sd, lb_gained[env->idle], detached); 6573 schedstat_add(env->sd->lb_gained[env->idle], detached);
6323 6574
6324 return detached; 6575 return detached;
6325} 6576}
@@ -6647,7 +6898,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
6647 /* 6898 /*
6648 * !SD_OVERLAP domains can assume that child groups 6899 * !SD_OVERLAP domains can assume that child groups
6649 * span the current group. 6900 * span the current group.
6650 */ 6901 */
6651 6902
6652 group = child->groups; 6903 group = child->groups;
6653 do { 6904 do {
@@ -7147,7 +7398,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
7147 load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE; 7398 load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE;
7148 if (load_above_capacity > busiest->group_capacity) { 7399 if (load_above_capacity > busiest->group_capacity) {
7149 load_above_capacity -= busiest->group_capacity; 7400 load_above_capacity -= busiest->group_capacity;
7150 load_above_capacity *= NICE_0_LOAD; 7401 load_above_capacity *= scale_load_down(NICE_0_LOAD);
7151 load_above_capacity /= busiest->group_capacity; 7402 load_above_capacity /= busiest->group_capacity;
7152 } else 7403 } else
7153 load_above_capacity = ~0UL; 7404 load_above_capacity = ~0UL;
@@ -7354,9 +7605,6 @@ static struct rq *find_busiest_queue(struct lb_env *env,
7354 */ 7605 */
7355#define MAX_PINNED_INTERVAL 512 7606#define MAX_PINNED_INTERVAL 512
7356 7607
7357/* Working cpumask for load_balance and load_balance_newidle. */
7358DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
7359
7360static int need_active_balance(struct lb_env *env) 7608static int need_active_balance(struct lb_env *env)
7361{ 7609{
7362 struct sched_domain *sd = env->sd; 7610 struct sched_domain *sd = env->sd;
@@ -7460,7 +7708,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
7460 7708
7461 cpumask_copy(cpus, cpu_active_mask); 7709 cpumask_copy(cpus, cpu_active_mask);
7462 7710
7463 schedstat_inc(sd, lb_count[idle]); 7711 schedstat_inc(sd->lb_count[idle]);
7464 7712
7465redo: 7713redo:
7466 if (!should_we_balance(&env)) { 7714 if (!should_we_balance(&env)) {
@@ -7470,19 +7718,19 @@ redo:
7470 7718
7471 group = find_busiest_group(&env); 7719 group = find_busiest_group(&env);
7472 if (!group) { 7720 if (!group) {
7473 schedstat_inc(sd, lb_nobusyg[idle]); 7721 schedstat_inc(sd->lb_nobusyg[idle]);
7474 goto out_balanced; 7722 goto out_balanced;
7475 } 7723 }
7476 7724
7477 busiest = find_busiest_queue(&env, group); 7725 busiest = find_busiest_queue(&env, group);
7478 if (!busiest) { 7726 if (!busiest) {
7479 schedstat_inc(sd, lb_nobusyq[idle]); 7727 schedstat_inc(sd->lb_nobusyq[idle]);
7480 goto out_balanced; 7728 goto out_balanced;
7481 } 7729 }
7482 7730
7483 BUG_ON(busiest == env.dst_rq); 7731 BUG_ON(busiest == env.dst_rq);
7484 7732
7485 schedstat_add(sd, lb_imbalance[idle], env.imbalance); 7733 schedstat_add(sd->lb_imbalance[idle], env.imbalance);
7486 7734
7487 env.src_cpu = busiest->cpu; 7735 env.src_cpu = busiest->cpu;
7488 env.src_rq = busiest; 7736 env.src_rq = busiest;
@@ -7589,7 +7837,7 @@ more_balance:
7589 } 7837 }
7590 7838
7591 if (!ld_moved) { 7839 if (!ld_moved) {
7592 schedstat_inc(sd, lb_failed[idle]); 7840 schedstat_inc(sd->lb_failed[idle]);
7593 /* 7841 /*
7594 * Increment the failure counter only on periodic balance. 7842 * Increment the failure counter only on periodic balance.
7595 * We do not want newidle balance, which can be very 7843 * We do not want newidle balance, which can be very
@@ -7672,7 +7920,7 @@ out_all_pinned:
7672 * we can't migrate them. Let the imbalance flag set so parent level 7920 * we can't migrate them. Let the imbalance flag set so parent level
7673 * can try to migrate them. 7921 * can try to migrate them.
7674 */ 7922 */
7675 schedstat_inc(sd, lb_balanced[idle]); 7923 schedstat_inc(sd->lb_balanced[idle]);
7676 7924
7677 sd->nr_balance_failed = 0; 7925 sd->nr_balance_failed = 0;
7678 7926
@@ -7704,11 +7952,12 @@ get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
7704} 7952}
7705 7953
7706static inline void 7954static inline void
7707update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance) 7955update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
7708{ 7956{
7709 unsigned long interval, next; 7957 unsigned long interval, next;
7710 7958
7711 interval = get_sd_balance_interval(sd, cpu_busy); 7959 /* used by idle balance, so cpu_busy = 0 */
7960 interval = get_sd_balance_interval(sd, 0);
7712 next = sd->last_balance + interval; 7961 next = sd->last_balance + interval;
7713 7962
7714 if (time_after(*next_balance, next)) 7963 if (time_after(*next_balance, next))
@@ -7738,7 +7987,7 @@ static int idle_balance(struct rq *this_rq)
7738 rcu_read_lock(); 7987 rcu_read_lock();
7739 sd = rcu_dereference_check_sched_domain(this_rq->sd); 7988 sd = rcu_dereference_check_sched_domain(this_rq->sd);
7740 if (sd) 7989 if (sd)
7741 update_next_balance(sd, 0, &next_balance); 7990 update_next_balance(sd, &next_balance);
7742 rcu_read_unlock(); 7991 rcu_read_unlock();
7743 7992
7744 goto out; 7993 goto out;
@@ -7756,7 +8005,7 @@ static int idle_balance(struct rq *this_rq)
7756 continue; 8005 continue;
7757 8006
7758 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { 8007 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
7759 update_next_balance(sd, 0, &next_balance); 8008 update_next_balance(sd, &next_balance);
7760 break; 8009 break;
7761 } 8010 }
7762 8011
@@ -7774,7 +8023,7 @@ static int idle_balance(struct rq *this_rq)
7774 curr_cost += domain_cost; 8023 curr_cost += domain_cost;
7775 } 8024 }
7776 8025
7777 update_next_balance(sd, 0, &next_balance); 8026 update_next_balance(sd, &next_balance);
7778 8027
7779 /* 8028 /*
7780 * Stop searching for tasks to pull if there are 8029 * Stop searching for tasks to pull if there are
@@ -7864,15 +8113,15 @@ static int active_load_balance_cpu_stop(void *data)
7864 .idle = CPU_IDLE, 8113 .idle = CPU_IDLE,
7865 }; 8114 };
7866 8115
7867 schedstat_inc(sd, alb_count); 8116 schedstat_inc(sd->alb_count);
7868 8117
7869 p = detach_one_task(&env); 8118 p = detach_one_task(&env);
7870 if (p) { 8119 if (p) {
7871 schedstat_inc(sd, alb_pushed); 8120 schedstat_inc(sd->alb_pushed);
7872 /* Active balancing done, reset the failure counter. */ 8121 /* Active balancing done, reset the failure counter. */
7873 sd->nr_balance_failed = 0; 8122 sd->nr_balance_failed = 0;
7874 } else { 8123 } else {
7875 schedstat_inc(sd, alb_failed); 8124 schedstat_inc(sd->alb_failed);
7876 } 8125 }
7877 } 8126 }
7878 rcu_read_unlock(); 8127 rcu_read_unlock();
@@ -7964,13 +8213,13 @@ static inline void set_cpu_sd_state_busy(void)
7964 int cpu = smp_processor_id(); 8213 int cpu = smp_processor_id();
7965 8214
7966 rcu_read_lock(); 8215 rcu_read_lock();
7967 sd = rcu_dereference(per_cpu(sd_busy, cpu)); 8216 sd = rcu_dereference(per_cpu(sd_llc, cpu));
7968 8217
7969 if (!sd || !sd->nohz_idle) 8218 if (!sd || !sd->nohz_idle)
7970 goto unlock; 8219 goto unlock;
7971 sd->nohz_idle = 0; 8220 sd->nohz_idle = 0;
7972 8221
7973 atomic_inc(&sd->groups->sgc->nr_busy_cpus); 8222 atomic_inc(&sd->shared->nr_busy_cpus);
7974unlock: 8223unlock:
7975 rcu_read_unlock(); 8224 rcu_read_unlock();
7976} 8225}
@@ -7981,13 +8230,13 @@ void set_cpu_sd_state_idle(void)
7981 int cpu = smp_processor_id(); 8230 int cpu = smp_processor_id();
7982 8231
7983 rcu_read_lock(); 8232 rcu_read_lock();
7984 sd = rcu_dereference(per_cpu(sd_busy, cpu)); 8233 sd = rcu_dereference(per_cpu(sd_llc, cpu));
7985 8234
7986 if (!sd || sd->nohz_idle) 8235 if (!sd || sd->nohz_idle)
7987 goto unlock; 8236 goto unlock;
7988 sd->nohz_idle = 1; 8237 sd->nohz_idle = 1;
7989 8238
7990 atomic_dec(&sd->groups->sgc->nr_busy_cpus); 8239 atomic_dec(&sd->shared->nr_busy_cpus);
7991unlock: 8240unlock:
7992 rcu_read_unlock(); 8241 rcu_read_unlock();
7993} 8242}
@@ -8214,8 +8463,8 @@ end:
8214static inline bool nohz_kick_needed(struct rq *rq) 8463static inline bool nohz_kick_needed(struct rq *rq)
8215{ 8464{
8216 unsigned long now = jiffies; 8465 unsigned long now = jiffies;
8466 struct sched_domain_shared *sds;
8217 struct sched_domain *sd; 8467 struct sched_domain *sd;
8218 struct sched_group_capacity *sgc;
8219 int nr_busy, cpu = rq->cpu; 8468 int nr_busy, cpu = rq->cpu;
8220 bool kick = false; 8469 bool kick = false;
8221 8470
@@ -8243,11 +8492,13 @@ static inline bool nohz_kick_needed(struct rq *rq)
8243 return true; 8492 return true;
8244 8493
8245 rcu_read_lock(); 8494 rcu_read_lock();
8246 sd = rcu_dereference(per_cpu(sd_busy, cpu)); 8495 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
8247 if (sd) { 8496 if (sds) {
8248 sgc = sd->groups->sgc; 8497 /*
8249 nr_busy = atomic_read(&sgc->nr_busy_cpus); 8498 * XXX: write a coherent comment on why we do this.
8250 8499 * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com
8500 */
8501 nr_busy = atomic_read(&sds->nr_busy_cpus);
8251 if (nr_busy > 1) { 8502 if (nr_busy > 1) {
8252 kick = true; 8503 kick = true;
8253 goto unlock; 8504 goto unlock;
@@ -8283,7 +8534,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
8283 * run_rebalance_domains is triggered when needed from the scheduler tick. 8534 * run_rebalance_domains is triggered when needed from the scheduler tick.
8284 * Also triggered for nohz idle balancing (with nohz_balancing_kick set). 8535 * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
8285 */ 8536 */
8286static void run_rebalance_domains(struct softirq_action *h) 8537static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
8287{ 8538{
8288 struct rq *this_rq = this_rq(); 8539 struct rq *this_rq = this_rq();
8289 enum cpu_idle_type idle = this_rq->idle_balance ? 8540 enum cpu_idle_type idle = this_rq->idle_balance ?
@@ -8441,7 +8692,6 @@ static void detach_task_cfs_rq(struct task_struct *p)
8441 struct sched_entity *se = &p->se; 8692 struct sched_entity *se = &p->se;
8442 struct cfs_rq *cfs_rq = cfs_rq_of(se); 8693 struct cfs_rq *cfs_rq = cfs_rq_of(se);
8443 u64 now = cfs_rq_clock_task(cfs_rq); 8694 u64 now = cfs_rq_clock_task(cfs_rq);
8444 int tg_update;
8445 8695
8446 if (!vruntime_normalized(p)) { 8696 if (!vruntime_normalized(p)) {
8447 /* 8697 /*
@@ -8453,10 +8703,9 @@ static void detach_task_cfs_rq(struct task_struct *p)
8453 } 8703 }
8454 8704
8455 /* Catch up with the cfs_rq and remove our load when we leave */ 8705 /* Catch up with the cfs_rq and remove our load when we leave */
8456 tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); 8706 update_cfs_rq_load_avg(now, cfs_rq, false);
8457 detach_entity_load_avg(cfs_rq, se); 8707 detach_entity_load_avg(cfs_rq, se);
8458 if (tg_update) 8708 update_tg_load_avg(cfs_rq, false);
8459 update_tg_load_avg(cfs_rq, false);
8460} 8709}
8461 8710
8462static void attach_task_cfs_rq(struct task_struct *p) 8711static void attach_task_cfs_rq(struct task_struct *p)
@@ -8464,7 +8713,6 @@ static void attach_task_cfs_rq(struct task_struct *p)
8464 struct sched_entity *se = &p->se; 8713 struct sched_entity *se = &p->se;
8465 struct cfs_rq *cfs_rq = cfs_rq_of(se); 8714 struct cfs_rq *cfs_rq = cfs_rq_of(se);
8466 u64 now = cfs_rq_clock_task(cfs_rq); 8715 u64 now = cfs_rq_clock_task(cfs_rq);
8467 int tg_update;
8468 8716
8469#ifdef CONFIG_FAIR_GROUP_SCHED 8717#ifdef CONFIG_FAIR_GROUP_SCHED
8470 /* 8718 /*
@@ -8475,10 +8723,9 @@ static void attach_task_cfs_rq(struct task_struct *p)
8475#endif 8723#endif
8476 8724
8477 /* Synchronize task with its cfs_rq */ 8725 /* Synchronize task with its cfs_rq */
8478 tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); 8726 update_cfs_rq_load_avg(now, cfs_rq, false);
8479 attach_entity_load_avg(cfs_rq, se); 8727 attach_entity_load_avg(cfs_rq, se);
8480 if (tg_update) 8728 update_tg_load_avg(cfs_rq, false);
8481 update_tg_load_avg(cfs_rq, false);
8482 8729
8483 if (!vruntime_normalized(p)) 8730 if (!vruntime_normalized(p))
8484 se->vruntime += cfs_rq->min_vruntime; 8731 se->vruntime += cfs_rq->min_vruntime;
@@ -8592,7 +8839,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8592{ 8839{
8593 struct sched_entity *se; 8840 struct sched_entity *se;
8594 struct cfs_rq *cfs_rq; 8841 struct cfs_rq *cfs_rq;
8595 struct rq *rq;
8596 int i; 8842 int i;
8597 8843
8598 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); 8844 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8607,8 +8853,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8607 init_cfs_bandwidth(tg_cfs_bandwidth(tg)); 8853 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
8608 8854
8609 for_each_possible_cpu(i) { 8855 for_each_possible_cpu(i) {
8610 rq = cpu_rq(i);
8611
8612 cfs_rq = kzalloc_node(sizeof(struct cfs_rq), 8856 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8613 GFP_KERNEL, cpu_to_node(i)); 8857 GFP_KERNEL, cpu_to_node(i));
8614 if (!cfs_rq) 8858 if (!cfs_rq)
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 9fb873cfc75c..1d8718d5300d 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -16,6 +16,9 @@
16 16
17#include "sched.h" 17#include "sched.h"
18 18
19/* Linker adds these: start and end of __cpuidle functions */
20extern char __cpuidle_text_start[], __cpuidle_text_end[];
21
19/** 22/**
20 * sched_idle_set_state - Record idle state for the current CPU. 23 * sched_idle_set_state - Record idle state for the current CPU.
21 * @idle_state: State to record. 24 * @idle_state: State to record.
@@ -53,7 +56,7 @@ static int __init cpu_idle_nopoll_setup(char *__unused)
53__setup("hlt", cpu_idle_nopoll_setup); 56__setup("hlt", cpu_idle_nopoll_setup);
54#endif 57#endif
55 58
56static inline int cpu_idle_poll(void) 59static noinline int __cpuidle cpu_idle_poll(void)
57{ 60{
58 rcu_idle_enter(); 61 rcu_idle_enter();
59 trace_cpu_idle_rcuidle(0, smp_processor_id()); 62 trace_cpu_idle_rcuidle(0, smp_processor_id());
@@ -84,7 +87,7 @@ void __weak arch_cpu_idle(void)
84 * 87 *
85 * To use when the cpuidle framework cannot be used. 88 * To use when the cpuidle framework cannot be used.
86 */ 89 */
87void default_idle_call(void) 90void __cpuidle default_idle_call(void)
88{ 91{
89 if (current_clr_polling_and_test()) { 92 if (current_clr_polling_and_test()) {
90 local_irq_enable(); 93 local_irq_enable();
@@ -271,6 +274,12 @@ static void cpu_idle_loop(void)
271 } 274 }
272} 275}
273 276
277bool cpu_in_idle(unsigned long pc)
278{
279 return pc >= (unsigned long)__cpuidle_text_start &&
280 pc < (unsigned long)__cpuidle_text_end;
281}
282
274void cpu_startup_entry(enum cpuhp_state state) 283void cpu_startup_entry(enum cpuhp_state state)
275{ 284{
276 /* 285 /*
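
The __cpuidle annotation and the linker-provided section bounds exist so that diagnostics can tell an idling CPU apart from a stuck one purely from its program counter. A hypothetical consumer (the real caller sits outside this excerpt, e.g. the NMI backtrace path):

    if (regs && cpu_in_idle(instruction_pointer(regs))) {
            pr_warn("NMI backtrace for cpu %d skipped: idling\n", cpu);
            return true;
    }
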
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 2ce5458bbe1d..5405d3feb112 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -27,8 +27,8 @@ static struct task_struct *
27pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) 27pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
28{ 28{
29 put_prev_task(rq, prev); 29 put_prev_task(rq, prev);
30 30 update_idle_core(rq);
31 schedstat_inc(rq, sched_goidle); 31 schedstat_inc(rq->sched_goidle);
32 return rq->idle; 32 return rq->idle;
33} 33}
34 34
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index d5690b722691..2516b8df6dbb 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -957,9 +957,8 @@ static void update_curr_rt(struct rq *rq)
957 if (unlikely((s64)delta_exec <= 0)) 957 if (unlikely((s64)delta_exec <= 0))
958 return; 958 return;
959 959
960 /* Kick cpufreq (see the comment in linux/cpufreq.h). */ 960 /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
961 if (cpu_of(rq) == smp_processor_id()) 961 cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT);
962 cpufreq_trigger_update(rq_clock(rq));
963 962
964 schedstat_set(curr->se.statistics.exec_max, 963 schedstat_set(curr->se.statistics.exec_max,
965 max(curr->se.statistics.exec_max, delta_exec)); 964 max(curr->se.statistics.exec_max, delta_exec));
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c64fc5114004..055f935d4421 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2,6 +2,7 @@
2#include <linux/sched.h> 2#include <linux/sched.h>
3#include <linux/sched/sysctl.h> 3#include <linux/sched/sysctl.h>
4#include <linux/sched/rt.h> 4#include <linux/sched/rt.h>
5#include <linux/u64_stats_sync.h>
5#include <linux/sched/deadline.h> 6#include <linux/sched/deadline.h>
6#include <linux/binfmts.h> 7#include <linux/binfmts.h>
7#include <linux/mutex.h> 8#include <linux/mutex.h>
@@ -15,6 +16,12 @@
15#include "cpudeadline.h" 16#include "cpudeadline.h"
16#include "cpuacct.h" 17#include "cpuacct.h"
17 18
19#ifdef CONFIG_SCHED_DEBUG
20#define SCHED_WARN_ON(x) WARN_ONCE(x, #x)
21#else
22#define SCHED_WARN_ON(x) ((void)(x))
23#endif
24
18struct rq; 25struct rq;
19struct cpuidle_state; 26struct cpuidle_state;
20 27
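
SCHED_WARN_ON() is what later hunks in this file (e.g. idle_get_state()) convert to: with CONFIG_SCHED_DEBUG=y it is a one-shot WARN_ONCE() that prints the expression text, and with it disabled the condition is still evaluated but no warning code is emitted. For instance:

    SCHED_WARN_ON(!rcu_read_lock_held());
    /* CONFIG_SCHED_DEBUG=y: WARN_ONCE(!rcu_read_lock_held(), "!rcu_read_lock_held()") */
    /* CONFIG_SCHED_DEBUG=n: ((void)(!rcu_read_lock_held())) -- evaluated, silent */
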
@@ -565,6 +572,8 @@ struct root_domain {
565 */ 572 */
566 cpumask_var_t rto_mask; 573 cpumask_var_t rto_mask;
567 struct cpupri cpupri; 574 struct cpupri cpupri;
575
576 unsigned long max_cpu_capacity;
568}; 577};
569 578
570extern struct root_domain def_root_domain; 579extern struct root_domain def_root_domain;
@@ -597,7 +606,6 @@ struct rq {
597#ifdef CONFIG_SMP 606#ifdef CONFIG_SMP
598 unsigned long last_load_update_tick; 607 unsigned long last_load_update_tick;
599#endif /* CONFIG_SMP */ 608#endif /* CONFIG_SMP */
600 u64 nohz_stamp;
601 unsigned long nohz_flags; 609 unsigned long nohz_flags;
602#endif /* CONFIG_NO_HZ_COMMON */ 610#endif /* CONFIG_NO_HZ_COMMON */
603#ifdef CONFIG_NO_HZ_FULL 611#ifdef CONFIG_NO_HZ_FULL
@@ -723,6 +731,23 @@ static inline int cpu_of(struct rq *rq)
723#endif 731#endif
724} 732}
725 733
734
735#ifdef CONFIG_SCHED_SMT
736
737extern struct static_key_false sched_smt_present;
738
739extern void __update_idle_core(struct rq *rq);
740
741static inline void update_idle_core(struct rq *rq)
742{
743 if (static_branch_unlikely(&sched_smt_present))
744 __update_idle_core(rq);
745}
746
747#else
748static inline void update_idle_core(struct rq *rq) { }
749#endif
750
726DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 751DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
727 752
728#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) 753#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
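
update_idle_core() is deliberately a static-branch wrapper: unless sched_smt_present has been switched on, the call is a predicted-not-taken jump and costs next to nothing; only on SMT machines does it reach __update_idle_core(), implemented elsewhere in this series (presumably in fair.c) to track whether an entire core has gone idle for select_idle_core(). The one call site added by this diff is in pick_next_task_idle() (kernel/sched/idle_task.c, earlier above):

    put_prev_task(rq, prev);
    update_idle_core(rq);              /* no-op unless SMT is present */
    schedstat_inc(rq->sched_goidle);
    return rq->idle;
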
@@ -857,8 +882,8 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
857DECLARE_PER_CPU(struct sched_domain *, sd_llc); 882DECLARE_PER_CPU(struct sched_domain *, sd_llc);
858DECLARE_PER_CPU(int, sd_llc_size); 883DECLARE_PER_CPU(int, sd_llc_size);
859DECLARE_PER_CPU(int, sd_llc_id); 884DECLARE_PER_CPU(int, sd_llc_id);
885DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
860DECLARE_PER_CPU(struct sched_domain *, sd_numa); 886DECLARE_PER_CPU(struct sched_domain *, sd_numa);
861DECLARE_PER_CPU(struct sched_domain *, sd_busy);
862DECLARE_PER_CPU(struct sched_domain *, sd_asym); 887DECLARE_PER_CPU(struct sched_domain *, sd_asym);
863 888
864struct sched_group_capacity { 889struct sched_group_capacity {
@@ -870,10 +895,6 @@ struct sched_group_capacity {
870 unsigned int capacity; 895 unsigned int capacity;
871 unsigned long next_update; 896 unsigned long next_update;
872 int imbalance; /* XXX unrelated to capacity but shared group state */ 897 int imbalance; /* XXX unrelated to capacity but shared group state */
873 /*
874 * Number of busy cpus in this group.
875 */
876 atomic_t nr_busy_cpus;
877 898
878 unsigned long cpumask[0]; /* iteration mask */ 899 unsigned long cpumask[0]; /* iteration mask */
879}; 900};
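
With nr_busy_cpus dropped from sched_group_capacity, the busy count now lives in the per-LLC sched_domain_shared object reached through the new sd_llc_shared pointer declared above; the set_cpu_sd_state_busy()/set_cpu_sd_state_idle() and nohz_kick_needed() hunks in fair.c earlier in this diff are the users. The structure itself is declared outside this excerpt; roughly, and treating the exact layout as an assumption:

    struct sched_domain_shared {
            atomic_t        ref;
            atomic_t        nr_busy_cpus;
            int             has_idle_cores; /* consumed by the new select_idle_core() path */
    };
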
@@ -1000,7 +1021,11 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1000 * per-task data have been completed by this moment. 1021 * per-task data have been completed by this moment.
1001 */ 1022 */
1002 smp_wmb(); 1023 smp_wmb();
1024#ifdef CONFIG_THREAD_INFO_IN_TASK
1025 p->cpu = cpu;
1026#else
1003 task_thread_info(p)->cpu = cpu; 1027 task_thread_info(p)->cpu = cpu;
1028#endif
1004 p->wake_cpu = cpu; 1029 p->wake_cpu = cpu;
1005#endif 1030#endif
1006} 1031}
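
The new #ifdef in __set_task_cpu() follows the cpu field to its home inside task_struct once thread_info is embedded there. The matching reader, task_cpu() in <linux/sched.h>, is not part of this hunk; presumably it carries the same switch, along the lines of:

    static inline unsigned int task_cpu(const struct task_struct *p)
    {
    #ifdef CONFIG_THREAD_INFO_IN_TASK
            return p->cpu;
    #else
            return task_thread_info(p)->cpu;
    #endif
    }
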
@@ -1260,6 +1285,11 @@ static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
1260 prev->sched_class->put_prev_task(rq, prev); 1285 prev->sched_class->put_prev_task(rq, prev);
1261} 1286}
1262 1287
1288static inline void set_curr_task(struct rq *rq, struct task_struct *curr)
1289{
1290 curr->sched_class->set_curr_task(rq);
1291}
1292
1263#define sched_class_highest (&stop_sched_class) 1293#define sched_class_highest (&stop_sched_class)
1264#define for_each_class(class) \ 1294#define for_each_class(class) \
1265 for (class = sched_class_highest; class; class = class->next) 1295 for (class = sched_class_highest; class; class = class->next)
@@ -1290,7 +1320,7 @@ static inline void idle_set_state(struct rq *rq,
1290 1320
1291static inline struct cpuidle_state *idle_get_state(struct rq *rq) 1321static inline struct cpuidle_state *idle_get_state(struct rq *rq)
1292{ 1322{
1293 WARN_ON(!rcu_read_lock_held()); 1323 SCHED_WARN_ON(!rcu_read_lock_held());
1294 return rq->idle_state; 1324 return rq->idle_state;
1295} 1325}
1296#else 1326#else
@@ -1710,52 +1740,28 @@ static inline void nohz_balance_exit_idle(unsigned int cpu) { }
1710#endif 1740#endif
1711 1741
1712#ifdef CONFIG_IRQ_TIME_ACCOUNTING 1742#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1743struct irqtime {
1744 u64 hardirq_time;
1745 u64 softirq_time;
1746 u64 irq_start_time;
1747 struct u64_stats_sync sync;
1748};
1713 1749
1714DECLARE_PER_CPU(u64, cpu_hardirq_time); 1750DECLARE_PER_CPU(struct irqtime, cpu_irqtime);
1715DECLARE_PER_CPU(u64, cpu_softirq_time);
1716
1717#ifndef CONFIG_64BIT
1718DECLARE_PER_CPU(seqcount_t, irq_time_seq);
1719
1720static inline void irq_time_write_begin(void)
1721{
1722 __this_cpu_inc(irq_time_seq.sequence);
1723 smp_wmb();
1724}
1725
1726static inline void irq_time_write_end(void)
1727{
1728 smp_wmb();
1729 __this_cpu_inc(irq_time_seq.sequence);
1730}
1731 1751
1732static inline u64 irq_time_read(int cpu) 1752static inline u64 irq_time_read(int cpu)
1733{ 1753{
1734 u64 irq_time; 1754 struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
1735 unsigned seq; 1755 unsigned int seq;
1756 u64 total;
1736 1757
1737 do { 1758 do {
1738 seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); 1759 seq = __u64_stats_fetch_begin(&irqtime->sync);
1739 irq_time = per_cpu(cpu_softirq_time, cpu) + 1760 total = irqtime->softirq_time + irqtime->hardirq_time;
1740 per_cpu(cpu_hardirq_time, cpu); 1761 } while (__u64_stats_fetch_retry(&irqtime->sync, seq));
1741 } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
1742
1743 return irq_time;
1744}
1745#else /* CONFIG_64BIT */
1746static inline void irq_time_write_begin(void)
1747{
1748}
1749
1750static inline void irq_time_write_end(void)
1751{
1752}
1753 1762
1754static inline u64 irq_time_read(int cpu) 1763 return total;
1755{
1756 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
1757} 1764}
1758#endif /* CONFIG_64BIT */
1759#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ 1765#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
1760 1766
1761#ifdef CONFIG_CPU_FREQ 1767#ifdef CONFIG_CPU_FREQ
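
The hand-rolled seqcount (with its separate 32-bit and 64-bit reader paths) is replaced by u64_stats_sync, which compiles away on 64-bit and falls back to the same seqcount protocol on 32-bit. The writer lives in kernel/sched/cputime.c and is not shown in this hunk; a sketch of how it is expected to pair with the reader above (field names from the struct, the delta bookkeeping is assumed):

    struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);

    u64_stats_update_begin(&irqtime->sync);
    if (hardirq_count())
            irqtime->hardirq_time += delta;
    else
            irqtime->softirq_time += delta;
    u64_stats_update_end(&irqtime->sync);
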
@@ -1763,27 +1769,13 @@ DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
1763 1769
1764/** 1770/**
1765 * cpufreq_update_util - Take a note about CPU utilization changes. 1771 * cpufreq_update_util - Take a note about CPU utilization changes.
1766 * @time: Current time. 1772 * @rq: Runqueue to carry out the update for.
1767 * @util: Current utilization. 1773 * @flags: Update reason flags.
1768 * @max: Utilization ceiling.
1769 * 1774 *
1770 * This function is called by the scheduler on every invocation of 1775 * This function is called by the scheduler on the CPU whose utilization is
1771 * update_load_avg() on the CPU whose utilization is being updated. 1776 * being updated.
1772 * 1777 *
1773 * It can only be called from RCU-sched read-side critical sections. 1778 * It can only be called from RCU-sched read-side critical sections.
1774 */
1775static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max)
1776{
1777 struct update_util_data *data;
1778
1779 data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data));
1780 if (data)
1781 data->func(data, time, util, max);
1782}
1783
1784/**
1785 * cpufreq_trigger_update - Trigger CPU performance state evaluation if needed.
1786 * @time: Current time.
1787 * 1779 *
1788 * The way cpufreq is currently arranged requires it to evaluate the CPU 1780 * The way cpufreq is currently arranged requires it to evaluate the CPU
1789 * performance state (frequency/voltage) on a regular basis to prevent it from 1781 * performance state (frequency/voltage) on a regular basis to prevent it from
@@ -1797,13 +1789,23 @@ static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned lo
1797 * but that really is a band-aid. Going forward it should be replaced with 1789 * but that really is a band-aid. Going forward it should be replaced with
1798 * solutions targeted more specifically at RT and DL tasks. 1790 * solutions targeted more specifically at RT and DL tasks.
1799 */ 1791 */
1800static inline void cpufreq_trigger_update(u64 time) 1792static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
1793{
1794 struct update_util_data *data;
1795
1796 data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data));
1797 if (data)
1798 data->func(data, rq_clock(rq), flags);
1799}
1800
1801static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags)
1801{ 1802{
1802 cpufreq_update_util(time, ULONG_MAX, 0); 1803 if (cpu_of(rq) == smp_processor_id())
1804 cpufreq_update_util(rq, flags);
1803} 1805}
1804#else 1806#else
1805static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) {} 1807static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
1806static inline void cpufreq_trigger_update(u64 time) {} 1808static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) {}
1807#endif /* CONFIG_CPU_FREQ */ 1809#endif /* CONFIG_CPU_FREQ */
1808 1810
1809#ifdef arch_scale_freq_capacity 1811#ifdef arch_scale_freq_capacity
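
cpufreq_update_util() no longer takes a (time, util, max) triple: the governor callback now receives the runqueue clock plus a reason flag and is expected to look up the utilization itself, and cpufreq_update_this_cpu() folds in the "only if this is the local CPU" check that callers used to open-code. Call sites shrink accordingly, as in the RT-class conversion earlier in this diff (kernel/sched/rt.c):

    /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
    cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT);
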
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 78955cbea31c..34659a853505 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -29,11 +29,12 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
29 if (rq) 29 if (rq)
30 rq->rq_sched_info.run_delay += delta; 30 rq->rq_sched_info.run_delay += delta;
31} 31}
32# define schedstat_enabled() static_branch_unlikely(&sched_schedstats) 32#define schedstat_enabled() static_branch_unlikely(&sched_schedstats)
33# define schedstat_inc(rq, field) do { if (schedstat_enabled()) { (rq)->field++; } } while (0) 33#define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0)
34# define schedstat_add(rq, field, amt) do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0) 34#define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0)
35# define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) 35#define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0)
36# define schedstat_val(rq, field) ((schedstat_enabled()) ? (rq)->field : 0) 36#define schedstat_val(var) (var)
37#define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0)
37 38
38#else /* !CONFIG_SCHEDSTATS */ 39#else /* !CONFIG_SCHEDSTATS */
39static inline void 40static inline void
@@ -45,12 +46,13 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
45static inline void 46static inline void
46rq_sched_info_depart(struct rq *rq, unsigned long long delta) 47rq_sched_info_depart(struct rq *rq, unsigned long long delta)
47{} 48{}
48# define schedstat_enabled() 0 49#define schedstat_enabled() 0
49# define schedstat_inc(rq, field) do { } while (0) 50#define schedstat_inc(var) do { } while (0)
50# define schedstat_add(rq, field, amt) do { } while (0) 51#define schedstat_add(var, amt) do { } while (0)
51# define schedstat_set(var, val) do { } while (0) 52#define schedstat_set(var, val) do { } while (0)
52# define schedstat_val(rq, field) 0 53#define schedstat_val(var) 0
53#endif 54#define schedstat_val_or_zero(var) 0
55#endif /* CONFIG_SCHEDSTATS */
54 56
55#ifdef CONFIG_SCHED_INFO 57#ifdef CONFIG_SCHED_INFO
56static inline void sched_info_reset_dequeued(struct task_struct *t) 58static inline void sched_info_reset_dequeued(struct task_struct *t)
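
The schedstat helpers now take the counter lvalue directly instead of an (object, field) pair, so one definition serves rq->, sd-> and p->se.statistics counters alike, and schedstat_val_or_zero() keeps the old "0 when stats are off" behaviour for the few callers that need it. Using the load_balance() conversion from earlier in this diff as the example:

    schedstat_inc(sd, lb_count[idle]);                      /* old form */
    schedstat_add(sd, lb_imbalance[idle], env.imbalance);

    schedstat_inc(sd->lb_count[idle]);                      /* new form */
    schedstat_add(sd->lb_imbalance[idle], env.imbalance);
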
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index f15d6b6a538a..9453efe9b25a 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -196,27 +196,48 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
196} 196}
197EXPORT_SYMBOL(prepare_to_wait_exclusive); 197EXPORT_SYMBOL(prepare_to_wait_exclusive);
198 198
199long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state) 199void init_wait_entry(wait_queue_t *wait, int flags)
200{ 200{
201 unsigned long flags; 201 wait->flags = flags;
202
203 if (signal_pending_state(state, current))
204 return -ERESTARTSYS;
205
206 wait->private = current; 202 wait->private = current;
207 wait->func = autoremove_wake_function; 203 wait->func = autoremove_wake_function;
204 INIT_LIST_HEAD(&wait->task_list);
205}
206EXPORT_SYMBOL(init_wait_entry);
207
208long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state)
209{
210 unsigned long flags;
211 long ret = 0;
208 212
209 spin_lock_irqsave(&q->lock, flags); 213 spin_lock_irqsave(&q->lock, flags);
210 if (list_empty(&wait->task_list)) { 214 if (unlikely(signal_pending_state(state, current))) {
211 if (wait->flags & WQ_FLAG_EXCLUSIVE) 215 /*
212 __add_wait_queue_tail(q, wait); 216 * Exclusive waiter must not fail if it was selected by wakeup,
213 else 217 * it should "consume" the condition we were waiting for.
214 __add_wait_queue(q, wait); 218 *
219 * The caller will recheck the condition and return success if
220 * we were already woken up, we can not miss the event because
221 * wakeup locks/unlocks the same q->lock.
222 *
223 * But we need to ensure that set-condition + wakeup after that
224 * can't see us, it should wake up another exclusive waiter if
225 * we fail.
226 */
227 list_del_init(&wait->task_list);
228 ret = -ERESTARTSYS;
229 } else {
230 if (list_empty(&wait->task_list)) {
231 if (wait->flags & WQ_FLAG_EXCLUSIVE)
232 __add_wait_queue_tail(q, wait);
233 else
234 __add_wait_queue(q, wait);
235 }
236 set_current_state(state);
215 } 237 }
216 set_current_state(state);
217 spin_unlock_irqrestore(&q->lock, flags); 238 spin_unlock_irqrestore(&q->lock, flags);
218 239
219 return 0; 240 return ret;
220} 241}
221EXPORT_SYMBOL(prepare_to_wait_event); 242EXPORT_SYMBOL(prepare_to_wait_event);
222 243
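
prepare_to_wait_event() now handles the pending-signal case itself, under q->lock: the waiter is unhooked and -ERESTARTSYS returned, so a wakeup racing with the signal is redirected to another exclusive waiter instead of being consumed and lost. Its callers are the ___wait_event() macro family; roughly, as a sketch of the calling pattern rather than the real macro:

    for (;;) {
            long intr = prepare_to_wait_event(&wq, &wait, TASK_INTERRUPTIBLE);

            if (condition)
                    break;          /* success, even if a signal arrived meanwhile */
            if (intr) {
                    ret = intr;     /* -ERESTARTSYS: already off the wait queue */
                    break;
            }
            schedule();
    }
    finish_wait(&wq, &wait);
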
@@ -255,39 +276,6 @@ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
255} 276}
256EXPORT_SYMBOL(finish_wait); 277EXPORT_SYMBOL(finish_wait);
257 278
258/**
259 * abort_exclusive_wait - abort exclusive waiting in a queue
260 * @q: waitqueue waited on
261 * @wait: wait descriptor
262 * @mode: runstate of the waiter to be woken
263 * @key: key to identify a wait bit queue or %NULL
264 *
265 * Sets current thread back to running state and removes
266 * the wait descriptor from the given waitqueue if still
267 * queued.
268 *
269 * Wakes up the next waiter if the caller is concurrently
270 * woken up through the queue.
271 *
272 * This prevents waiter starvation where an exclusive waiter
273 * aborts and is woken up concurrently and no one wakes up
274 * the next waiter.
275 */
276void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
277 unsigned int mode, void *key)
278{
279 unsigned long flags;
280
281 __set_current_state(TASK_RUNNING);
282 spin_lock_irqsave(&q->lock, flags);
283 if (!list_empty(&wait->task_list))
284 list_del_init(&wait->task_list);
285 else if (waitqueue_active(q))
286 __wake_up_locked_key(q, mode, key);
287 spin_unlock_irqrestore(&q->lock, flags);
288}
289EXPORT_SYMBOL(abort_exclusive_wait);
290
291int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) 279int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
292{ 280{
293 int ret = default_wake_function(wait, mode, sync, key); 281 int ret = default_wake_function(wait, mode, sync, key);
@@ -425,20 +413,29 @@ int __sched
425__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, 413__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
426 wait_bit_action_f *action, unsigned mode) 414 wait_bit_action_f *action, unsigned mode)
427{ 415{
428 do { 416 int ret = 0;
429 int ret;
430 417
418 for (;;) {
431 prepare_to_wait_exclusive(wq, &q->wait, mode); 419 prepare_to_wait_exclusive(wq, &q->wait, mode);
432 if (!test_bit(q->key.bit_nr, q->key.flags)) 420 if (test_bit(q->key.bit_nr, q->key.flags)) {
433 continue; 421 ret = action(&q->key, mode);
434 ret = action(&q->key, mode); 422 /*
435 if (!ret) 423 * See the comment in prepare_to_wait_event().
436 continue; 424 * finish_wait() does not necessarily take wq->lock,
437 abort_exclusive_wait(wq, &q->wait, mode, &q->key); 425 * but test_and_set_bit() implies mb() which pairs with
438 return ret; 426 * smp_mb__after_atomic() before wake_up_page().
439 } while (test_and_set_bit(q->key.bit_nr, q->key.flags)); 427 */
440 finish_wait(wq, &q->wait); 428 if (ret)
441 return 0; 429 finish_wait(wq, &q->wait);
430 }
431 if (!test_and_set_bit(q->key.bit_nr, q->key.flags)) {
432 if (!ret)
433 finish_wait(wq, &q->wait);
434 return 0;
435 } else if (ret) {
436 return ret;
437 }
438 }
442} 439}
443EXPORT_SYMBOL(__wait_on_bit_lock); 440EXPORT_SYMBOL(__wait_on_bit_lock);
444 441
@@ -483,16 +480,6 @@ void wake_up_bit(void *word, int bit)
483} 480}
484EXPORT_SYMBOL(wake_up_bit); 481EXPORT_SYMBOL(wake_up_bit);
485 482
486wait_queue_head_t *bit_waitqueue(void *word, int bit)
487{
488 const int shift = BITS_PER_LONG == 32 ? 5 : 6;
489 const struct zone *zone = page_zone(virt_to_page(word));
490 unsigned long val = (unsigned long)word << shift | bit;
491
492 return &zone->wait_table[hash_long(val, zone->wait_table_bits)];
493}
494EXPORT_SYMBOL(bit_waitqueue);
495
496/* 483/*
497 * Manipulate the atomic_t address to produce a better bit waitqueue table hash 484 * Manipulate the atomic_t address to produce a better bit waitqueue table hash
498 * index (we're keying off bit -1, but that would produce a horrible hash 485 * index (we're keying off bit -1, but that would produce a horrible hash
diff --git a/kernel/signal.c b/kernel/signal.c
index af21afc00d08..75761acc77cf 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3044,6 +3044,11 @@ void kernel_sigaction(int sig, __sighandler_t action)
3044} 3044}
3045EXPORT_SYMBOL(kernel_sigaction); 3045EXPORT_SYMBOL(kernel_sigaction);
3046 3046
3047void __weak sigaction_compat_abi(struct k_sigaction *act,
3048 struct k_sigaction *oact)
3049{
3050}
3051
3047int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) 3052int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
3048{ 3053{
3049 struct task_struct *p = current, *t; 3054 struct task_struct *p = current, *t;
@@ -3059,6 +3064,8 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
3059 if (oact) 3064 if (oact)
3060 *oact = *k; 3065 *oact = *k;
3061 3066
3067 sigaction_compat_abi(act, oact);
3068
3062 if (act) { 3069 if (act) {
3063 sigdelsetmask(&act->sa.sa_mask, 3070 sigdelsetmask(&act->sa.sa_mask,
3064 sigmask(SIGKILL) | sigmask(SIGSTOP)); 3071 sigmask(SIGKILL) | sigmask(SIGSTOP));
diff --git a/kernel/smp.c b/kernel/smp.c
index 3aa642d39c03..bba3b201668d 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -14,6 +14,7 @@
14#include <linux/smp.h> 14#include <linux/smp.h>
15#include <linux/cpu.h> 15#include <linux/cpu.h>
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/hypervisor.h>
17 18
18#include "smpboot.h" 19#include "smpboot.h"
19 20
@@ -724,3 +725,54 @@ void wake_up_all_idle_cpus(void)
724 preempt_enable(); 725 preempt_enable();
725} 726}
726EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus); 727EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus);
728
729/**
730 * smp_call_on_cpu - Call a function on a specific cpu
731 *
732 * Used to call a function on a specific cpu and wait for it to return.
733 * Optionally make sure the call is done on a specified physical cpu via vcpu
734 * pinning in order to support virtualized environments.
735 */
736struct smp_call_on_cpu_struct {
737 struct work_struct work;
738 struct completion done;
739 int (*func)(void *);
740 void *data;
741 int ret;
742 int cpu;
743};
744
745static void smp_call_on_cpu_callback(struct work_struct *work)
746{
747 struct smp_call_on_cpu_struct *sscs;
748
749 sscs = container_of(work, struct smp_call_on_cpu_struct, work);
750 if (sscs->cpu >= 0)
751 hypervisor_pin_vcpu(sscs->cpu);
752 sscs->ret = sscs->func(sscs->data);
753 if (sscs->cpu >= 0)
754 hypervisor_pin_vcpu(-1);
755
756 complete(&sscs->done);
757}
758
759int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys)
760{
761 struct smp_call_on_cpu_struct sscs = {
762 .done = COMPLETION_INITIALIZER_ONSTACK(sscs.done),
763 .func = func,
764 .data = par,
765 .cpu = phys ? cpu : -1,
766 };
767
768 INIT_WORK_ONSTACK(&sscs.work, smp_call_on_cpu_callback);
769
770 if (cpu >= nr_cpu_ids || !cpu_online(cpu))
771 return -ENXIO;
772
773 queue_work_on(cpu, system_wq, &sscs.work);
774 wait_for_completion(&sscs.done);
775
776 return sscs.ret;
777}
778EXPORT_SYMBOL_GPL(smp_call_on_cpu);
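
smp_call_on_cpu() is a sleeping interface: it queues a work item on the target CPU and blocks in wait_for_completion(), so it must be called from process context, and it returns -ENXIO when the CPU is out of range or offline, otherwise whatever func returned. A hypothetical caller (function name and message are made up for illustration):

    static int probe_feature(void *arg)
    {
            /* runs on the requested CPU, pinned to the physical CPU when phys == true */
            return 0;
    }

    int ret = smp_call_on_cpu(3, probe_feature, NULL, true);
    if (ret)
            pr_warn("probe on CPU 3 failed: %d\n", ret);
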
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 13bc43d1fb22..4a5c6e73ecd4 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -186,6 +186,11 @@ __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
186 kfree(td); 186 kfree(td);
187 return PTR_ERR(tsk); 187 return PTR_ERR(tsk);
188 } 188 }
189 /*
190 * Park the thread so that it could start right on the CPU
191 * when it is available.
192 */
193 kthread_park(tsk);
189 get_task_struct(tsk); 194 get_task_struct(tsk);
190 *per_cpu_ptr(ht->store, cpu) = tsk; 195 *per_cpu_ptr(ht->store, cpu) = tsk;
191 if (ht->create) { 196 if (ht->create) {
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 17caf4b63342..744fa611cae0 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -58,7 +58,7 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp
58DEFINE_PER_CPU(struct task_struct *, ksoftirqd); 58DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
59 59
60const char * const softirq_to_name[NR_SOFTIRQS] = { 60const char * const softirq_to_name[NR_SOFTIRQS] = {
61 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", 61 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "IRQ_POLL",
62 "TASKLET", "SCHED", "HRTIMER", "RCU" 62 "TASKLET", "SCHED", "HRTIMER", "RCU"
63}; 63};
64 64
@@ -78,6 +78,17 @@ static void wakeup_softirqd(void)
78} 78}
79 79
80/* 80/*
81 * If ksoftirqd is scheduled, we do not want to process pending softirqs
82 * right now. Let ksoftirqd handle this at its own rate, to get fairness.
83 */
84static bool ksoftirqd_running(void)
85{
86 struct task_struct *tsk = __this_cpu_read(ksoftirqd);
87
88 return tsk && (tsk->state == TASK_RUNNING);
89}
90
91/*
81 * preempt_count and SOFTIRQ_OFFSET usage: 92 * preempt_count and SOFTIRQ_OFFSET usage:
82 * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving 93 * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
83 * softirq processing. 94 * softirq processing.
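
ksoftirqd_running() is the fairness gate used twice further down: do_softirq() skips running softirqs on the current stack and invoke_softirq() returns early whenever ksoftirqd is already runnable on this CPU, so sustained softirq load is funnelled through the thread instead of repeatedly preempting the interrupted task. The guard reads the same at both sites:

    if (ksoftirqd_running())
            return;         /* ksoftirqd will drain local_softirq_pending() */
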
@@ -313,7 +324,7 @@ asmlinkage __visible void do_softirq(void)
313 324
314 pending = local_softirq_pending(); 325 pending = local_softirq_pending();
315 326
316 if (pending) 327 if (pending && !ksoftirqd_running())
317 do_softirq_own_stack(); 328 do_softirq_own_stack();
318 329
319 local_irq_restore(flags); 330 local_irq_restore(flags);
@@ -340,6 +351,9 @@ void irq_enter(void)
340 351
341static inline void invoke_softirq(void) 352static inline void invoke_softirq(void)
342{ 353{
354 if (ksoftirqd_running())
355 return;
356
343 if (!force_irqthreads) { 357 if (!force_irqthreads) {
344#ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK 358#ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK
345 /* 359 /*
@@ -482,7 +496,7 @@ void __tasklet_hi_schedule_first(struct tasklet_struct *t)
482} 496}
483EXPORT_SYMBOL(__tasklet_hi_schedule_first); 497EXPORT_SYMBOL(__tasklet_hi_schedule_first);
484 498
485static void tasklet_action(struct softirq_action *a) 499static __latent_entropy void tasklet_action(struct softirq_action *a)
486{ 500{
487 struct tasklet_struct *list; 501 struct tasklet_struct *list;
488 502
@@ -518,7 +532,7 @@ static void tasklet_action(struct softirq_action *a)
518 } 532 }
519} 533}
520 534
521static void tasklet_hi_action(struct softirq_action *a) 535static __latent_entropy void tasklet_hi_action(struct softirq_action *a)
522{ 536{
523 struct tasklet_struct *list; 537 struct tasklet_struct *list;
524 538
@@ -700,7 +714,7 @@ void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu)
700 BUG(); 714 BUG();
701} 715}
702 716
703static void takeover_tasklets(unsigned int cpu) 717static int takeover_tasklets(unsigned int cpu)
704{ 718{
705 /* CPU is dead, so no lock needed. */ 719 /* CPU is dead, so no lock needed. */
706 local_irq_disable(); 720 local_irq_disable();
@@ -723,27 +737,12 @@ static void takeover_tasklets(unsigned int cpu)
723 raise_softirq_irqoff(HI_SOFTIRQ); 737 raise_softirq_irqoff(HI_SOFTIRQ);
724 738
725 local_irq_enable(); 739 local_irq_enable();
740 return 0;
726} 741}
742#else
743#define takeover_tasklets NULL
727#endif /* CONFIG_HOTPLUG_CPU */ 744#endif /* CONFIG_HOTPLUG_CPU */
728 745
729static int cpu_callback(struct notifier_block *nfb, unsigned long action,
730 void *hcpu)
731{
732 switch (action) {
733#ifdef CONFIG_HOTPLUG_CPU
734 case CPU_DEAD:
735 case CPU_DEAD_FROZEN:
736 takeover_tasklets((unsigned long)hcpu);
737 break;
738#endif /* CONFIG_HOTPLUG_CPU */
739 }
740 return NOTIFY_OK;
741}
742
743static struct notifier_block cpu_nfb = {
744 .notifier_call = cpu_callback
745};
746
747static struct smp_hotplug_thread softirq_threads = { 746static struct smp_hotplug_thread softirq_threads = {
748 .store = &ksoftirqd, 747 .store = &ksoftirqd,
749 .thread_should_run = ksoftirqd_should_run, 748 .thread_should_run = ksoftirqd_should_run,
@@ -753,8 +752,8 @@ static struct smp_hotplug_thread softirq_threads = {
753 752
754static __init int spawn_ksoftirqd(void) 753static __init int spawn_ksoftirqd(void)
755{ 754{
756 register_cpu_notifier(&cpu_nfb); 755 cpuhp_setup_state_nocalls(CPUHP_SOFTIRQ_DEAD, "softirq:dead", NULL,
757 756 takeover_tasklets);
758 BUG_ON(smpboot_register_percpu_thread(&softirq_threads)); 757 BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
759 758
760 return 0; 759 return 0;
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 4a1ca5f6da7e..ec9ab2f01489 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -20,7 +20,6 @@
20#include <linux/kallsyms.h> 20#include <linux/kallsyms.h>
21#include <linux/smpboot.h> 21#include <linux/smpboot.h>
22#include <linux/atomic.h> 22#include <linux/atomic.h>
23#include <linux/lglock.h>
24#include <linux/nmi.h> 23#include <linux/nmi.h>
25 24
26/* 25/*
@@ -47,13 +46,9 @@ struct cpu_stopper {
47static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); 46static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
48static bool stop_machine_initialized = false; 47static bool stop_machine_initialized = false;
49 48
50/* 49/* static data for stop_cpus */
51 * Avoids a race between stop_two_cpus and global stop_cpus, where 50static DEFINE_MUTEX(stop_cpus_mutex);
52 * the stoppers could get queued up in reverse order, leading to 51static bool stop_cpus_in_progress;
53 * system deadlock. Using an lglock means stop_two_cpus remains
54 * relatively cheap.
55 */
56DEFINE_STATIC_LGLOCK(stop_cpus_lock);
57 52
58static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) 53static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
59{ 54{
@@ -126,6 +121,11 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
126 cpu_stop_init_done(&done, 1); 121 cpu_stop_init_done(&done, 1);
127 if (!cpu_stop_queue_work(cpu, &work)) 122 if (!cpu_stop_queue_work(cpu, &work))
128 return -ENOENT; 123 return -ENOENT;
124 /*
125 * In case @cpu == smp_processor_id() we can avoid a sleep+wakeup
126 * cycle by doing a preemption:
127 */
128 cond_resched();
129 wait_for_completion(&done.completion); 129 wait_for_completion(&done.completion);
130 return done.ret; 130 return done.ret;
131} 131}
@@ -230,14 +230,26 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
230 struct cpu_stopper *stopper1 = per_cpu_ptr(&cpu_stopper, cpu1); 230 struct cpu_stopper *stopper1 = per_cpu_ptr(&cpu_stopper, cpu1);
231 struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2); 231 struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2);
232 int err; 232 int err;
233 233retry:
234 lg_double_lock(&stop_cpus_lock, cpu1, cpu2);
235 spin_lock_irq(&stopper1->lock); 234 spin_lock_irq(&stopper1->lock);
236 spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING); 235 spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
237 236
238 err = -ENOENT; 237 err = -ENOENT;
239 if (!stopper1->enabled || !stopper2->enabled) 238 if (!stopper1->enabled || !stopper2->enabled)
240 goto unlock; 239 goto unlock;
240 /*
241 * Ensure that if we race with __stop_cpus() the stoppers won't get
242 * queued up in reverse order leading to system deadlock.
243 *
244 * We can't miss stop_cpus_in_progress if queue_stop_cpus_work() has
245 * queued a work on cpu1 but not on cpu2, we hold both locks.
246 *
247 * It can be falsely true but it is safe to spin until it is cleared,
248 * queue_stop_cpus_work() does everything under preempt_disable().
249 */
250 err = -EDEADLK;
251 if (unlikely(stop_cpus_in_progress))
252 goto unlock;
241 253
242 err = 0; 254 err = 0;
243 __cpu_stop_queue_work(stopper1, work1); 255 __cpu_stop_queue_work(stopper1, work1);
@@ -245,8 +257,12 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
245unlock: 257unlock:
246 spin_unlock(&stopper2->lock); 258 spin_unlock(&stopper2->lock);
247 spin_unlock_irq(&stopper1->lock); 259 spin_unlock_irq(&stopper1->lock);
248 lg_double_unlock(&stop_cpus_lock, cpu1, cpu2);
249 260
261 if (unlikely(err == -EDEADLK)) {
262 while (stop_cpus_in_progress)
263 cpu_relax();
264 goto retry;
265 }
250 return err; 266 return err;
251} 267}
252/** 268/**
@@ -316,9 +332,6 @@ bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
316 return cpu_stop_queue_work(cpu, work_buf); 332 return cpu_stop_queue_work(cpu, work_buf);
317} 333}
318 334
319/* static data for stop_cpus */
320static DEFINE_MUTEX(stop_cpus_mutex);
321
322static bool queue_stop_cpus_work(const struct cpumask *cpumask, 335static bool queue_stop_cpus_work(const struct cpumask *cpumask,
323 cpu_stop_fn_t fn, void *arg, 336 cpu_stop_fn_t fn, void *arg,
324 struct cpu_stop_done *done) 337 struct cpu_stop_done *done)
@@ -332,7 +345,8 @@ static bool queue_stop_cpus_work(const struct cpumask *cpumask,
332 * preempted by a stopper which might wait for other stoppers 345 * preempted by a stopper which might wait for other stoppers
333 * to enter @fn which can lead to deadlock. 346 * to enter @fn which can lead to deadlock.
334 */ 347 */
335 lg_global_lock(&stop_cpus_lock); 348 preempt_disable();
349 stop_cpus_in_progress = true;
336 for_each_cpu(cpu, cpumask) { 350 for_each_cpu(cpu, cpumask) {
337 work = &per_cpu(cpu_stopper.stop_work, cpu); 351 work = &per_cpu(cpu_stopper.stop_work, cpu);
338 work->fn = fn; 352 work->fn = fn;
@@ -341,7 +355,8 @@ static bool queue_stop_cpus_work(const struct cpumask *cpumask,
341 if (cpu_stop_queue_work(cpu, work)) 355 if (cpu_stop_queue_work(cpu, work))
342 queued = true; 356 queued = true;
343 } 357 }
344 lg_global_unlock(&stop_cpus_lock); 358 stop_cpus_in_progress = false;
359 preempt_enable();
345 360
346 return queued; 361 return queued;
347} 362}
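
With the lglock gone, ordering between stop_two_cpus() and stop_cpus() is preserved by the stop_cpus_in_progress flag: queue_stop_cpus_work() sets it with preemption disabled for the whole queueing pass, and cpu_stop_queue_two_works() (earlier in this file) backs out with -EDEADLK and spins until the flag clears before retrying, so the two paths can never enqueue stopper works in opposite orders:

    /* the retry side, from cpu_stop_queue_two_works() above */
    if (unlikely(err == -EDEADLK)) {
            while (stop_cpus_in_progress)
                    cpu_relax();
            goto retry;
    }
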
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 2c5e3a8e00d7..635482e60ca3 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -250,3 +250,8 @@ cond_syscall(sys_execveat);
250 250
251/* membarrier */ 251/* membarrier */
252cond_syscall(sys_membarrier); 252cond_syscall(sys_membarrier);
253
254/* memory protection keys */
255cond_syscall(sys_pkey_mprotect);
256cond_syscall(sys_pkey_alloc);
257cond_syscall(sys_pkey_free);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index a13bbdaab47d..706309f9ed84 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -65,6 +65,7 @@
65#include <linux/sched/sysctl.h> 65#include <linux/sched/sysctl.h>
66#include <linux/kexec.h> 66#include <linux/kexec.h>
67#include <linux/bpf.h> 67#include <linux/bpf.h>
68#include <linux/mount.h>
68 69
69#include <asm/uaccess.h> 70#include <asm/uaccess.h>
70#include <asm/processor.h> 71#include <asm/processor.h>
@@ -106,9 +107,8 @@ extern unsigned int core_pipe_limit;
106extern int pid_max; 107extern int pid_max;
107extern int pid_max_min, pid_max_max; 108extern int pid_max_min, pid_max_max;
108extern int percpu_pagelist_fraction; 109extern int percpu_pagelist_fraction;
109extern int compat_log;
110extern int latencytop_enabled; 110extern int latencytop_enabled;
111extern int sysctl_nr_open_min, sysctl_nr_open_max; 111extern unsigned int sysctl_nr_open_min, sysctl_nr_open_max;
112#ifndef CONFIG_MMU 112#ifndef CONFIG_MMU
113extern int sysctl_nr_trim_pages; 113extern int sysctl_nr_trim_pages;
114#endif 114#endif
@@ -1084,15 +1084,6 @@ static struct ctl_table kern_table[] = {
1084 .extra1 = &neg_one, 1084 .extra1 = &neg_one,
1085 }, 1085 },
1086#endif 1086#endif
1087#ifdef CONFIG_COMPAT
1088 {
1089 .procname = "compat-log",
1090 .data = &compat_log,
1091 .maxlen = sizeof (int),
1092 .mode = 0644,
1093 .proc_handler = proc_dointvec,
1094 },
1095#endif
1096#ifdef CONFIG_RT_MUTEXES 1087#ifdef CONFIG_RT_MUTEXES
1097 { 1088 {
1098 .procname = "max_lock_depth", 1089 .procname = "max_lock_depth",
@@ -1692,7 +1683,7 @@ static struct ctl_table fs_table[] = {
1692 { 1683 {
1693 .procname = "nr_open", 1684 .procname = "nr_open",
1694 .data = &sysctl_nr_open, 1685 .data = &sysctl_nr_open,
1695 .maxlen = sizeof(int), 1686 .maxlen = sizeof(unsigned int),
1696 .mode = 0644, 1687 .mode = 0644,
1697 .proc_handler = proc_dointvec_minmax, 1688 .proc_handler = proc_dointvec_minmax,
1698 .extra1 = &sysctl_nr_open_min, 1689 .extra1 = &sysctl_nr_open_min,
@@ -1838,6 +1829,14 @@ static struct ctl_table fs_table[] = {
1838 .mode = 0644, 1829 .mode = 0644,
1839 .proc_handler = proc_doulongvec_minmax, 1830 .proc_handler = proc_doulongvec_minmax,
1840 }, 1831 },
1832 {
1833 .procname = "mount-max",
1834 .data = &sysctl_mount_max,
1835 .maxlen = sizeof(unsigned int),
1836 .mode = 0644,
1837 .proc_handler = proc_dointvec_minmax,
1838 .extra1 = &one,
1839 },
1841 { } 1840 { }
1842}; 1841};
1843 1842
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index b3f05ee20d18..cbb387a265db 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -54,7 +54,11 @@ static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1
54 [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, 54 [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING },
55 [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; 55 [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },};
56 56
57static const struct nla_policy cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] = { 57/*
58 * We have to use TASKSTATS_CMD_ATTR_MAX here, it is the maxattr in the family.
59 * Make sure they are always aligned.
60 */
61static const struct nla_policy cgroupstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = {
58 [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 }, 62 [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 },
59}; 63};
60 64
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index c3aad685bbc0..12dd190634ab 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -542,7 +542,6 @@ static int alarm_clock_get(clockid_t which_clock, struct timespec *tp)
542static int alarm_timer_create(struct k_itimer *new_timer) 542static int alarm_timer_create(struct k_itimer *new_timer)
543{ 543{
544 enum alarmtimer_type type; 544 enum alarmtimer_type type;
545 struct alarm_base *base;
546 545
547 if (!alarmtimer_get_rtcdev()) 546 if (!alarmtimer_get_rtcdev())
548 return -ENOTSUPP; 547 return -ENOTSUPP;
@@ -551,7 +550,6 @@ static int alarm_timer_create(struct k_itimer *new_timer)
551 return -EPERM; 550 return -EPERM;
552 551
553 type = clock2alarm(new_timer->it_clock); 552 type = clock2alarm(new_timer->it_clock);
554 base = &alarm_bases[type];
555 alarm_init(&new_timer->it.alarm.alarmtimer, type, alarm_handle_timer); 553 alarm_init(&new_timer->it.alarm.alarmtimer, type, alarm_handle_timer);
556 return 0; 554 return 0;
557} 555}
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 6a5a310a1a53..7e4fad75acaa 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -600,9 +600,18 @@ static void __clocksource_select(bool skipcur)
600 */ 600 */
601 if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && oneshot) { 601 if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && oneshot) {
602 /* Override clocksource cannot be used. */ 602 /* Override clocksource cannot be used. */
603 pr_warn("Override clocksource %s is not HRT compatible - cannot switch while in HRT/NOHZ mode\n", 603 if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
604 cs->name); 604 pr_warn("Override clocksource %s is unstable and not HRT compatible - cannot switch while in HRT/NOHZ mode\n",
605 override_name[0] = 0; 605 cs->name);
606 override_name[0] = 0;
607 } else {
608 /*
609 * The override cannot currently be verified.
610 * Deferring to let the watchdog check.
611 */
612 pr_info("Override clocksource %s is not currently HRT compatible - deferring\n",
613 cs->name);
614 }
606 } else 615 } else
607 /* Override clocksource can be used. */ 616 /* Override clocksource can be used. */
608 best = cs; 617 best = cs;
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 9ba7c820fc23..bb5ec425dfe0 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -307,7 +307,7 @@ EXPORT_SYMBOL_GPL(__ktime_divns);
307 */ 307 */
308ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs) 308ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs)
309{ 309{
310 ktime_t res = ktime_add(lhs, rhs); 310 ktime_t res = ktime_add_unsafe(lhs, rhs);
311 311
312 /* 312 /*
313 * We use KTIME_SEC_MAX here, the maximum timeout which we can 313 * We use KTIME_SEC_MAX here, the maximum timeout which we can
@@ -703,7 +703,7 @@ static void clock_was_set_work(struct work_struct *work)
703static DECLARE_WORK(hrtimer_work, clock_was_set_work); 703static DECLARE_WORK(hrtimer_work, clock_was_set_work);
704 704
705/* 705/*
706 * Called from timekeeping and resume code to reprogramm the hrtimer 706 * Called from timekeeping and resume code to reprogram the hrtimer
707 * interrupt device on all cpus. 707 * interrupt device on all cpus.
708 */ 708 */
709void clock_was_set_delayed(void) 709void clock_was_set_delayed(void)
@@ -1241,7 +1241,7 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
1241 1241
1242 /* 1242 /*
1243 * Note: We clear the running state after enqueue_hrtimer and 1243 * Note: We clear the running state after enqueue_hrtimer and
1244 * we do not reprogramm the event hardware. Happens either in 1244 * we do not reprogram the event hardware. Happens either in
1245 * hrtimer_start_range_ns() or in hrtimer_interrupt() 1245 * hrtimer_start_range_ns() or in hrtimer_interrupt()
1246 * 1246 *
1247 * Note: Because we dropped the cpu_base->lock above, 1247 * Note: Because we dropped the cpu_base->lock above,
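The ktime_add_safe() hunk above now performs the raw addition through ktime_add_unsafe() and, as the surviving comment explains, clamps the result so an overflowed sum cannot look like a short timeout. A userspace sketch of that saturating-add idea; the cap is a stand-in for the kernel's KTIME_SEC_MAX-based limit and the inputs are assumed non-negative:

#include <stdint.h>
#include <stdio.h>

#define NS_MAX INT64_MAX	/* stand-in for the kernel's clamp value */

/* Add two non-negative nanosecond counts, saturating instead of wrapping. */
static int64_t ns_add_safe(int64_t lhs, int64_t rhs)
{
	uint64_t res = (uint64_t)lhs + (uint64_t)rhs;

	/* A sum above INT64_MAX means the signed addition would overflow. */
	if (res > (uint64_t)INT64_MAX)
		return NS_MAX;
	return (int64_t)res;
}

int main(void)
{
	printf("%lld\n", (long long)ns_add_safe(INT64_MAX - 5, 100));
	return 0;
}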
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 2ec7c00228f3..3bcb61b52f6c 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -186,10 +186,13 @@ static bool check_tick_dependency(atomic_t *dep)
186 return false; 186 return false;
187} 187}
188 188
189static bool can_stop_full_tick(struct tick_sched *ts) 189static bool can_stop_full_tick(int cpu, struct tick_sched *ts)
190{ 190{
191 WARN_ON_ONCE(!irqs_disabled()); 191 WARN_ON_ONCE(!irqs_disabled());
192 192
193 if (unlikely(!cpu_online(cpu)))
194 return false;
195
193 if (check_tick_dependency(&tick_dep_mask)) 196 if (check_tick_dependency(&tick_dep_mask))
194 return false; 197 return false;
195 198
@@ -843,7 +846,7 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts)
843 if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE) 846 if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE)
844 return; 847 return;
845 848
846 if (can_stop_full_tick(ts)) 849 if (can_stop_full_tick(cpu, ts))
847 tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); 850 tick_nohz_stop_sched_tick(ts, ktime_get(), cpu);
848 else if (ts->tick_stopped) 851 else if (ts->tick_stopped)
849 tick_nohz_restart_sched_tick(ts, ktime_get()); 852 tick_nohz_restart_sched_tick(ts, ktime_get());
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 667b9335f5d6..bd62fb8e8e77 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -780,7 +780,7 @@ struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
780{ 780{
781 struct timespec64 res; 781 struct timespec64 res;
782 782
783 set_normalized_timespec64(&res, lhs.tv_sec + rhs.tv_sec, 783 set_normalized_timespec64(&res, (timeu64_t) lhs.tv_sec + rhs.tv_sec,
784 lhs.tv_nsec + rhs.tv_nsec); 784 lhs.tv_nsec + rhs.tv_nsec);
785 785
786 if (unlikely(res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec)) { 786 if (unlikely(res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec)) {
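The cast above widens tv_sec before the addition so the sum is computed without signed overflow, which keeps the existing "result smaller than an operand" test meaningful. A small standalone version of that detection trick, assuming non-negative inputs as in the kernel's usage:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Add two non-negative second counts via unsigned arithmetic; a wrapped
 * result shows up as a sum that compares smaller than one of its operands.
 */
static bool add_secs_overflowed(int64_t a, int64_t b, int64_t *sum)
{
	uint64_t res = (uint64_t)a + (uint64_t)b;

	*sum = (int64_t)res;
	return *sum < a || *sum < b;
}

int main(void)
{
	int64_t sum;

	if (add_secs_overflowed(INT64_MAX, 1, &sum))
		printf("overflow detected; caller would saturate the result\n");
	return 0;
}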
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index e07fb093f819..37dec7e3db43 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -403,8 +403,11 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
403 tkr = tkf->base + (seq & 0x01); 403 tkr = tkf->base + (seq & 0x01);
404 now = ktime_to_ns(tkr->base); 404 now = ktime_to_ns(tkr->base);
405 405
406 now += clocksource_delta(tkr->read(tkr->clock), 406 now += timekeeping_delta_to_ns(tkr,
407 tkr->cycle_last, tkr->mask); 407 clocksource_delta(
408 tkr->read(tkr->clock),
409 tkr->cycle_last,
410 tkr->mask));
408 } while (read_seqcount_retry(&tkf->seq, seq)); 411 } while (read_seqcount_retry(&tkf->seq, seq));
409 412
410 return now; 413 return now;
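The fast-path reader above keeps its seqcount shape: sample the sequence, read base plus delta, and retry if a writer bumped the counter meanwhile; only the delta-to-ns conversion changed. Below is a userspace analogue of that retry loop using C11 atomics in place of the kernel's seqcount API; the struct layout is invented for illustration and a real concurrent version would need stronger fencing around the plain data fields:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct fast_tk {
	atomic_uint seq;	/* even = stable, odd = update in progress */
	uint64_t base_ns;
	uint64_t delta_ns;
};

static uint64_t fast_read_ns(struct fast_tk *tk)
{
	unsigned int seq;
	uint64_t ns;

	do {
		/* Spin until the sequence is even, i.e. no writer is active. */
		do {
			seq = atomic_load_explicit(&tk->seq, memory_order_acquire);
		} while (seq & 1);

		ns = tk->base_ns + tk->delta_ns;

		/* Retry if the sequence moved while we were reading. */
	} while (atomic_load_explicit(&tk->seq, memory_order_acquire) != seq);

	return ns;
}

static void write_update(struct fast_tk *tk, uint64_t base, uint64_t delta)
{
	atomic_fetch_add_explicit(&tk->seq, 1, memory_order_release);	/* odd */
	tk->base_ns = base;
	tk->delta_ns = delta;
	atomic_fetch_add_explicit(&tk->seq, 1, memory_order_release);	/* even */
}

int main(void)
{
	struct fast_tk tk = { .seq = 0, .base_ns = 1000, .delta_ns = 5 };

	write_update(&tk, 2000, 7);
	printf("%llu\n", (unsigned long long)fast_read_ns(&tk));
	return 0;
}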
diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c
index 107310a6f36f..ca9fb800336b 100644
--- a/kernel/time/timekeeping_debug.c
+++ b/kernel/time/timekeeping_debug.c
@@ -75,5 +75,7 @@ void tk_debug_account_sleep_time(struct timespec64 *t)
75 int bin = min(fls(t->tv_sec), NUM_BINS-1); 75 int bin = min(fls(t->tv_sec), NUM_BINS-1);
76 76
77 sleep_time_bin[bin]++; 77 sleep_time_bin[bin]++;
78 pr_info("Suspended for %lld.%03lu seconds\n", (s64)t->tv_sec,
79 t->tv_nsec / NSEC_PER_MSEC);
78} 80}
79 81
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 32bf6f75a8fe..c611c47de884 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -878,7 +878,7 @@ static inline struct timer_base *get_timer_base(u32 tflags)
878 878
879#ifdef CONFIG_NO_HZ_COMMON 879#ifdef CONFIG_NO_HZ_COMMON
880static inline struct timer_base * 880static inline struct timer_base *
881__get_target_base(struct timer_base *base, unsigned tflags) 881get_target_base(struct timer_base *base, unsigned tflags)
882{ 882{
883#ifdef CONFIG_SMP 883#ifdef CONFIG_SMP
884 if ((tflags & TIMER_PINNED) || !base->migration_enabled) 884 if ((tflags & TIMER_PINNED) || !base->migration_enabled)
@@ -891,25 +891,27 @@ __get_target_base(struct timer_base *base, unsigned tflags)
891 891
892static inline void forward_timer_base(struct timer_base *base) 892static inline void forward_timer_base(struct timer_base *base)
893{ 893{
894 unsigned long jnow = READ_ONCE(jiffies);
895
894 /* 896 /*
895 * We only forward the base when it's idle and we have a delta between 897 * We only forward the base when it's idle and we have a delta between
896 * base clock and jiffies. 898 * base clock and jiffies.
897 */ 899 */
898 if (!base->is_idle || (long) (jiffies - base->clk) < 2) 900 if (!base->is_idle || (long) (jnow - base->clk) < 2)
899 return; 901 return;
900 902
901 /* 903 /*
902 * If the next expiry value is > jiffies, then we fast forward to 904 * If the next expiry value is > jiffies, then we fast forward to
903 * jiffies otherwise we forward to the next expiry value. 905 * jiffies otherwise we forward to the next expiry value.
904 */ 906 */
905 if (time_after(base->next_expiry, jiffies)) 907 if (time_after(base->next_expiry, jnow))
906 base->clk = jiffies; 908 base->clk = jnow;
907 else 909 else
908 base->clk = base->next_expiry; 910 base->clk = base->next_expiry;
909} 911}
910#else 912#else
911static inline struct timer_base * 913static inline struct timer_base *
912__get_target_base(struct timer_base *base, unsigned tflags) 914get_target_base(struct timer_base *base, unsigned tflags)
913{ 915{
914 return get_timer_this_cpu_base(tflags); 916 return get_timer_this_cpu_base(tflags);
915} 917}
@@ -917,14 +919,6 @@ __get_target_base(struct timer_base *base, unsigned tflags)
917static inline void forward_timer_base(struct timer_base *base) { } 919static inline void forward_timer_base(struct timer_base *base) { }
918#endif 920#endif
919 921
920static inline struct timer_base *
921get_target_base(struct timer_base *base, unsigned tflags)
922{
923 struct timer_base *target = __get_target_base(base, tflags);
924
925 forward_timer_base(target);
926 return target;
927}
928 922
929/* 923/*
930 * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means 924 * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means
@@ -943,7 +937,14 @@ static struct timer_base *lock_timer_base(struct timer_list *timer,
943{ 937{
944 for (;;) { 938 for (;;) {
945 struct timer_base *base; 939 struct timer_base *base;
946 u32 tf = timer->flags; 940 u32 tf;
941
942 /*
943 * We need to use READ_ONCE() here, otherwise the compiler
944 * might re-read @tf between the check for TIMER_MIGRATING
945 * and spin_lock().
946 */
947 tf = READ_ONCE(timer->flags);
947 948
948 if (!(tf & TIMER_MIGRATING)) { 949 if (!(tf & TIMER_MIGRATING)) {
949 base = get_timer_base(tf); 950 base = get_timer_base(tf);
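The new comment above spells out why the snapshot matters: the TIMER_MIGRATING test and the later get_timer_base(tf) call must operate on the same value, and without READ_ONCE() the compiler is free to reload timer->flags in between. A userspace sketch of the snapshot-then-check pattern, with a relaxed C11 atomic load standing in for READ_ONCE():

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define TIMER_MIGRATING	0x1u	/* stand-in flag bit */

struct demo_timer {
	_Atomic uint32_t flags;	/* updated concurrently in the real code */
};

static int try_use(struct demo_timer *t)
{
	/*
	 * One load feeds both the test and the follow-up use; re-reading
	 * t->flags after the test could observe a concurrently set flag.
	 */
	uint32_t tf = atomic_load_explicit(&t->flags, memory_order_relaxed);

	if (!(tf & TIMER_MIGRATING)) {
		printf("using stable flags snapshot 0x%x\n", tf);
		return 0;
	}
	return -1;	/* migrating; the caller loops and retries */
}

int main(void)
{
	struct demo_timer t = { .flags = 0 };

	return try_use(&t);
}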
@@ -964,6 +965,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
964 unsigned long clk = 0, flags; 965 unsigned long clk = 0, flags;
965 int ret = 0; 966 int ret = 0;
966 967
968 BUG_ON(!timer->function);
969
967 /* 970 /*
968 * This is a common optimization triggered by the networking code - if 971 * This is a common optimization triggered by the networking code - if
969 * the timer is re-modified to have the same timeout or ends up in the 972 * the timer is re-modified to have the same timeout or ends up in the
@@ -972,13 +975,16 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
972 if (timer_pending(timer)) { 975 if (timer_pending(timer)) {
973 if (timer->expires == expires) 976 if (timer->expires == expires)
974 return 1; 977 return 1;
978
975 /* 979 /*
976 * Take the current timer_jiffies of base, but without holding 980 * We lock timer base and calculate the bucket index right
977 * the lock! 981 * here. If the timer ends up in the same bucket, then we
982 * just update the expiry time and avoid the whole
983 * dequeue/enqueue dance.
978 */ 984 */
979 base = get_timer_base(timer->flags); 985 base = lock_timer_base(timer, &flags);
980 clk = base->clk;
981 986
987 clk = base->clk;
982 idx = calc_wheel_index(expires, clk); 988 idx = calc_wheel_index(expires, clk);
983 989
984 /* 990 /*
@@ -988,14 +994,14 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
988 */ 994 */
989 if (idx == timer_get_idx(timer)) { 995 if (idx == timer_get_idx(timer)) {
990 timer->expires = expires; 996 timer->expires = expires;
991 return 1; 997 ret = 1;
998 goto out_unlock;
992 } 999 }
1000 } else {
1001 base = lock_timer_base(timer, &flags);
993 } 1002 }
994 1003
995 timer_stats_timer_set_start_info(timer); 1004 timer_stats_timer_set_start_info(timer);
996 BUG_ON(!timer->function);
997
998 base = lock_timer_base(timer, &flags);
999 1005
1000 ret = detach_if_pending(timer, base, false); 1006 ret = detach_if_pending(timer, base, false);
1001 if (!ret && pending_only) 1007 if (!ret && pending_only)
@@ -1025,12 +1031,16 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
1025 } 1031 }
1026 } 1032 }
1027 1033
1034 /* Try to forward a stale timer base clock */
1035 forward_timer_base(base);
1036
1028 timer->expires = expires; 1037 timer->expires = expires;
1029 /* 1038 /*
1030 * If 'idx' was calculated above and the base time did not advance 1039 * If 'idx' was calculated above and the base time did not advance
1031 * between calculating 'idx' and taking the lock, only enqueue_timer() 1040 * between calculating 'idx' and possibly switching the base, only
1032 * and trigger_dyntick_cpu() is required. Otherwise we need to 1041 * enqueue_timer() and trigger_dyntick_cpu() is required. Otherwise
1033 * (re)calculate the wheel index via internal_add_timer(). 1042 * we need to (re)calculate the wheel index via
1043 * internal_add_timer().
1034 */ 1044 */
1035 if (idx != UINT_MAX && clk == base->clk) { 1045 if (idx != UINT_MAX && clk == base->clk) {
1036 enqueue_timer(base, timer, idx); 1046 enqueue_timer(base, timer, idx);
@@ -1510,12 +1520,16 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
1510 is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA); 1520 is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA);
1511 base->next_expiry = nextevt; 1521 base->next_expiry = nextevt;
1512 /* 1522 /*
1513 * We have a fresh next event. Check whether we can forward the base: 1523 * We have a fresh next event. Check whether we can forward the
1524 * base. We can only do that when @basej is past base->clk
1525 * otherwise we might rewind base->clk.
1514 */ 1526 */
1515 if (time_after(nextevt, jiffies)) 1527 if (time_after(basej, base->clk)) {
1516 base->clk = jiffies; 1528 if (time_after(nextevt, basej))
1517 else if (time_after(nextevt, base->clk)) 1529 base->clk = basej;
1518 base->clk = nextevt; 1530 else if (time_after(nextevt, base->clk))
1531 base->clk = nextevt;
1532 }
1519 1533
1520 if (time_before_eq(nextevt, basej)) { 1534 if (time_before_eq(nextevt, basej)) {
1521 expires = basem; 1535 expires = basem;
@@ -1633,7 +1647,7 @@ static inline void __run_timers(struct timer_base *base)
1633/* 1647/*
1634 * This function runs timers and the timer-tq in bottom half context. 1648 * This function runs timers and the timer-tq in bottom half context.
1635 */ 1649 */
1636static void run_timer_softirq(struct softirq_action *h) 1650static __latent_entropy void run_timer_softirq(struct softirq_action *h)
1637{ 1651{
1638 struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); 1652 struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
1639 1653
diff --git a/kernel/torture.c b/kernel/torture.c
index 75961b3decfe..0d887eb62856 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -43,6 +43,7 @@
43#include <linux/stat.h> 43#include <linux/stat.h>
44#include <linux/slab.h> 44#include <linux/slab.h>
45#include <linux/trace_clock.h> 45#include <linux/trace_clock.h>
46#include <linux/ktime.h>
46#include <asm/byteorder.h> 47#include <asm/byteorder.h>
47#include <linux/torture.h> 48#include <linux/torture.h>
48 49
@@ -446,9 +447,8 @@ EXPORT_SYMBOL_GPL(torture_shuffle_cleanup);
446 * Variables for auto-shutdown. This allows "lights out" torture runs 447 * Variables for auto-shutdown. This allows "lights out" torture runs
447 * to be fully scripted. 448 * to be fully scripted.
448 */ 449 */
449static int shutdown_secs; /* desired test duration in seconds. */
450static struct task_struct *shutdown_task; 450static struct task_struct *shutdown_task;
451static unsigned long shutdown_time; /* jiffies to system shutdown. */ 451static ktime_t shutdown_time; /* time to system shutdown. */
452static void (*torture_shutdown_hook)(void); 452static void (*torture_shutdown_hook)(void);
453 453
454/* 454/*
@@ -471,20 +471,20 @@ EXPORT_SYMBOL_GPL(torture_shutdown_absorb);
471 */ 471 */
472static int torture_shutdown(void *arg) 472static int torture_shutdown(void *arg)
473{ 473{
474 long delta; 474 ktime_t ktime_snap;
475 unsigned long jiffies_snap;
476 475
477 VERBOSE_TOROUT_STRING("torture_shutdown task started"); 476 VERBOSE_TOROUT_STRING("torture_shutdown task started");
478 jiffies_snap = jiffies; 477 ktime_snap = ktime_get();
479 while (ULONG_CMP_LT(jiffies_snap, shutdown_time) && 478 while (ktime_before(ktime_snap, shutdown_time) &&
480 !torture_must_stop()) { 479 !torture_must_stop()) {
481 delta = shutdown_time - jiffies_snap;
482 if (verbose) 480 if (verbose)
483 pr_alert("%s" TORTURE_FLAG 481 pr_alert("%s" TORTURE_FLAG
484 "torture_shutdown task: %lu jiffies remaining\n", 482 "torture_shutdown task: %llu ms remaining\n",
485 torture_type, delta); 483 torture_type,
486 schedule_timeout_interruptible(delta); 484 ktime_ms_delta(shutdown_time, ktime_snap));
487 jiffies_snap = jiffies; 485 set_current_state(TASK_INTERRUPTIBLE);
486 schedule_hrtimeout(&shutdown_time, HRTIMER_MODE_ABS);
487 ktime_snap = ktime_get();
488 } 488 }
489 if (torture_must_stop()) { 489 if (torture_must_stop()) {
490 torture_kthread_stopping("torture_shutdown"); 490 torture_kthread_stopping("torture_shutdown");
@@ -511,10 +511,9 @@ int torture_shutdown_init(int ssecs, void (*cleanup)(void))
511{ 511{
512 int ret = 0; 512 int ret = 0;
513 513
514 shutdown_secs = ssecs;
515 torture_shutdown_hook = cleanup; 514 torture_shutdown_hook = cleanup;
516 if (shutdown_secs > 0) { 515 if (ssecs > 0) {
517 shutdown_time = jiffies + shutdown_secs * HZ; 516 shutdown_time = ktime_add(ktime_get(), ktime_set(ssecs, 0));
518 ret = torture_create_kthread(torture_shutdown, NULL, 517 ret = torture_create_kthread(torture_shutdown, NULL,
519 shutdown_task); 518 shutdown_task);
520 } 519 }
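Storing shutdown_time as a ktime_t lets the kthread above sleep against an absolute deadline via schedule_hrtimeout(&shutdown_time, HRTIMER_MODE_ABS) instead of recomputing a relative jiffies delta on every pass. The closest userspace counterpart is clock_nanosleep() with TIMER_ABSTIME, sketched here:

#define _POSIX_C_SOURCE 200809L
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec deadline;

	/* Absolute deadline: now plus two seconds, like shutdown_time above. */
	clock_gettime(CLOCK_MONOTONIC, &deadline);
	deadline.tv_sec += 2;

	/* Interrupted sleeps simply re-arm against the same deadline. */
	while (clock_nanosleep(CLOCK_MONOTONIC, TIMER_ABSTIME, &deadline, NULL))
		;

	printf("deadline reached\n");
	return 0;
}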
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index f4b86e8ca1e7..2a96b063d659 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -24,11 +24,6 @@ config HAVE_FUNCTION_GRAPH_TRACER
24 help 24 help
25 See Documentation/trace/ftrace-design.txt 25 See Documentation/trace/ftrace-design.txt
26 26
27config HAVE_FUNCTION_GRAPH_FP_TEST
28 bool
29 help
30 See Documentation/trace/ftrace-design.txt
31
32config HAVE_DYNAMIC_FTRACE 27config HAVE_DYNAMIC_FTRACE
33 bool 28 bool
34 help 29 help
@@ -221,6 +216,41 @@ config SCHED_TRACER
221 This tracer tracks the latency of the highest priority task 216 This tracer tracks the latency of the highest priority task
222 to be scheduled in, starting from the point it has woken up. 217 to be scheduled in, starting from the point it has woken up.
223 218
219config HWLAT_TRACER
220 bool "Tracer to detect hardware latencies (like SMIs)"
221 select GENERIC_TRACER
222 help
223 This tracer, when enabled, will create one or more kernel threads,
224 depending on what the cpumask file is set to, with each thread
225 spinning in a loop looking for interruptions caused by
226 something other than the kernel. For example, if a
227 System Management Interrupt (SMI) takes a noticeable amount of
228 time, this tracer will detect it. This is useful for testing
229 if a system is reliable for Real Time tasks.
230
231 Some files are created in the tracing directory when this
232 is enabled:
233
234 hwlat_detector/width - time in usecs for how long to spin
235 hwlat_detector/window - time in usecs between the start of each
236 iteration
237
238 A kernel thread is created that will spin with interrupts disabled
239 for "width" microseconds in every "window" cycle. It will not spin
240 for "window - width" microseconds, during which the system can
241 continue to operate.
242
243 The output will appear in the trace and trace_pipe files.
244
245 When the tracer is not running, it has no effect on the system,
246 but when it is running, it can cause the system to be
247 periodically non-responsive. Do not run this tracer on a
248 production system.
249
250 To enable this tracer, echo "hwlat" into the current_tracer
251 file. Every time a latency is greater than tracing_thresh, it will
252 be recorded into the ring buffer.
253
224config ENABLE_DEFAULT_TRACERS 254config ENABLE_DEFAULT_TRACERS
225 bool "Trace process context switches and events" 255 bool "Trace process context switches and events"
226 depends on !GENERIC_TRACER 256 depends on !GENERIC_TRACER
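As the help text above says, the tracer is driven through tracefs: write "hwlat" to current_tracer and, if needed, tune hwlat_detector/width and hwlat_detector/window (both in microseconds). A minimal C sketch of that sequence; the tracefs mount point differs between systems (/sys/kernel/tracing vs. /sys/kernel/debug/tracing), so the path below is an assumption and root privileges are required:

#include <stdio.h>

#define TRACEFS "/sys/kernel/debug/tracing"	/* assumed mount point */

static int write_str(const char *file, const char *val)
{
	char path[256];
	FILE *f;

	snprintf(path, sizeof(path), "%s/%s", TRACEFS, file);
	f = fopen(path, "w");
	if (!f) {
		perror(path);
		return -1;
	}
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	/* Spin 0.5s of every 1s window, matching the tracer's built-in defaults. */
	write_str("hwlat_detector/width", "500000");
	write_str("hwlat_detector/window", "1000000");
	return write_str("current_tracer", "hwlat") ? 1 : 0;
}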
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index d0a1617b52b4..e57980845549 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -1,8 +1,4 @@
1 1
2# We are fully aware of the dangers of __builtin_return_address()
3FRAME_CFLAGS := $(call cc-disable-warning,frame-address)
4KBUILD_CFLAGS += $(FRAME_CFLAGS)
5
6# Do not instrument the tracer itself: 2# Do not instrument the tracer itself:
7 3
8ifdef CONFIG_FUNCTION_TRACER 4ifdef CONFIG_FUNCTION_TRACER
@@ -41,6 +37,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
41obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o 37obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
42obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o 38obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
43obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o 39obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
40obj-$(CONFIG_HWLAT_TRACER) += trace_hwlat.o
44obj-$(CONFIG_NOP_TRACER) += trace_nop.o 41obj-$(CONFIG_NOP_TRACER) += trace_nop.o
45obj-$(CONFIG_STACK_TRACER) += trace_stack.o 42obj-$(CONFIG_STACK_TRACER) += trace_stack.o
46obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o 43obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index b20438fdb029..5dcb99281259 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1,4 +1,5 @@
1/* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com 1/* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com
2 * Copyright (c) 2016 Facebook
2 * 3 *
3 * This program is free software; you can redistribute it and/or 4 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public 5 * modify it under the terms of version 2 of the GNU General Public
@@ -8,6 +9,7 @@
8#include <linux/types.h> 9#include <linux/types.h>
9#include <linux/slab.h> 10#include <linux/slab.h>
10#include <linux/bpf.h> 11#include <linux/bpf.h>
12#include <linux/bpf_perf_event.h>
11#include <linux/filter.h> 13#include <linux/filter.h>
12#include <linux/uaccess.h> 14#include <linux/uaccess.h>
13#include <linux/ctype.h> 15#include <linux/ctype.h>
@@ -59,11 +61,9 @@ unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
59} 61}
60EXPORT_SYMBOL_GPL(trace_call_bpf); 62EXPORT_SYMBOL_GPL(trace_call_bpf);
61 63
62static u64 bpf_probe_read(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) 64BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr)
63{ 65{
64 void *dst = (void *) (long) r1; 66 int ret;
65 int ret, size = (int) r2;
66 void *unsafe_ptr = (void *) (long) r3;
67 67
68 ret = probe_kernel_read(dst, unsafe_ptr, size); 68 ret = probe_kernel_read(dst, unsafe_ptr, size);
69 if (unlikely(ret < 0)) 69 if (unlikely(ret < 0))
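Every helper in this file is converted from a raw (u64 r1, ..., u64 r5) signature to the BPF_CALL_n() form, which keeps one typed argument list and generates the register-unpacking wrapper. The toy macro below imitates that idea in plain C so the shape is visible; it is my own simplified stand-in, not the kernel's actual BPF_CALL_n definition:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef uint64_t u64;
typedef uint32_t u32;

/* Toy 3-argument version: typed body plus a u64-register wrapper. */
#define DEMO_CALL_3(name, t1, a1, t2, a2, t3, a3)			\
	static u64 ____##name(t1 a1, t2 a2, t3 a3);			\
	static u64 name(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)	\
	{								\
		(void)r4; (void)r5;					\
		return ____##name((t1)(long)r1, (t2)(long)r2,		\
				  (t3)(long)r3);			\
	}								\
	static u64 ____##name(t1 a1, t2 a2, t3 a3)

DEMO_CALL_3(demo_probe_read, void *, dst, u32, size, const void *, src)
{
	memcpy(dst, src, size);	/* the real helper calls probe_kernel_read() */
	return 0;
}

int main(void)
{
	char src[] = "typed args", dst[16] = "";

	/* Callers still see the five-register convention. */
	demo_probe_read((u64)(long)dst, sizeof(src), (u64)(long)src, 0, 0);
	printf("%s\n", dst);
	return 0;
}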
@@ -81,12 +81,9 @@ static const struct bpf_func_proto bpf_probe_read_proto = {
81 .arg3_type = ARG_ANYTHING, 81 .arg3_type = ARG_ANYTHING,
82}; 82};
83 83
84static u64 bpf_probe_write_user(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) 84BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src,
85 u32, size)
85{ 86{
86 void *unsafe_ptr = (void *) (long) r1;
87 void *src = (void *) (long) r2;
88 int size = (int) r3;
89
90 /* 87 /*
91 * Ensure we're in user context which is safe for the helper to 88 * Ensure we're in user context which is safe for the helper to
92 * run. This helper has no business in a kthread. 89 * run. This helper has no business in a kthread.
@@ -128,9 +125,9 @@ static const struct bpf_func_proto *bpf_get_probe_write_proto(void)
128 * limited trace_printk() 125 * limited trace_printk()
129 * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed 126 * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed
130 */ 127 */
131static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) 128BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
129 u64, arg2, u64, arg3)
132{ 130{
133 char *fmt = (char *) (long) r1;
134 bool str_seen = false; 131 bool str_seen = false;
135 int mod[3] = {}; 132 int mod[3] = {};
136 int fmt_cnt = 0; 133 int fmt_cnt = 0;
@@ -176,16 +173,16 @@ static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5)
176 173
177 switch (fmt_cnt) { 174 switch (fmt_cnt) {
178 case 1: 175 case 1:
179 unsafe_addr = r3; 176 unsafe_addr = arg1;
180 r3 = (long) buf; 177 arg1 = (long) buf;
181 break; 178 break;
182 case 2: 179 case 2:
183 unsafe_addr = r4; 180 unsafe_addr = arg2;
184 r4 = (long) buf; 181 arg2 = (long) buf;
185 break; 182 break;
186 case 3: 183 case 3:
187 unsafe_addr = r5; 184 unsafe_addr = arg3;
188 r5 = (long) buf; 185 arg3 = (long) buf;
189 break; 186 break;
190 } 187 }
191 buf[0] = 0; 188 buf[0] = 0;
@@ -207,9 +204,9 @@ static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5)
207 } 204 }
208 205
209 return __trace_printk(1/* fake ip will not be printed */, fmt, 206 return __trace_printk(1/* fake ip will not be printed */, fmt,
210 mod[0] == 2 ? r3 : mod[0] == 1 ? (long) r3 : (u32) r3, 207 mod[0] == 2 ? arg1 : mod[0] == 1 ? (long) arg1 : (u32) arg1,
211 mod[1] == 2 ? r4 : mod[1] == 1 ? (long) r4 : (u32) r4, 208 mod[1] == 2 ? arg2 : mod[1] == 1 ? (long) arg2 : (u32) arg2,
212 mod[2] == 2 ? r5 : mod[2] == 1 ? (long) r5 : (u32) r5); 209 mod[2] == 2 ? arg3 : mod[2] == 1 ? (long) arg3 : (u32) arg3);
213} 210}
214 211
215static const struct bpf_func_proto bpf_trace_printk_proto = { 212static const struct bpf_func_proto bpf_trace_printk_proto = {
@@ -231,9 +228,8 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
231 return &bpf_trace_printk_proto; 228 return &bpf_trace_printk_proto;
232} 229}
233 230
234static u64 bpf_perf_event_read(u64 r1, u64 flags, u64 r3, u64 r4, u64 r5) 231BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
235{ 232{
236 struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
237 struct bpf_array *array = container_of(map, struct bpf_array, map); 233 struct bpf_array *array = container_of(map, struct bpf_array, map);
238 unsigned int cpu = smp_processor_id(); 234 unsigned int cpu = smp_processor_id();
239 u64 index = flags & BPF_F_INDEX_MASK; 235 u64 index = flags & BPF_F_INDEX_MASK;
@@ -310,11 +306,9 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
310 return 0; 306 return 0;
311} 307}
312 308
313static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) 309BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
310 u64, flags, void *, data, u64, size)
314{ 311{
315 struct pt_regs *regs = (struct pt_regs *)(long) r1;
316 struct bpf_map *map = (struct bpf_map *)(long) r2;
317 void *data = (void *)(long) r4;
318 struct perf_raw_record raw = { 312 struct perf_raw_record raw = {
319 .frag = { 313 .frag = {
320 .size = size, 314 .size = size,
@@ -365,7 +359,7 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
365 return __bpf_perf_event_output(regs, map, flags, &raw); 359 return __bpf_perf_event_output(regs, map, flags, &raw);
366} 360}
367 361
368static u64 bpf_get_current_task(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) 362BPF_CALL_0(bpf_get_current_task)
369{ 363{
370 return (long) current; 364 return (long) current;
371} 365}
@@ -376,6 +370,31 @@ static const struct bpf_func_proto bpf_get_current_task_proto = {
376 .ret_type = RET_INTEGER, 370 .ret_type = RET_INTEGER,
377}; 371};
378 372
373BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx)
374{
375 struct bpf_array *array = container_of(map, struct bpf_array, map);
376 struct cgroup *cgrp;
377
378 if (unlikely(in_interrupt()))
379 return -EINVAL;
380 if (unlikely(idx >= array->map.max_entries))
381 return -E2BIG;
382
383 cgrp = READ_ONCE(array->ptrs[idx]);
384 if (unlikely(!cgrp))
385 return -EAGAIN;
386
387 return task_under_cgroup_hierarchy(current, cgrp);
388}
389
390static const struct bpf_func_proto bpf_current_task_under_cgroup_proto = {
391 .func = bpf_current_task_under_cgroup,
392 .gpl_only = false,
393 .ret_type = RET_INTEGER,
394 .arg1_type = ARG_CONST_MAP_PTR,
395 .arg2_type = ARG_ANYTHING,
396};
397
379static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) 398static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
380{ 399{
381 switch (func_id) { 400 switch (func_id) {
@@ -407,6 +426,10 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
407 return &bpf_perf_event_read_proto; 426 return &bpf_perf_event_read_proto;
408 case BPF_FUNC_probe_write_user: 427 case BPF_FUNC_probe_write_user:
409 return bpf_get_probe_write_proto(); 428 return bpf_get_probe_write_proto();
429 case BPF_FUNC_current_task_under_cgroup:
430 return &bpf_current_task_under_cgroup_proto;
431 case BPF_FUNC_get_prandom_u32:
432 return &bpf_get_prandom_u32_proto;
410 default: 433 default:
411 return NULL; 434 return NULL;
412 } 435 }
@@ -447,16 +470,17 @@ static struct bpf_prog_type_list kprobe_tl = {
447 .type = BPF_PROG_TYPE_KPROBE, 470 .type = BPF_PROG_TYPE_KPROBE,
448}; 471};
449 472
450static u64 bpf_perf_event_output_tp(u64 r1, u64 r2, u64 index, u64 r4, u64 size) 473BPF_CALL_5(bpf_perf_event_output_tp, void *, tp_buff, struct bpf_map *, map,
474 u64, flags, void *, data, u64, size)
451{ 475{
476 struct pt_regs *regs = *(struct pt_regs **)tp_buff;
477
452 /* 478 /*
453 * r1 points to perf tracepoint buffer where first 8 bytes are hidden 479 * r1 points to perf tracepoint buffer where first 8 bytes are hidden
454 * from bpf program and contain a pointer to 'struct pt_regs'. Fetch it 480 * from bpf program and contain a pointer to 'struct pt_regs'. Fetch it
455 * from there and call the same bpf_perf_event_output() helper 481 * from there and call the same bpf_perf_event_output() helper inline.
456 */ 482 */
457 u64 ctx = *(long *)(uintptr_t)r1; 483 return ____bpf_perf_event_output(regs, map, flags, data, size);
458
459 return bpf_perf_event_output(ctx, r2, index, r4, size);
460} 484}
461 485
462static const struct bpf_func_proto bpf_perf_event_output_proto_tp = { 486static const struct bpf_func_proto bpf_perf_event_output_proto_tp = {
@@ -470,11 +494,18 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_tp = {
470 .arg5_type = ARG_CONST_STACK_SIZE, 494 .arg5_type = ARG_CONST_STACK_SIZE,
471}; 495};
472 496
473static u64 bpf_get_stackid_tp(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) 497BPF_CALL_3(bpf_get_stackid_tp, void *, tp_buff, struct bpf_map *, map,
498 u64, flags)
474{ 499{
475 u64 ctx = *(long *)(uintptr_t)r1; 500 struct pt_regs *regs = *(struct pt_regs **)tp_buff;
476 501
477 return bpf_get_stackid(ctx, r2, r3, r4, r5); 502 /*
503 * Same comment as in bpf_perf_event_output_tp(), only that this time
504 * the other helper's function body cannot be inlined due to being
505 * external, thus we need to call the raw helper function.
506 */
507 return bpf_get_stackid((unsigned long) regs, (unsigned long) map,
508 flags, 0, 0);
478} 509}
479 510
480static const struct bpf_func_proto bpf_get_stackid_proto_tp = { 511static const struct bpf_func_proto bpf_get_stackid_proto_tp = {
@@ -520,10 +551,69 @@ static struct bpf_prog_type_list tracepoint_tl = {
520 .type = BPF_PROG_TYPE_TRACEPOINT, 551 .type = BPF_PROG_TYPE_TRACEPOINT,
521}; 552};
522 553
554static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
555 enum bpf_reg_type *reg_type)
556{
557 if (off < 0 || off >= sizeof(struct bpf_perf_event_data))
558 return false;
559 if (type != BPF_READ)
560 return false;
561 if (off % size != 0)
562 return false;
563 if (off == offsetof(struct bpf_perf_event_data, sample_period)) {
564 if (size != sizeof(u64))
565 return false;
566 } else {
567 if (size != sizeof(long))
568 return false;
569 }
570 return true;
571}
572
573static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, int dst_reg,
574 int src_reg, int ctx_off,
575 struct bpf_insn *insn_buf,
576 struct bpf_prog *prog)
577{
578 struct bpf_insn *insn = insn_buf;
579
580 switch (ctx_off) {
581 case offsetof(struct bpf_perf_event_data, sample_period):
582 BUILD_BUG_ON(FIELD_SIZEOF(struct perf_sample_data, period) != sizeof(u64));
583
584 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
585 data), dst_reg, src_reg,
586 offsetof(struct bpf_perf_event_data_kern, data));
587 *insn++ = BPF_LDX_MEM(BPF_DW, dst_reg, dst_reg,
588 offsetof(struct perf_sample_data, period));
589 break;
590 default:
591 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
592 regs), dst_reg, src_reg,
593 offsetof(struct bpf_perf_event_data_kern, regs));
594 *insn++ = BPF_LDX_MEM(BPF_SIZEOF(long), dst_reg, dst_reg, ctx_off);
595 break;
596 }
597
598 return insn - insn_buf;
599}
600
601static const struct bpf_verifier_ops perf_event_prog_ops = {
602 .get_func_proto = tp_prog_func_proto,
603 .is_valid_access = pe_prog_is_valid_access,
604 .convert_ctx_access = pe_prog_convert_ctx_access,
605};
606
607static struct bpf_prog_type_list perf_event_tl = {
608 .ops = &perf_event_prog_ops,
609 .type = BPF_PROG_TYPE_PERF_EVENT,
610};
611
523static int __init register_kprobe_prog_ops(void) 612static int __init register_kprobe_prog_ops(void)
524{ 613{
525 bpf_register_prog_type(&kprobe_tl); 614 bpf_register_prog_type(&kprobe_tl);
526 bpf_register_prog_type(&tracepoint_tl); 615 bpf_register_prog_type(&tracepoint_tl);
616 bpf_register_prog_type(&perf_event_tl);
527 return 0; 617 return 0;
528} 618}
529late_initcall(register_kprobe_prog_ops); 619late_initcall(register_kprobe_prog_ops);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 84752c8e28b5..da87b3cba5b3 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -872,7 +872,13 @@ function_profile_call(unsigned long ip, unsigned long parent_ip,
872#ifdef CONFIG_FUNCTION_GRAPH_TRACER 872#ifdef CONFIG_FUNCTION_GRAPH_TRACER
873static int profile_graph_entry(struct ftrace_graph_ent *trace) 873static int profile_graph_entry(struct ftrace_graph_ent *trace)
874{ 874{
875 int index = trace->depth;
876
875 function_profile_call(trace->func, 0, NULL, NULL); 877 function_profile_call(trace->func, 0, NULL, NULL);
878
879 if (index >= 0 && index < FTRACE_RETFUNC_DEPTH)
880 current->ret_stack[index].subtime = 0;
881
876 return 1; 882 return 1;
877} 883}
878 884
@@ -1856,6 +1862,10 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops,
1856 1862
1857 /* Update rec->flags */ 1863 /* Update rec->flags */
1858 do_for_each_ftrace_rec(pg, rec) { 1864 do_for_each_ftrace_rec(pg, rec) {
1865
1866 if (rec->flags & FTRACE_FL_DISABLED)
1867 continue;
1868
1859 /* We need to update only differences of filter_hash */ 1869 /* We need to update only differences of filter_hash */
1860 in_old = !!ftrace_lookup_ip(old_hash, rec->ip); 1870 in_old = !!ftrace_lookup_ip(old_hash, rec->ip);
1861 in_new = !!ftrace_lookup_ip(new_hash, rec->ip); 1871 in_new = !!ftrace_lookup_ip(new_hash, rec->ip);
@@ -1878,6 +1888,10 @@ rollback:
1878 1888
1879 /* Roll back what we did above */ 1889 /* Roll back what we did above */
1880 do_for_each_ftrace_rec(pg, rec) { 1890 do_for_each_ftrace_rec(pg, rec) {
1891
1892 if (rec->flags & FTRACE_FL_DISABLED)
1893 continue;
1894
1881 if (rec == end) 1895 if (rec == end)
1882 goto err_out; 1896 goto err_out;
1883 1897
@@ -2391,6 +2405,10 @@ void __weak ftrace_replace_code(int enable)
2391 return; 2405 return;
2392 2406
2393 do_for_each_ftrace_rec(pg, rec) { 2407 do_for_each_ftrace_rec(pg, rec) {
2408
2409 if (rec->flags & FTRACE_FL_DISABLED)
2410 continue;
2411
2394 failed = __ftrace_replace_code(rec, enable); 2412 failed = __ftrace_replace_code(rec, enable);
2395 if (failed) { 2413 if (failed) {
2396 ftrace_bug(failed, rec); 2414 ftrace_bug(failed, rec);
@@ -2757,7 +2775,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
2757 struct dyn_ftrace *rec; 2775 struct dyn_ftrace *rec;
2758 2776
2759 do_for_each_ftrace_rec(pg, rec) { 2777 do_for_each_ftrace_rec(pg, rec) {
2760 if (FTRACE_WARN_ON_ONCE(rec->flags)) 2778 if (FTRACE_WARN_ON_ONCE(rec->flags & ~FTRACE_FL_DISABLED))
2761 pr_warn(" %pS flags:%lx\n", 2779 pr_warn(" %pS flags:%lx\n",
2762 (void *)rec->ip, rec->flags); 2780 (void *)rec->ip, rec->flags);
2763 } while_for_each_ftrace_rec(); 2781 } while_for_each_ftrace_rec();
@@ -3592,6 +3610,10 @@ match_records(struct ftrace_hash *hash, char *func, int len, char *mod)
3592 goto out_unlock; 3610 goto out_unlock;
3593 3611
3594 do_for_each_ftrace_rec(pg, rec) { 3612 do_for_each_ftrace_rec(pg, rec) {
3613
3614 if (rec->flags & FTRACE_FL_DISABLED)
3615 continue;
3616
3595 if (ftrace_match_record(rec, &func_g, mod_match, exclude_mod)) { 3617 if (ftrace_match_record(rec, &func_g, mod_match, exclude_mod)) {
3596 ret = enter_record(hash, rec, clear_filter); 3618 ret = enter_record(hash, rec, clear_filter);
3597 if (ret < 0) { 3619 if (ret < 0) {
@@ -3787,6 +3809,9 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3787 3809
3788 do_for_each_ftrace_rec(pg, rec) { 3810 do_for_each_ftrace_rec(pg, rec) {
3789 3811
3812 if (rec->flags & FTRACE_FL_DISABLED)
3813 continue;
3814
3790 if (!ftrace_match_record(rec, &func_g, NULL, 0)) 3815 if (!ftrace_match_record(rec, &func_g, NULL, 0))
3791 continue; 3816 continue;
3792 3817
@@ -4679,6 +4704,9 @@ ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer)
4679 4704
4680 do_for_each_ftrace_rec(pg, rec) { 4705 do_for_each_ftrace_rec(pg, rec) {
4681 4706
4707 if (rec->flags & FTRACE_FL_DISABLED)
4708 continue;
4709
4682 if (ftrace_match_record(rec, &func_g, NULL, 0)) { 4710 if (ftrace_match_record(rec, &func_g, NULL, 0)) {
4683 /* if it is in the array */ 4711 /* if it is in the array */
4684 exists = false; 4712 exists = false;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index dade4c9559cc..8696ce6bf2f6 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1047,7 +1047,7 @@ void disable_trace_on_warning(void)
1047 * 1047 *
1048 * Shows real state of the ring buffer if it is enabled or not. 1048 * Shows real state of the ring buffer if it is enabled or not.
1049 */ 1049 */
1050static int tracer_tracing_is_on(struct trace_array *tr) 1050int tracer_tracing_is_on(struct trace_array *tr)
1051{ 1051{
1052 if (tr->trace_buffer.buffer) 1052 if (tr->trace_buffer.buffer)
1053 return ring_buffer_record_is_on(tr->trace_buffer.buffer); 1053 return ring_buffer_record_is_on(tr->trace_buffer.buffer);
@@ -4123,6 +4123,30 @@ static const char readme_msg[] =
4123 "\t\t\t traces\n" 4123 "\t\t\t traces\n"
4124#endif 4124#endif
4125#endif /* CONFIG_STACK_TRACER */ 4125#endif /* CONFIG_STACK_TRACER */
4126#ifdef CONFIG_KPROBE_EVENT
4127 " kprobe_events\t\t- Add/remove/show the kernel dynamic events\n"
4128 "\t\t\t Write into this file to define/undefine new trace events.\n"
4129#endif
4130#ifdef CONFIG_UPROBE_EVENT
4131 " uprobe_events\t\t- Add/remove/show the userspace dynamic events\n"
4132 "\t\t\t Write into this file to define/undefine new trace events.\n"
4133#endif
4134#if defined(CONFIG_KPROBE_EVENT) || defined(CONFIG_UPROBE_EVENT)
4135 "\t accepts: event-definitions (one definition per line)\n"
4136 "\t Format: p|r[:[<group>/]<event>] <place> [<args>]\n"
4137 "\t -:[<group>/]<event>\n"
4138#ifdef CONFIG_KPROBE_EVENT
4139 "\t place: [<module>:]<symbol>[+<offset>]|<memaddr>\n"
4140#endif
4141#ifdef CONFIG_UPROBE_EVENT
4142 "\t place: <path>:<offset>\n"
4143#endif
4144 "\t args: <name>=fetcharg[:type]\n"
4145 "\t fetcharg: %<register>, @<address>, @<symbol>[+|-<offset>],\n"
4146 "\t $stack<index>, $stack, $retval, $comm\n"
4147 "\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, string,\n"
4148 "\t b<bit-width>@<bit-offset>/<container-size>\n"
4149#endif
4126 " events/\t\t- Directory containing all trace event subsystems:\n" 4150 " events/\t\t- Directory containing all trace event subsystems:\n"
4127 " enable\t\t- Write 0/1 to enable/disable tracing of all events\n" 4151 " enable\t\t- Write 0/1 to enable/disable tracing of all events\n"
4128 " events/<system>/\t- Directory containing all trace events for <system>:\n" 4152 " events/<system>/\t- Directory containing all trace events for <system>:\n"
@@ -4945,7 +4969,7 @@ out:
4945 return ret; 4969 return ret;
4946} 4970}
4947 4971
4948#ifdef CONFIG_TRACER_MAX_TRACE 4972#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
4949 4973
4950static ssize_t 4974static ssize_t
4951tracing_max_lat_read(struct file *filp, char __user *ubuf, 4975tracing_max_lat_read(struct file *filp, char __user *ubuf,
@@ -5124,19 +5148,20 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
5124 struct trace_iterator *iter = filp->private_data; 5148 struct trace_iterator *iter = filp->private_data;
5125 ssize_t sret; 5149 ssize_t sret;
5126 5150
5127 /* return any leftover data */
5128 sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
5129 if (sret != -EBUSY)
5130 return sret;
5131
5132 trace_seq_init(&iter->seq);
5133
5134 /* 5151 /*
5135 * Avoid more than one consumer on a single file descriptor 5152 * Avoid more than one consumer on a single file descriptor
5136 * This is just a matter of traces coherency, the ring buffer itself 5153 * This is just a matter of traces coherency, the ring buffer itself
5137 * is protected. 5154 * is protected.
5138 */ 5155 */
5139 mutex_lock(&iter->mutex); 5156 mutex_lock(&iter->mutex);
5157
5158 /* return any leftover data */
5159 sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
5160 if (sret != -EBUSY)
5161 goto out;
5162
5163 trace_seq_init(&iter->seq);
5164
5140 if (iter->trace->read) { 5165 if (iter->trace->read) {
5141 sret = iter->trace->read(iter, filp, ubuf, cnt, ppos); 5166 sret = iter->trace->read(iter, filp, ubuf, cnt, ppos);
5142 if (sret) 5167 if (sret)
@@ -5867,7 +5892,7 @@ static const struct file_operations tracing_thresh_fops = {
5867 .llseek = generic_file_llseek, 5892 .llseek = generic_file_llseek,
5868}; 5893};
5869 5894
5870#ifdef CONFIG_TRACER_MAX_TRACE 5895#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
5871static const struct file_operations tracing_max_lat_fops = { 5896static const struct file_operations tracing_max_lat_fops = {
5872 .open = tracing_open_generic, 5897 .open = tracing_open_generic,
5873 .read = tracing_max_lat_read, 5898 .read = tracing_max_lat_read,
@@ -6163,9 +6188,6 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
6163 return -EBUSY; 6188 return -EBUSY;
6164#endif 6189#endif
6165 6190
6166 if (splice_grow_spd(pipe, &spd))
6167 return -ENOMEM;
6168
6169 if (*ppos & (PAGE_SIZE - 1)) 6191 if (*ppos & (PAGE_SIZE - 1))
6170 return -EINVAL; 6192 return -EINVAL;
6171 6193
@@ -6175,6 +6197,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
6175 len &= PAGE_MASK; 6197 len &= PAGE_MASK;
6176 } 6198 }
6177 6199
6200 if (splice_grow_spd(pipe, &spd))
6201 return -ENOMEM;
6202
6178 again: 6203 again:
6179 trace_access_lock(iter->cpu_file); 6204 trace_access_lock(iter->cpu_file);
6180 entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file); 6205 entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file);
@@ -6232,19 +6257,21 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
6232 /* did we read anything? */ 6257 /* did we read anything? */
6233 if (!spd.nr_pages) { 6258 if (!spd.nr_pages) {
6234 if (ret) 6259 if (ret)
6235 return ret; 6260 goto out;
6236 6261
6262 ret = -EAGAIN;
6237 if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) 6263 if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK))
6238 return -EAGAIN; 6264 goto out;
6239 6265
6240 ret = wait_on_pipe(iter, true); 6266 ret = wait_on_pipe(iter, true);
6241 if (ret) 6267 if (ret)
6242 return ret; 6268 goto out;
6243 6269
6244 goto again; 6270 goto again;
6245 } 6271 }
6246 6272
6247 ret = splice_to_pipe(pipe, &spd); 6273 ret = splice_to_pipe(pipe, &spd);
6274out:
6248 splice_shrink_spd(&spd); 6275 splice_shrink_spd(&spd);
6249 6276
6250 return ret; 6277 return ret;
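The reordering above delays splice_grow_spd() until the arguments have been validated and routes every later failure through one out: label, so splice_shrink_spd() runs on each path that actually grew the buffer. The same allocate-late, single-exit cleanup shape in a standalone sketch:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Validate first, allocate second, release on every exit path. */
static int process(const char *input, size_t align)
{
	char *buf;
	int ret = 0;

	/* Cheap argument checks come before any allocation. */
	if (align == 0 || (strlen(input) % align) != 0)
		return -1;

	buf = malloc(strlen(input) + 1);
	if (!buf)
		return -1;

	if (input[0] == '-') {
		ret = -1;
		goto out;	/* late failure: the buffer is still freed */
	}

	strcpy(buf, input);
	printf("processed: %s\n", buf);
out:
	free(buf);
	return ret;
}

int main(void)
{
	return process("abcd", 2) ? 1 : 0;
}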
@@ -7195,7 +7222,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
7195 7222
7196 create_trace_options_dir(tr); 7223 create_trace_options_dir(tr);
7197 7224
7198#ifdef CONFIG_TRACER_MAX_TRACE 7225#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
7199 trace_create_file("tracing_max_latency", 0644, d_tracer, 7226 trace_create_file("tracing_max_latency", 0644, d_tracer,
7200 &tr->max_latency, &tracing_max_lat_fops); 7227 &tr->max_latency, &tracing_max_lat_fops);
7201#endif 7228#endif
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index f783df416726..fd24b1f9ac43 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -38,6 +38,7 @@ enum trace_type {
38 TRACE_USER_STACK, 38 TRACE_USER_STACK,
39 TRACE_BLK, 39 TRACE_BLK,
40 TRACE_BPUTS, 40 TRACE_BPUTS,
41 TRACE_HWLAT,
41 42
42 __TRACE_LAST_TYPE, 43 __TRACE_LAST_TYPE,
43}; 44};
@@ -213,6 +214,8 @@ struct trace_array {
213 */ 214 */
214 struct trace_buffer max_buffer; 215 struct trace_buffer max_buffer;
215 bool allocated_snapshot; 216 bool allocated_snapshot;
217#endif
218#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
216 unsigned long max_latency; 219 unsigned long max_latency;
217#endif 220#endif
218 struct trace_pid_list __rcu *filtered_pids; 221 struct trace_pid_list __rcu *filtered_pids;
@@ -326,6 +329,7 @@ extern void __ftrace_bad_type(void);
326 IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ 329 IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \
327 IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ 330 IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \
328 IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS); \ 331 IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS); \
332 IF_ASSIGN(var, ent, struct hwlat_entry, TRACE_HWLAT); \
329 IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ 333 IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \
330 TRACE_MMIO_RW); \ 334 TRACE_MMIO_RW); \
331 IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ 335 IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \
@@ -571,6 +575,7 @@ void tracing_reset_current(int cpu);
571void tracing_reset_all_online_cpus(void); 575void tracing_reset_all_online_cpus(void);
572int tracing_open_generic(struct inode *inode, struct file *filp); 576int tracing_open_generic(struct inode *inode, struct file *filp);
573bool tracing_is_disabled(void); 577bool tracing_is_disabled(void);
578int tracer_tracing_is_on(struct trace_array *tr);
574struct dentry *trace_create_file(const char *name, 579struct dentry *trace_create_file(const char *name,
575 umode_t mode, 580 umode_t mode,
576 struct dentry *parent, 581 struct dentry *parent,
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index 5c30efcda5e6..d1cc37e78f99 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -322,3 +322,30 @@ FTRACE_ENTRY(branch, trace_branch,
322 FILTER_OTHER 322 FILTER_OTHER
323); 323);
324 324
325
326FTRACE_ENTRY(hwlat, hwlat_entry,
327
328 TRACE_HWLAT,
329
330 F_STRUCT(
331 __field( u64, duration )
332 __field( u64, outer_duration )
333 __field( u64, nmi_total_ts )
334 __field_struct( struct timespec, timestamp )
335 __field_desc( long, timestamp, tv_sec )
336 __field_desc( long, timestamp, tv_nsec )
337 __field( unsigned int, nmi_count )
338 __field( unsigned int, seqnum )
339 ),
340
341 F_printk("cnt:%u\tts:%010lu.%010lu\tinner:%llu\touter:%llu\tnmi-ts:%llu\tnmi-count:%u\n",
342 __entry->seqnum,
343 __entry->tv_sec,
344 __entry->tv_nsec,
345 __entry->duration,
346 __entry->outer_duration,
347 __entry->nmi_total_ts,
348 __entry->nmi_count),
349
350 FILTER_OTHER
351);
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index a975571cde24..6721a1e89f39 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -1028,6 +1028,7 @@ static struct event_command trigger_traceon_cmd = {
1028static struct event_command trigger_traceoff_cmd = { 1028static struct event_command trigger_traceoff_cmd = {
1029 .name = "traceoff", 1029 .name = "traceoff",
1030 .trigger_type = ETT_TRACE_ONOFF, 1030 .trigger_type = ETT_TRACE_ONOFF,
1031 .flags = EVENT_CMD_FL_POST_TRIGGER,
1031 .func = event_trigger_callback, 1032 .func = event_trigger_callback,
1032 .reg = register_trigger, 1033 .reg = register_trigger,
1033 .unreg = unregister_trigger, 1034 .unreg = unregister_trigger,
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 7363ccf79512..4e480e870474 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -119,7 +119,7 @@ print_graph_duration(struct trace_array *tr, unsigned long long duration,
119/* Add a function return address to the trace stack on thread info.*/ 119/* Add a function return address to the trace stack on thread info.*/
120int 120int
121ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, 121ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,
122 unsigned long frame_pointer) 122 unsigned long frame_pointer, unsigned long *retp)
123{ 123{
124 unsigned long long calltime; 124 unsigned long long calltime;
125 int index; 125 int index;
@@ -170,8 +170,12 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,
170 current->ret_stack[index].ret = ret; 170 current->ret_stack[index].ret = ret;
171 current->ret_stack[index].func = func; 171 current->ret_stack[index].func = func;
172 current->ret_stack[index].calltime = calltime; 172 current->ret_stack[index].calltime = calltime;
173 current->ret_stack[index].subtime = 0; 173#ifdef HAVE_FUNCTION_GRAPH_FP_TEST
174 current->ret_stack[index].fp = frame_pointer; 174 current->ret_stack[index].fp = frame_pointer;
175#endif
176#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
177 current->ret_stack[index].retp = retp;
178#endif
175 *depth = current->curr_ret_stack; 179 *depth = current->curr_ret_stack;
176 180
177 return 0; 181 return 0;
@@ -204,7 +208,7 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
204 return; 208 return;
205 } 209 }
206 210
207#if defined(CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST) && !defined(CC_USING_FENTRY) 211#ifdef HAVE_FUNCTION_GRAPH_FP_TEST
208 /* 212 /*
209 * The arch may choose to record the frame pointer used 213 * The arch may choose to record the frame pointer used
210 * and check it here to make sure that it is what we expect it 214 * and check it here to make sure that it is what we expect it
@@ -279,6 +283,64 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
279 return ret; 283 return ret;
280} 284}
281 285
286/**
287 * ftrace_graph_ret_addr - convert a potentially modified stack return address
288 * to its original value
289 *
290 * This function can be called by stack unwinding code to convert a found stack
291 * return address ('ret') to its original value, in case the function graph
292 * tracer has modified it to be 'return_to_handler'. If the address hasn't
293 * been modified, the unchanged value of 'ret' is returned.
294 *
295 * 'idx' is a state variable which should be initialized by the caller to zero
296 * before the first call.
297 *
298 * 'retp' is a pointer to the return address on the stack. It's ignored if
299 * the arch doesn't have HAVE_FUNCTION_GRAPH_RET_ADDR_PTR defined.
300 */
301#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
302unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
303 unsigned long ret, unsigned long *retp)
304{
305 int index = task->curr_ret_stack;
306 int i;
307
308 if (ret != (unsigned long)return_to_handler)
309 return ret;
310
311 if (index < -1)
312 index += FTRACE_NOTRACE_DEPTH;
313
314 if (index < 0)
315 return ret;
316
317 for (i = 0; i <= index; i++)
318 if (task->ret_stack[i].retp == retp)
319 return task->ret_stack[i].ret;
320
321 return ret;
322}
323#else /* !HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */
324unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
325 unsigned long ret, unsigned long *retp)
326{
327 int task_idx;
328
329 if (ret != (unsigned long)return_to_handler)
330 return ret;
331
332 task_idx = task->curr_ret_stack;
333
334 if (!task->ret_stack || task_idx < *idx)
335 return ret;
336
337 task_idx -= *idx;
338 (*idx)++;
339
340 return task->ret_stack[task_idx].ret;
341}
342#endif /* HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */
343
282int __trace_graph_entry(struct trace_array *tr, 344int __trace_graph_entry(struct trace_array *tr,
283 struct ftrace_graph_ent *trace, 345 struct ftrace_graph_ent *trace,
284 unsigned long flags, 346 unsigned long flags,
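ftrace_graph_ret_addr() above gives stack unwinders a way to map a slot that currently holds return_to_handler back to the real return address, keyed on the slot's location (retp) when the arch records it. A userspace model of that lookup over a small shadow return stack; the names and layout are invented for illustration:

#include <stdio.h>

#define TRAMPOLINE	0xdeadbeefUL	/* stand-in for return_to_handler */

struct shadow_entry {
	unsigned long ret;	/* original return address */
	unsigned long *retp;	/* stack slot that was rewritten */
};

static struct shadow_entry shadow_stack[8];
static int shadow_top = -1;	/* index of the most recent entry */

/* If *retp was redirected to the trampoline, return the saved address. */
static unsigned long graph_ret_addr(unsigned long ret, unsigned long *retp)
{
	int i;

	if (ret != TRAMPOLINE)
		return ret;

	for (i = 0; i <= shadow_top; i++)
		if (shadow_stack[i].retp == retp)
			return shadow_stack[i].ret;

	return ret;	/* no match: report what the stack contained */
}

int main(void)
{
	unsigned long stack_slot = TRAMPOLINE;

	shadow_stack[0].ret = 0x1234;
	shadow_stack[0].retp = &stack_slot;
	shadow_top = 0;

	printf("original return address: 0x%lx\n",
	       graph_ret_addr(stack_slot, &stack_slot));
	return 0;
}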
@@ -1120,6 +1182,11 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
1120 trace_seq_puts(s, "/* "); 1182 trace_seq_puts(s, "/* ");
1121 1183
1122 switch (iter->ent->type) { 1184 switch (iter->ent->type) {
1185 case TRACE_BPUTS:
1186 ret = trace_print_bputs_msg_only(iter);
1187 if (ret != TRACE_TYPE_HANDLED)
1188 return ret;
1189 break;
1123 case TRACE_BPRINT: 1190 case TRACE_BPRINT:
1124 ret = trace_print_bprintk_msg_only(iter); 1191 ret = trace_print_bprintk_msg_only(iter);
1125 if (ret != TRACE_TYPE_HANDLED) 1192 if (ret != TRACE_TYPE_HANDLED)
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
new file mode 100644
index 000000000000..b97286c48735
--- /dev/null
+++ b/kernel/trace/trace_hwlat.c
@@ -0,0 +1,633 @@
1/*
2 * trace_hwlatdetect.c - A simple Hardware Latency detector.
3 *
4 * Use this tracer to detect large system latencies induced by the behavior of
5 * certain underlying system hardware or firmware, independent of Linux itself.
6 * The code was developed originally to detect the presence of SMIs on Intel
7 * and AMD systems, although there is no dependency upon x86 herein.
8 *
9 * The classical example usage of this tracer is in detecting the presence of
10 * SMIs or System Management Interrupts on Intel and AMD systems. An SMI is a
11 * somewhat special form of hardware interrupt spawned from earlier CPU debug
12 * modes in which the (BIOS/EFI/etc.) firmware arranges for the South Bridge
13 * LPC (or other device) to generate a special interrupt under certain
14 * circumstances, for example, upon expiration of a special SMI timer device,
15 * due to certain external thermal readings, on certain I/O address accesses,
16 * and other situations. An SMI hits a special CPU pin, triggers a special
17 * SMI mode (complete with special memory map), and the OS is unaware.
18 *
19 * Although certain hardware-inducing latencies are necessary (for example,
20 * a modern system often requires an SMI handler for correct thermal control
21 * and remote management) they can wreak havoc upon any OS-level performance
22 * guarantees toward low-latency, especially when the OS is not even made
23 * aware of the presence of these interrupts. For this reason, we need a
24 * somewhat brute force mechanism to detect these interrupts. In this case,
25 * we do it by hogging all of the CPU(s) for configurable timer intervals,
26 * sampling the built-in CPU timer, looking for discontiguous readings.
27 *
28 * WARNING: This implementation necessarily introduces latencies. Therefore,
29 * you should NEVER use this tracer while running in a production
30 * environment requiring any kind of low-latency performance
31 * guarantee(s).
32 *
33 * Copyright (C) 2008-2009 Jon Masters, Red Hat, Inc. <jcm@redhat.com>
34 * Copyright (C) 2013-2016 Steven Rostedt, Red Hat, Inc. <srostedt@redhat.com>
35 *
36 * Includes useful feedback from Clark Williams <clark@redhat.com>
37 *
38 * This file is licensed under the terms of the GNU General Public
39 * License version 2. This program is licensed "as is" without any
40 * warranty of any kind, whether express or implied.
41 */
42#include <linux/kthread.h>
43#include <linux/tracefs.h>
44#include <linux/uaccess.h>
45#include <linux/cpumask.h>
46#include <linux/delay.h>
47#include "trace.h"
48
49static struct trace_array *hwlat_trace;
50
51#define U64STR_SIZE 22 /* 20 digits max */
52
53#define BANNER "hwlat_detector: "
54#define DEFAULT_SAMPLE_WINDOW 1000000 /* 1s */
55#define DEFAULT_SAMPLE_WIDTH 500000 /* 0.5s */
56#define DEFAULT_LAT_THRESHOLD 10 /* 10us */
57
58/* sampling thread */
59static struct task_struct *hwlat_kthread;
60
61static struct dentry *hwlat_sample_width; /* sample width us */
62static struct dentry *hwlat_sample_window; /* sample window us */
63
64/* Save the previous tracing_thresh value */
65static unsigned long save_tracing_thresh;
66
67/* NMI timestamp counters */
68static u64 nmi_ts_start;
69static u64 nmi_total_ts;
70static int nmi_count;
71static int nmi_cpu;
72
73/* Tells NMIs to call back to the hwlat tracer to record timestamps */
74bool trace_hwlat_callback_enabled;
75
76/* If the user changed threshold, remember it */
77static u64 last_tracing_thresh = DEFAULT_LAT_THRESHOLD * NSEC_PER_USEC;
78
79/* Individual latency samples are stored here when detected. */
80struct hwlat_sample {
81 u64 seqnum; /* unique sequence */
82 u64 duration; /* delta */
83 u64 outer_duration; /* delta (outer loop) */
84 u64 nmi_total_ts; /* Total time spent in NMIs */
85 struct timespec timestamp; /* wall time */
86 int nmi_count; /* # NMIs during this sample */
87};
88
89/* keep the global state somewhere. */
90static struct hwlat_data {
91
92 struct mutex lock; /* protect changes */
93
94 u64 count; /* total since reset */
95
96 u64 sample_window; /* total sampling window (on+off) */
97 u64 sample_width; /* active sampling portion of window */
98
99} hwlat_data = {
100 .sample_window = DEFAULT_SAMPLE_WINDOW,
101 .sample_width = DEFAULT_SAMPLE_WIDTH,
102};
103
104static void trace_hwlat_sample(struct hwlat_sample *sample)
105{
106 struct trace_array *tr = hwlat_trace;
107 struct trace_event_call *call = &event_hwlat;
108 struct ring_buffer *buffer = tr->trace_buffer.buffer;
109 struct ring_buffer_event *event;
110 struct hwlat_entry *entry;
111 unsigned long flags;
112 int pc;
113
114 pc = preempt_count();
115 local_save_flags(flags);
116
117 event = trace_buffer_lock_reserve(buffer, TRACE_HWLAT, sizeof(*entry),
118 flags, pc);
119 if (!event)
120 return;
121 entry = ring_buffer_event_data(event);
122 entry->seqnum = sample->seqnum;
123 entry->duration = sample->duration;
124 entry->outer_duration = sample->outer_duration;
125 entry->timestamp = sample->timestamp;
126 entry->nmi_total_ts = sample->nmi_total_ts;
127 entry->nmi_count = sample->nmi_count;
128
129 if (!call_filter_check_discard(call, entry, buffer, event))
130 __buffer_unlock_commit(buffer, event);
131}
132
133/* Macros to encapsulate the time capturing infrastructure */
134#define time_type u64
135#define time_get() trace_clock_local()
136#define time_to_us(x) div_u64(x, 1000)
137#define time_sub(a, b) ((a) - (b))
138#define init_time(a, b) (a = b)
139#define time_u64(a) a
140
141void trace_hwlat_callback(bool enter)
142{
143 if (smp_processor_id() != nmi_cpu)
144 return;
145
146 /*
147 * Currently trace_clock_local() calls sched_clock() and the
148 * generic version is not NMI safe.
149 */
150 if (!IS_ENABLED(CONFIG_GENERIC_SCHED_CLOCK)) {
151 if (enter)
152 nmi_ts_start = time_get();
153 else
154 nmi_total_ts = time_get() - nmi_ts_start;
155 }
156
157 if (enter)
158 nmi_count++;
159}
160
161/**
162 * get_sample - sample the CPU TSC and look for likely hardware latencies
163 *
164 * Used to repeatedly capture the CPU TSC (or similar), looking for potential
165 * hardware-induced latency. Called with interrupts disabled and with
166 * hwlat_data.lock held.
167 */
168static int get_sample(void)
169{
170 struct trace_array *tr = hwlat_trace;
171 time_type start, t1, t2, last_t2;
172 s64 diff, total, last_total = 0;
173 u64 sample = 0;
174 u64 thresh = tracing_thresh;
175 u64 outer_sample = 0;
176 int ret = -1;
177
 178 do_div(thresh, NSEC_PER_USEC); /* modifies thresh value */
179
180 nmi_cpu = smp_processor_id();
181 nmi_total_ts = 0;
182 nmi_count = 0;
183 /* Make sure NMIs see this first */
184 barrier();
185
186 trace_hwlat_callback_enabled = true;
187
188 init_time(last_t2, 0);
189 start = time_get(); /* start timestamp */
190
191 do {
192
193 t1 = time_get(); /* we'll look for a discontinuity */
194 t2 = time_get();
195
196 if (time_u64(last_t2)) {
197 /* Check the delta from outer loop (t2 to next t1) */
198 diff = time_to_us(time_sub(t1, last_t2));
199 /* This shouldn't happen */
200 if (diff < 0) {
201 pr_err(BANNER "time running backwards\n");
202 goto out;
203 }
204 if (diff > outer_sample)
205 outer_sample = diff;
206 }
207 last_t2 = t2;
208
209 total = time_to_us(time_sub(t2, start)); /* sample width */
210
211 /* Check for possible overflows */
212 if (total < last_total) {
213 pr_err("Time total overflowed\n");
214 break;
215 }
216 last_total = total;
217
218 /* This checks the inner loop (t1 to t2) */
219 diff = time_to_us(time_sub(t2, t1)); /* current diff */
220
221 /* This shouldn't happen */
222 if (diff < 0) {
223 pr_err(BANNER "time running backwards\n");
224 goto out;
225 }
226
227 if (diff > sample)
228 sample = diff; /* only want highest value */
229
230 } while (total <= hwlat_data.sample_width);
231
232 barrier(); /* finish the above in the view for NMIs */
233 trace_hwlat_callback_enabled = false;
234 barrier(); /* Make sure nmi_total_ts is no longer updated */
235
236 ret = 0;
237
238 /* If we exceed the threshold value, we have found a hardware latency */
239 if (sample > thresh || outer_sample > thresh) {
240 struct hwlat_sample s;
241
242 ret = 1;
243
244 /* We read in microseconds */
245 if (nmi_total_ts)
246 do_div(nmi_total_ts, NSEC_PER_USEC);
247
248 hwlat_data.count++;
249 s.seqnum = hwlat_data.count;
250 s.duration = sample;
251 s.outer_duration = outer_sample;
252 s.timestamp = CURRENT_TIME;
253 s.nmi_total_ts = nmi_total_ts;
254 s.nmi_count = nmi_count;
255 trace_hwlat_sample(&s);
256
257 /* Keep a running maximum ever recorded hardware latency */
258 if (sample > tr->max_latency)
259 tr->max_latency = sample;
260 }
261
262out:
263 return ret;
264}
265
266static struct cpumask save_cpumask;
267static bool disable_migrate;
268
269static void move_to_next_cpu(void)
270{
271 static struct cpumask *current_mask;
272 int next_cpu;
273
274 if (disable_migrate)
275 return;
276
277 /* Just pick the first CPU on first iteration */
278 if (!current_mask) {
279 current_mask = &save_cpumask;
280 get_online_cpus();
281 cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask);
282 put_online_cpus();
283 next_cpu = cpumask_first(current_mask);
284 goto set_affinity;
285 }
286
287 /*
288 * If for some reason the user modifies the CPU affinity
 289 * of this thread, then stop migrating for the duration
290 * of the current test.
291 */
292 if (!cpumask_equal(current_mask, &current->cpus_allowed))
293 goto disable;
294
295 get_online_cpus();
296 cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask);
297 next_cpu = cpumask_next(smp_processor_id(), current_mask);
298 put_online_cpus();
299
300 if (next_cpu >= nr_cpu_ids)
301 next_cpu = cpumask_first(current_mask);
302
303 set_affinity:
304 if (next_cpu >= nr_cpu_ids) /* Shouldn't happen! */
305 goto disable;
306
307 cpumask_clear(current_mask);
308 cpumask_set_cpu(next_cpu, current_mask);
309
310 sched_setaffinity(0, current_mask);
311 return;
312
313 disable:
314 disable_migrate = true;
315}
316
317/*
318 * kthread_fn - The CPU time sampling/hardware latency detection kernel thread
319 *
320 * Used to periodically sample the CPU TSC via a call to get_sample. We
321 * disable interrupts, which does (intentionally) introduce latency since we
322 * need to ensure nothing else might be running (and thus preempting).
323 * Obviously this should never be used in production environments.
324 *
 325 * Currently this runs on whichever CPU it was scheduled on, but most
 326 * real-world hardware latency situations occur across several CPUs
 327 * anyway; we might later generalize this if we find there are any actual
 328 * systems with alternate SMI delivery or other hardware latencies.
329 */
330static int kthread_fn(void *data)
331{
332 u64 interval;
333
334 while (!kthread_should_stop()) {
335
336 move_to_next_cpu();
337
338 local_irq_disable();
339 get_sample();
340 local_irq_enable();
341
342 mutex_lock(&hwlat_data.lock);
343 interval = hwlat_data.sample_window - hwlat_data.sample_width;
344 mutex_unlock(&hwlat_data.lock);
345
346 do_div(interval, USEC_PER_MSEC); /* modifies interval value */
347
348 /* Always sleep for at least 1ms */
349 if (interval < 1)
350 interval = 1;
351
352 if (msleep_interruptible(interval))
353 break;
354 }
355
356 return 0;
357}
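
For reference, with the defaults defined earlier (sample_window = 1,000,000 us and sample_width = 500,000 us), each iteration spends roughly half a second sampling with interrupts disabled and then sleeps for (1,000,000 - 500,000) / 1,000 = 500 ms before starting the next window.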
358
359/**
360 * start_kthread - Kick off the hardware latency sampling/detector kthread
361 *
362 * This starts the kernel thread that will sit and sample the CPU timestamp
363 * counter (TSC or similar) and look for potential hardware latencies.
364 */
365static int start_kthread(struct trace_array *tr)
366{
367 struct task_struct *kthread;
368
369 kthread = kthread_create(kthread_fn, NULL, "hwlatd");
370 if (IS_ERR(kthread)) {
371 pr_err(BANNER "could not start sampling thread\n");
372 return -ENOMEM;
373 }
374 hwlat_kthread = kthread;
375 wake_up_process(kthread);
376
377 return 0;
378}
379
380/**
 381 * stop_kthread - Inform the hardware latency sampling/detector kthread to stop
382 *
383 * This kicks the running hardware latency sampling/detector kernel thread and
384 * tells it to stop sampling now. Use this on unload and at system shutdown.
385 */
386static void stop_kthread(void)
387{
388 if (!hwlat_kthread)
389 return;
390 kthread_stop(hwlat_kthread);
391 hwlat_kthread = NULL;
392}
393
394/*
395 * hwlat_read - Wrapper read function for reading both window and width
396 * @filp: The active open file structure
397 * @ubuf: The userspace provided buffer to read value into
398 * @cnt: The maximum number of bytes to read
399 * @ppos: The current "file" position
400 *
401 * This function provides a generic read implementation for the global state
402 * "hwlat_data" structure filesystem entries.
403 */
404static ssize_t hwlat_read(struct file *filp, char __user *ubuf,
405 size_t cnt, loff_t *ppos)
406{
407 char buf[U64STR_SIZE];
408 u64 *entry = filp->private_data;
409 u64 val;
410 int len;
411
412 if (!entry)
413 return -EFAULT;
414
415 if (cnt > sizeof(buf))
416 cnt = sizeof(buf);
417
418 val = *entry;
419
420 len = snprintf(buf, sizeof(buf), "%llu\n", val);
421
422 return simple_read_from_buffer(ubuf, cnt, ppos, buf, len);
423}
424
425/**
426 * hwlat_width_write - Write function for "width" entry
427 * @filp: The active open file structure
428 * @ubuf: The user buffer that contains the value to write
429 * @cnt: The maximum number of bytes to write to "file"
430 * @ppos: The current position in @file
431 *
 432 * This function provides a write implementation for the "width" interface
 433 * to the hardware latency detector. It can be used to configure how many
 434 * us of the total window we will actively sample for any hardware-induced
 435 * latency periods. Obviously, it is not possible to sample constantly and
 436 * still have the system respond to a sample reader or, worse, not appear
 437 * to have gone out to lunch. It is enforced that the width is less than
 438 * the total window size.
439 */
440static ssize_t
441hwlat_width_write(struct file *filp, const char __user *ubuf,
442 size_t cnt, loff_t *ppos)
443{
444 u64 val;
445 int err;
446
447 err = kstrtoull_from_user(ubuf, cnt, 10, &val);
448 if (err)
449 return err;
450
451 mutex_lock(&hwlat_data.lock);
452 if (val < hwlat_data.sample_window)
453 hwlat_data.sample_width = val;
454 else
455 err = -EINVAL;
456 mutex_unlock(&hwlat_data.lock);
457
458 if (err)
459 return err;
460
461 return cnt;
462}
463
464/**
465 * hwlat_window_write - Write function for "window" entry
466 * @filp: The active open file structure
467 * @ubuf: The user buffer that contains the value to write
468 * @cnt: The maximum number of bytes to write to "file"
469 * @ppos: The current position in @file
470 *
471 * This function provides a write implementation for the "window" interface
 472 * to the hardware latency detector. The window is the total time
 473 * in us that will be considered one sample period. Conceptually, windows
 474 * occur back-to-back and contain a sample width period during which
 475 * actual sampling occurs. It can be used to write a new total window size.
 476 * It is enforced that any value written must be greater than the sample
 477 * width size, or an error results.
478 */
479static ssize_t
480hwlat_window_write(struct file *filp, const char __user *ubuf,
481 size_t cnt, loff_t *ppos)
482{
483 u64 val;
484 int err;
485
486 err = kstrtoull_from_user(ubuf, cnt, 10, &val);
487 if (err)
488 return err;
489
490 mutex_lock(&hwlat_data.lock);
491 if (hwlat_data.sample_width < val)
492 hwlat_data.sample_window = val;
493 else
494 err = -EINVAL;
495 mutex_unlock(&hwlat_data.lock);
496
497 if (err)
498 return err;
499
500 return cnt;
501}
502
503static const struct file_operations width_fops = {
504 .open = tracing_open_generic,
505 .read = hwlat_read,
506 .write = hwlat_width_write,
507};
508
509static const struct file_operations window_fops = {
510 .open = tracing_open_generic,
511 .read = hwlat_read,
512 .write = hwlat_window_write,
513};
514
515/**
516 * init_tracefs - A function to initialize the tracefs interface files
517 *
518 * This function creates entries in tracefs for "hwlat_detector".
519 * It creates the hwlat_detector directory in the tracing directory,
 520 * and within that directory are the width and window files used to
 521 * change and view those values.
522 */
523static int init_tracefs(void)
524{
525 struct dentry *d_tracer;
526 struct dentry *top_dir;
527
528 d_tracer = tracing_init_dentry();
529 if (IS_ERR(d_tracer))
530 return -ENOMEM;
531
532 top_dir = tracefs_create_dir("hwlat_detector", d_tracer);
533 if (!top_dir)
534 return -ENOMEM;
535
536 hwlat_sample_window = tracefs_create_file("window", 0640,
537 top_dir,
538 &hwlat_data.sample_window,
539 &window_fops);
540 if (!hwlat_sample_window)
541 goto err;
542
543 hwlat_sample_width = tracefs_create_file("width", 0644,
544 top_dir,
545 &hwlat_data.sample_width,
546 &width_fops);
547 if (!hwlat_sample_width)
548 goto err;
549
550 return 0;
551
552 err:
553 tracefs_remove_recursive(top_dir);
554 return -ENOMEM;
555}
556
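Taken together with the tracer registration further below, the interface set up here can be driven entirely from user space: write the window and width (in microseconds) under the hwlat_detector directory, then select the "hwlat" tracer. A rough sketch follows, assuming tracefs is mounted at /sys/kernel/tracing (adjust the path for debugfs-based setups); the values are only examples and error handling is minimal.

#include <stdio.h>

/* Write a string to a tracefs file; returns 0 on success. Illustrative only. */
static int tracefs_write(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	const char *base = "/sys/kernel/tracing";
	char path[256];

	/* 1 s window, 0.5 s of it actively sampled (the defaults above). */
	snprintf(path, sizeof(path), "%s/hwlat_detector/window", base);
	tracefs_write(path, "1000000");
	snprintf(path, sizeof(path), "%s/hwlat_detector/width", base);
	tracefs_write(path, "500000");

	/* Enable the tracer registered as "hwlat". */
	snprintf(path, sizeof(path), "%s/current_tracer", base);
	return tracefs_write(path, "hwlat") ? 1 : 0;
}
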
557static void hwlat_tracer_start(struct trace_array *tr)
558{
559 int err;
560
561 err = start_kthread(tr);
562 if (err)
563 pr_err(BANNER "Cannot start hwlat kthread\n");
564}
565
566static void hwlat_tracer_stop(struct trace_array *tr)
567{
568 stop_kthread();
569}
570
571static bool hwlat_busy;
572
573static int hwlat_tracer_init(struct trace_array *tr)
574{
575 /* Only allow one instance to enable this */
576 if (hwlat_busy)
577 return -EBUSY;
578
579 hwlat_trace = tr;
580
581 disable_migrate = false;
582 hwlat_data.count = 0;
583 tr->max_latency = 0;
584 save_tracing_thresh = tracing_thresh;
585
586 /* tracing_thresh is in nsecs, we speak in usecs */
587 if (!tracing_thresh)
588 tracing_thresh = last_tracing_thresh;
589
590 if (tracer_tracing_is_on(tr))
591 hwlat_tracer_start(tr);
592
593 hwlat_busy = true;
594
595 return 0;
596}
597
598static void hwlat_tracer_reset(struct trace_array *tr)
599{
600 stop_kthread();
601
602 /* the tracing threshold is static between runs */
603 last_tracing_thresh = tracing_thresh;
604
605 tracing_thresh = save_tracing_thresh;
606 hwlat_busy = false;
607}
608
609static struct tracer hwlat_tracer __read_mostly =
610{
611 .name = "hwlat",
612 .init = hwlat_tracer_init,
613 .reset = hwlat_tracer_reset,
614 .start = hwlat_tracer_start,
615 .stop = hwlat_tracer_stop,
616 .allow_instances = true,
617};
618
619__init static int init_hwlat_tracer(void)
620{
621 int ret;
622
623 mutex_init(&hwlat_data.lock);
624
625 ret = register_tracer(&hwlat_tracer);
626 if (ret)
627 return ret;
628
629 init_tracefs();
630
631 return 0;
632}
633late_initcall(init_hwlat_tracer);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 9aedb0b06683..eb6c9f1d3a93 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -253,6 +253,10 @@ static const struct fetch_type kprobes_fetch_type_table[] = {
253 ASSIGN_FETCH_TYPE(s16, u16, 1), 253 ASSIGN_FETCH_TYPE(s16, u16, 1),
254 ASSIGN_FETCH_TYPE(s32, u32, 1), 254 ASSIGN_FETCH_TYPE(s32, u32, 1),
255 ASSIGN_FETCH_TYPE(s64, u64, 1), 255 ASSIGN_FETCH_TYPE(s64, u64, 1),
256 ASSIGN_FETCH_TYPE_ALIAS(x8, u8, u8, 0),
257 ASSIGN_FETCH_TYPE_ALIAS(x16, u16, u16, 0),
258 ASSIGN_FETCH_TYPE_ALIAS(x32, u32, u32, 0),
259 ASSIGN_FETCH_TYPE_ALIAS(x64, u64, u64, 0),
256 260
257 ASSIGN_FETCH_TYPE_END 261 ASSIGN_FETCH_TYPE_END
258}; 262};
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 0bb9cf2d53e6..3fc20422c166 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -1098,6 +1098,71 @@ static struct trace_event trace_user_stack_event = {
1098 .funcs = &trace_user_stack_funcs, 1098 .funcs = &trace_user_stack_funcs,
1099}; 1099};
1100 1100
1101/* TRACE_HWLAT */
1102static enum print_line_t
1103trace_hwlat_print(struct trace_iterator *iter, int flags,
1104 struct trace_event *event)
1105{
1106 struct trace_entry *entry = iter->ent;
1107 struct trace_seq *s = &iter->seq;
1108 struct hwlat_entry *field;
1109
1110 trace_assign_type(field, entry);
1111
1112 trace_seq_printf(s, "#%-5u inner/outer(us): %4llu/%-5llu ts:%ld.%09ld",
1113 field->seqnum,
1114 field->duration,
1115 field->outer_duration,
1116 field->timestamp.tv_sec,
1117 field->timestamp.tv_nsec);
1118
1119 if (field->nmi_count) {
1120 /*
1121 * The generic sched_clock() is not NMI safe, thus
1122 * we only record the count and not the time.
1123 */
1124 if (!IS_ENABLED(CONFIG_GENERIC_SCHED_CLOCK))
1125 trace_seq_printf(s, " nmi-total:%llu",
1126 field->nmi_total_ts);
1127 trace_seq_printf(s, " nmi-count:%u",
1128 field->nmi_count);
1129 }
1130
1131 trace_seq_putc(s, '\n');
1132
1133 return trace_handle_return(s);
1134}
1135
1136
1137static enum print_line_t
1138trace_hwlat_raw(struct trace_iterator *iter, int flags,
1139 struct trace_event *event)
1140{
1141 struct hwlat_entry *field;
1142 struct trace_seq *s = &iter->seq;
1143
1144 trace_assign_type(field, iter->ent);
1145
1146 trace_seq_printf(s, "%llu %lld %ld %09ld %u\n",
1147 field->duration,
1148 field->outer_duration,
1149 field->timestamp.tv_sec,
1150 field->timestamp.tv_nsec,
1151 field->seqnum);
1152
1153 return trace_handle_return(s);
1154}
1155
1156static struct trace_event_functions trace_hwlat_funcs = {
1157 .trace = trace_hwlat_print,
1158 .raw = trace_hwlat_raw,
1159};
1160
1161static struct trace_event trace_hwlat_event = {
1162 .type = TRACE_HWLAT,
1163 .funcs = &trace_hwlat_funcs,
1164};
1165
1101/* TRACE_BPUTS */ 1166/* TRACE_BPUTS */
1102static enum print_line_t 1167static enum print_line_t
1103trace_bputs_print(struct trace_iterator *iter, int flags, 1168trace_bputs_print(struct trace_iterator *iter, int flags,
@@ -1233,6 +1298,7 @@ static struct trace_event *events[] __initdata = {
1233 &trace_bputs_event, 1298 &trace_bputs_event,
1234 &trace_bprint_event, 1299 &trace_bprint_event,
1235 &trace_print_event, 1300 &trace_print_event,
1301 &trace_hwlat_event,
1236 NULL 1302 NULL
1237}; 1303};
1238 1304
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 74e80a582c28..8c0553d9afd3 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -36,24 +36,28 @@ const char *reserved_field_names[] = {
36}; 36};
37 37
38/* Printing in basic type function template */ 38/* Printing in basic type function template */
39#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt) \ 39#define DEFINE_BASIC_PRINT_TYPE_FUNC(tname, type, fmt) \
40int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \ 40int PRINT_TYPE_FUNC_NAME(tname)(struct trace_seq *s, const char *name, \
41 void *data, void *ent) \ 41 void *data, void *ent) \
42{ \ 42{ \
43 trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \ 43 trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \
44 return !trace_seq_has_overflowed(s); \ 44 return !trace_seq_has_overflowed(s); \
45} \ 45} \
46const char PRINT_TYPE_FMT_NAME(type)[] = fmt; \ 46const char PRINT_TYPE_FMT_NAME(tname)[] = fmt; \
47NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(type)); 47NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(tname));
48 48
49DEFINE_BASIC_PRINT_TYPE_FUNC(u8 , "0x%x") 49DEFINE_BASIC_PRINT_TYPE_FUNC(u8, u8, "%u")
50DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "0x%x") 50DEFINE_BASIC_PRINT_TYPE_FUNC(u16, u16, "%u")
51DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "0x%x") 51DEFINE_BASIC_PRINT_TYPE_FUNC(u32, u32, "%u")
52DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "0x%Lx") 52DEFINE_BASIC_PRINT_TYPE_FUNC(u64, u64, "%Lu")
53DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d") 53DEFINE_BASIC_PRINT_TYPE_FUNC(s8, s8, "%d")
54DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d") 54DEFINE_BASIC_PRINT_TYPE_FUNC(s16, s16, "%d")
55DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%d") 55DEFINE_BASIC_PRINT_TYPE_FUNC(s32, s32, "%d")
56DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%Ld") 56DEFINE_BASIC_PRINT_TYPE_FUNC(s64, s64, "%Ld")
57DEFINE_BASIC_PRINT_TYPE_FUNC(x8, u8, "0x%x")
58DEFINE_BASIC_PRINT_TYPE_FUNC(x16, u16, "0x%x")
59DEFINE_BASIC_PRINT_TYPE_FUNC(x32, u32, "0x%x")
60DEFINE_BASIC_PRINT_TYPE_FUNC(x64, u64, "0x%Lx")
57 61
58/* Print type function for string type */ 62/* Print type function for string type */
59int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, const char *name, 63int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, const char *name,
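
Taken together with the trace_probe.h and fetch-table hunks below, these changes alter how probe event arguments are printed: the unsigned types u8..u64 now print in decimal, the new x8..x64 types print in hexadecimal, and an argument declared without an explicit type defaults to the hex type of the native word size. As an illustrative example, an argument written as myarg=%si:x32 in a kprobe_events definition would be rendered in 0x%x form rather than decimal.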
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 45400ca5ded1..0c0ae54d44c6 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -149,6 +149,11 @@ DECLARE_BASIC_PRINT_TYPE_FUNC(s8);
149DECLARE_BASIC_PRINT_TYPE_FUNC(s16); 149DECLARE_BASIC_PRINT_TYPE_FUNC(s16);
150DECLARE_BASIC_PRINT_TYPE_FUNC(s32); 150DECLARE_BASIC_PRINT_TYPE_FUNC(s32);
151DECLARE_BASIC_PRINT_TYPE_FUNC(s64); 151DECLARE_BASIC_PRINT_TYPE_FUNC(s64);
152DECLARE_BASIC_PRINT_TYPE_FUNC(x8);
153DECLARE_BASIC_PRINT_TYPE_FUNC(x16);
154DECLARE_BASIC_PRINT_TYPE_FUNC(x32);
155DECLARE_BASIC_PRINT_TYPE_FUNC(x64);
156
152DECLARE_BASIC_PRINT_TYPE_FUNC(string); 157DECLARE_BASIC_PRINT_TYPE_FUNC(string);
153 158
154#define FETCH_FUNC_NAME(method, type) fetch_##method##_##type 159#define FETCH_FUNC_NAME(method, type) fetch_##method##_##type
@@ -203,7 +208,7 @@ DEFINE_FETCH_##method(u32) \
203DEFINE_FETCH_##method(u64) 208DEFINE_FETCH_##method(u64)
204 209
205/* Default (unsigned long) fetch type */ 210/* Default (unsigned long) fetch type */
206#define __DEFAULT_FETCH_TYPE(t) u##t 211#define __DEFAULT_FETCH_TYPE(t) x##t
207#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) 212#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)
208#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) 213#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG)
209#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) 214#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE)
@@ -234,6 +239,10 @@ ASSIGN_FETCH_FUNC(file_offset, ftype), \
234#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ 239#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \
235 __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype) 240 __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype)
236 241
242/* If ptype is an alias of atype, use this macro (show atype in format) */
243#define ASSIGN_FETCH_TYPE_ALIAS(ptype, atype, ftype, sign) \
244 __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #atype)
245
237#define ASSIGN_FETCH_TYPE_END {} 246#define ASSIGN_FETCH_TYPE_END {}
238 247
239#define FETCH_TYPE_STRING 0 248#define FETCH_TYPE_STRING 0
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index b2b6efc083a4..5e10395da88e 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -610,8 +610,7 @@ static int perf_sysenter_enable(struct trace_event_call *call)
610 if (!sys_perf_refcount_enter) 610 if (!sys_perf_refcount_enter)
611 ret = register_trace_sys_enter(perf_syscall_enter, NULL); 611 ret = register_trace_sys_enter(perf_syscall_enter, NULL);
612 if (ret) { 612 if (ret) {
613 pr_info("event trace: Could not activate" 613 pr_info("event trace: Could not activate syscall entry trace point");
614 "syscall entry trace point");
615 } else { 614 } else {
616 set_bit(num, enabled_perf_enter_syscalls); 615 set_bit(num, enabled_perf_enter_syscalls);
617 sys_perf_refcount_enter++; 616 sys_perf_refcount_enter++;
@@ -682,8 +681,7 @@ static int perf_sysexit_enable(struct trace_event_call *call)
682 if (!sys_perf_refcount_exit) 681 if (!sys_perf_refcount_exit)
683 ret = register_trace_sys_exit(perf_syscall_exit, NULL); 682 ret = register_trace_sys_exit(perf_syscall_exit, NULL);
684 if (ret) { 683 if (ret) {
685 pr_info("event trace: Could not activate" 684 pr_info("event trace: Could not activate syscall exit trace point");
686 "syscall exit trace point");
687 } else { 685 } else {
688 set_bit(num, enabled_perf_exit_syscalls); 686 set_bit(num, enabled_perf_exit_syscalls);
689 sys_perf_refcount_exit++; 687 sys_perf_refcount_exit++;
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index c53485441c88..0913693caf6e 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -211,6 +211,10 @@ static const struct fetch_type uprobes_fetch_type_table[] = {
211 ASSIGN_FETCH_TYPE(s16, u16, 1), 211 ASSIGN_FETCH_TYPE(s16, u16, 1),
212 ASSIGN_FETCH_TYPE(s32, u32, 1), 212 ASSIGN_FETCH_TYPE(s32, u32, 1),
213 ASSIGN_FETCH_TYPE(s64, u64, 1), 213 ASSIGN_FETCH_TYPE(s64, u64, 1),
214 ASSIGN_FETCH_TYPE_ALIAS(x8, u8, u8, 0),
215 ASSIGN_FETCH_TYPE_ALIAS(x16, u16, u16, 0),
216 ASSIGN_FETCH_TYPE_ALIAS(x32, u32, u32, 0),
217 ASSIGN_FETCH_TYPE_ALIAS(x64, u64, u64, 0),
214 218
215 ASSIGN_FETCH_TYPE_END 219 ASSIGN_FETCH_TYPE_END
216}; 220};
@@ -427,10 +431,6 @@ static int create_trace_uprobe(int argc, char **argv)
427 pr_info("Probe point is not specified.\n"); 431 pr_info("Probe point is not specified.\n");
428 return -EINVAL; 432 return -EINVAL;
429 } 433 }
430 if (isdigit(argv[1][0])) {
431 pr_info("probe point must be have a filename.\n");
432 return -EINVAL;
433 }
434 arg = strchr(argv[1], ':'); 434 arg = strchr(argv[1], ':');
435 if (!arg) { 435 if (!arg) {
436 ret = -EINVAL; 436 ret = -EINVAL;
diff --git a/kernel/ucount.c b/kernel/ucount.c
new file mode 100644
index 000000000000..9d20d5dd298a
--- /dev/null
+++ b/kernel/ucount.c
@@ -0,0 +1,235 @@
1/*
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License as
4 * published by the Free Software Foundation, version 2 of the
5 * License.
6 */
7
8#include <linux/stat.h>
9#include <linux/sysctl.h>
10#include <linux/slab.h>
11#include <linux/hash.h>
12#include <linux/user_namespace.h>
13
14#define UCOUNTS_HASHTABLE_BITS 10
15static struct hlist_head ucounts_hashtable[(1 << UCOUNTS_HASHTABLE_BITS)];
16static DEFINE_SPINLOCK(ucounts_lock);
17
18#define ucounts_hashfn(ns, uid) \
19 hash_long((unsigned long)__kuid_val(uid) + (unsigned long)(ns), \
20 UCOUNTS_HASHTABLE_BITS)
21#define ucounts_hashentry(ns, uid) \
22 (ucounts_hashtable + ucounts_hashfn(ns, uid))
23
24
25#ifdef CONFIG_SYSCTL
26static struct ctl_table_set *
27set_lookup(struct ctl_table_root *root)
28{
29 return &current_user_ns()->set;
30}
31
32static int set_is_seen(struct ctl_table_set *set)
33{
34 return &current_user_ns()->set == set;
35}
36
37static int set_permissions(struct ctl_table_header *head,
38 struct ctl_table *table)
39{
40 struct user_namespace *user_ns =
41 container_of(head->set, struct user_namespace, set);
42 int mode;
43
44 /* Allow users with CAP_SYS_RESOURCE unrestrained access */
45 if (ns_capable(user_ns, CAP_SYS_RESOURCE))
46 mode = (table->mode & S_IRWXU) >> 6;
47 else
48 /* Allow all others at most read-only access */
49 mode = table->mode & S_IROTH;
50 return (mode << 6) | (mode << 3) | mode;
51}
52
53static struct ctl_table_root set_root = {
54 .lookup = set_lookup,
55 .permissions = set_permissions,
56};
57
58static int zero = 0;
59static int int_max = INT_MAX;
60#define UCOUNT_ENTRY(name) \
61 { \
62 .procname = name, \
63 .maxlen = sizeof(int), \
64 .mode = 0644, \
65 .proc_handler = proc_dointvec_minmax, \
66 .extra1 = &zero, \
67 .extra2 = &int_max, \
68 }
69static struct ctl_table user_table[] = {
70 UCOUNT_ENTRY("max_user_namespaces"),
71 UCOUNT_ENTRY("max_pid_namespaces"),
72 UCOUNT_ENTRY("max_uts_namespaces"),
73 UCOUNT_ENTRY("max_ipc_namespaces"),
74 UCOUNT_ENTRY("max_net_namespaces"),
75 UCOUNT_ENTRY("max_mnt_namespaces"),
76 UCOUNT_ENTRY("max_cgroup_namespaces"),
77 { }
78};
79#endif /* CONFIG_SYSCTL */
80
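Once registered (see setup_userns_sysctls() and user_namespace_sysctl_init() below), each entry in this table appears as a per-user-namespace limit under /proc/sys/user/. A minimal sketch of reading one of them from user space; the file name follows from the table above, everything else here is illustrative:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/user/max_user_namespaces", "r");
	long max;

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fscanf(f, "%ld", &max) == 1)
		printf("max_user_namespaces = %ld\n", max);
	fclose(f);
	return 0;
}
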
81bool setup_userns_sysctls(struct user_namespace *ns)
82{
83#ifdef CONFIG_SYSCTL
84 struct ctl_table *tbl;
85 setup_sysctl_set(&ns->set, &set_root, set_is_seen);
86 tbl = kmemdup(user_table, sizeof(user_table), GFP_KERNEL);
87 if (tbl) {
88 int i;
89 for (i = 0; i < UCOUNT_COUNTS; i++) {
90 tbl[i].data = &ns->ucount_max[i];
91 }
92 ns->sysctls = __register_sysctl_table(&ns->set, "user", tbl);
93 }
94 if (!ns->sysctls) {
95 kfree(tbl);
96 retire_sysctl_set(&ns->set);
97 return false;
98 }
99#endif
100 return true;
101}
102
103void retire_userns_sysctls(struct user_namespace *ns)
104{
105#ifdef CONFIG_SYSCTL
106 struct ctl_table *tbl;
107
108 tbl = ns->sysctls->ctl_table_arg;
109 unregister_sysctl_table(ns->sysctls);
110 retire_sysctl_set(&ns->set);
111 kfree(tbl);
112#endif
113}
114
115static struct ucounts *find_ucounts(struct user_namespace *ns, kuid_t uid, struct hlist_head *hashent)
116{
117 struct ucounts *ucounts;
118
119 hlist_for_each_entry(ucounts, hashent, node) {
120 if (uid_eq(ucounts->uid, uid) && (ucounts->ns == ns))
121 return ucounts;
122 }
123 return NULL;
124}
125
126static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid)
127{
128 struct hlist_head *hashent = ucounts_hashentry(ns, uid);
129 struct ucounts *ucounts, *new;
130
131 spin_lock(&ucounts_lock);
132 ucounts = find_ucounts(ns, uid, hashent);
133 if (!ucounts) {
134 spin_unlock(&ucounts_lock);
135
136 new = kzalloc(sizeof(*new), GFP_KERNEL);
137 if (!new)
138 return NULL;
139
140 new->ns = ns;
141 new->uid = uid;
142 atomic_set(&new->count, 0);
143
144 spin_lock(&ucounts_lock);
145 ucounts = find_ucounts(ns, uid, hashent);
146 if (ucounts) {
147 kfree(new);
148 } else {
149 hlist_add_head(&new->node, hashent);
150 ucounts = new;
151 }
152 }
153 if (!atomic_add_unless(&ucounts->count, 1, INT_MAX))
154 ucounts = NULL;
155 spin_unlock(&ucounts_lock);
156 return ucounts;
157}
158
159static void put_ucounts(struct ucounts *ucounts)
160{
161 if (atomic_dec_and_test(&ucounts->count)) {
162 spin_lock(&ucounts_lock);
163 hlist_del_init(&ucounts->node);
164 spin_unlock(&ucounts_lock);
165
166 kfree(ucounts);
167 }
168}
169
170static inline bool atomic_inc_below(atomic_t *v, int u)
171{
172 int c, old;
173 c = atomic_read(v);
174 for (;;) {
175 if (unlikely(c >= u))
176 return false;
177 old = atomic_cmpxchg(v, c, c+1);
178 if (likely(old == c))
179 return true;
180 c = old;
181 }
182}
183
184struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid,
185 enum ucount_type type)
186{
187 struct ucounts *ucounts, *iter, *bad;
188 struct user_namespace *tns;
189 ucounts = get_ucounts(ns, uid);
190 for (iter = ucounts; iter; iter = tns->ucounts) {
191 int max;
192 tns = iter->ns;
193 max = READ_ONCE(tns->ucount_max[type]);
194 if (!atomic_inc_below(&iter->ucount[type], max))
195 goto fail;
196 }
197 return ucounts;
198fail:
199 bad = iter;
200 for (iter = ucounts; iter != bad; iter = iter->ns->ucounts)
201 atomic_dec(&iter->ucount[type]);
202
203 put_ucounts(ucounts);
204 return NULL;
205}
206
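In other words, the charging is hierarchical: creating an object charges the matching count in the caller's namespace and in every ancestor namespace, and if any level is already at its configured maximum the increments made so far are rolled back and the operation fails with NULL. dec_ucount() below undoes the charge at every level when the object is released.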
207void dec_ucount(struct ucounts *ucounts, enum ucount_type type)
208{
209 struct ucounts *iter;
210 for (iter = ucounts; iter; iter = iter->ns->ucounts) {
211 int dec = atomic_dec_if_positive(&iter->ucount[type]);
212 WARN_ON_ONCE(dec < 0);
213 }
214 put_ucounts(ucounts);
215}
216
217static __init int user_namespace_sysctl_init(void)
218{
219#ifdef CONFIG_SYSCTL
220 static struct ctl_table_header *user_header;
221 static struct ctl_table empty[1];
222 /*
223 * It is necessary to register the user directory in the
224 * default set so that registrations in the child sets work
225 * properly.
226 */
227 user_header = register_sysctl("user", empty);
228 BUG_ON(!user_header);
229 BUG_ON(!setup_userns_sysctls(&init_user_ns));
230#endif
231 return 0;
232}
233subsys_initcall(user_namespace_sysctl_init);
234
235
diff --git a/kernel/uid16.c b/kernel/uid16.c
index d58cc4d8f0d1..cc40793464e3 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -117,7 +117,7 @@ static int groups16_to_user(old_gid_t __user *grouplist,
117 kgid_t kgid; 117 kgid_t kgid;
118 118
119 for (i = 0; i < group_info->ngroups; i++) { 119 for (i = 0; i < group_info->ngroups; i++) {
120 kgid = GROUP_AT(group_info, i); 120 kgid = group_info->gid[i];
121 group = high2lowgid(from_kgid_munged(user_ns, kgid)); 121 group = high2lowgid(from_kgid_munged(user_ns, kgid));
122 if (put_user(group, grouplist+i)) 122 if (put_user(group, grouplist+i))
123 return -EFAULT; 123 return -EFAULT;
@@ -142,7 +142,7 @@ static int groups16_from_user(struct group_info *group_info,
142 if (!gid_valid(kgid)) 142 if (!gid_valid(kgid))
143 return -EINVAL; 143 return -EINVAL;
144 144
145 GROUP_AT(group_info, i) = kgid; 145 group_info->gid[i] = kgid;
146 } 146 }
147 147
148 return 0; 148 return 0;
diff --git a/kernel/up.c b/kernel/up.c
index 1760bf3d1463..ee81ac9af4ca 100644
--- a/kernel/up.c
+++ b/kernel/up.c
@@ -6,6 +6,7 @@
6#include <linux/kernel.h> 6#include <linux/kernel.h>
7#include <linux/export.h> 7#include <linux/export.h>
8#include <linux/smp.h> 8#include <linux/smp.h>
9#include <linux/hypervisor.h>
9 10
10int smp_call_function_single(int cpu, void (*func) (void *info), void *info, 11int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
11 int wait) 12 int wait)
@@ -82,3 +83,20 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
82 preempt_enable(); 83 preempt_enable();
83} 84}
84EXPORT_SYMBOL(on_each_cpu_cond); 85EXPORT_SYMBOL(on_each_cpu_cond);
86
87int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys)
88{
89 int ret;
90
91 if (cpu != 0)
92 return -ENXIO;
93
94 if (phys)
95 hypervisor_pin_vcpu(0);
96 ret = func(par);
97 if (phys)
98 hypervisor_pin_vcpu(-1);
99
100 return ret;
101}
102EXPORT_SYMBOL_GPL(smp_call_on_cpu);
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 68f594212759..86b7854fec8e 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -29,6 +29,17 @@ static DEFINE_MUTEX(userns_state_mutex);
29static bool new_idmap_permitted(const struct file *file, 29static bool new_idmap_permitted(const struct file *file,
30 struct user_namespace *ns, int cap_setid, 30 struct user_namespace *ns, int cap_setid,
31 struct uid_gid_map *map); 31 struct uid_gid_map *map);
32static void free_user_ns(struct work_struct *work);
33
34static struct ucounts *inc_user_namespaces(struct user_namespace *ns, kuid_t uid)
35{
36 return inc_ucount(ns, uid, UCOUNT_USER_NAMESPACES);
37}
38
39static void dec_user_namespaces(struct ucounts *ucounts)
40{
41 return dec_ucount(ucounts, UCOUNT_USER_NAMESPACES);
42}
32 43
33static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) 44static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
34{ 45{
@@ -62,10 +73,16 @@ int create_user_ns(struct cred *new)
62 struct user_namespace *ns, *parent_ns = new->user_ns; 73 struct user_namespace *ns, *parent_ns = new->user_ns;
63 kuid_t owner = new->euid; 74 kuid_t owner = new->euid;
64 kgid_t group = new->egid; 75 kgid_t group = new->egid;
65 int ret; 76 struct ucounts *ucounts;
77 int ret, i;
66 78
79 ret = -ENOSPC;
67 if (parent_ns->level > 32) 80 if (parent_ns->level > 32)
68 return -EUSERS; 81 goto fail;
82
83 ucounts = inc_user_namespaces(parent_ns, owner);
84 if (!ucounts)
85 goto fail;
69 86
70 /* 87 /*
71 * Verify that we can not violate the policy of which files 88 * Verify that we can not violate the policy of which files
@@ -73,26 +90,27 @@ int create_user_ns(struct cred *new)
73 * by verifing that the root directory is at the root of the 90 * by verifing that the root directory is at the root of the
74 * mount namespace which allows all files to be accessed. 91 * mount namespace which allows all files to be accessed.
75 */ 92 */
93 ret = -EPERM;
76 if (current_chrooted()) 94 if (current_chrooted())
77 return -EPERM; 95 goto fail_dec;
78 96
79 /* The creator needs a mapping in the parent user namespace 97 /* The creator needs a mapping in the parent user namespace
80 * or else we won't be able to reasonably tell userspace who 98 * or else we won't be able to reasonably tell userspace who
81 * created a user_namespace. 99 * created a user_namespace.
82 */ 100 */
101 ret = -EPERM;
83 if (!kuid_has_mapping(parent_ns, owner) || 102 if (!kuid_has_mapping(parent_ns, owner) ||
84 !kgid_has_mapping(parent_ns, group)) 103 !kgid_has_mapping(parent_ns, group))
85 return -EPERM; 104 goto fail_dec;
86 105
106 ret = -ENOMEM;
87 ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL); 107 ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL);
88 if (!ns) 108 if (!ns)
89 return -ENOMEM; 109 goto fail_dec;
90 110
91 ret = ns_alloc_inum(&ns->ns); 111 ret = ns_alloc_inum(&ns->ns);
92 if (ret) { 112 if (ret)
93 kmem_cache_free(user_ns_cachep, ns); 113 goto fail_free;
94 return ret;
95 }
96 ns->ns.ops = &userns_operations; 114 ns->ns.ops = &userns_operations;
97 115
98 atomic_set(&ns->count, 1); 116 atomic_set(&ns->count, 1);
@@ -101,18 +119,37 @@ int create_user_ns(struct cred *new)
101 ns->level = parent_ns->level + 1; 119 ns->level = parent_ns->level + 1;
102 ns->owner = owner; 120 ns->owner = owner;
103 ns->group = group; 121 ns->group = group;
122 INIT_WORK(&ns->work, free_user_ns);
123 for (i = 0; i < UCOUNT_COUNTS; i++) {
124 ns->ucount_max[i] = INT_MAX;
125 }
126 ns->ucounts = ucounts;
104 127
105 /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ 128 /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */
106 mutex_lock(&userns_state_mutex); 129 mutex_lock(&userns_state_mutex);
107 ns->flags = parent_ns->flags; 130 ns->flags = parent_ns->flags;
108 mutex_unlock(&userns_state_mutex); 131 mutex_unlock(&userns_state_mutex);
109 132
110 set_cred_user_ns(new, ns);
111
112#ifdef CONFIG_PERSISTENT_KEYRINGS 133#ifdef CONFIG_PERSISTENT_KEYRINGS
113 init_rwsem(&ns->persistent_keyring_register_sem); 134 init_rwsem(&ns->persistent_keyring_register_sem);
114#endif 135#endif
136 ret = -ENOMEM;
137 if (!setup_userns_sysctls(ns))
138 goto fail_keyring;
139
140 set_cred_user_ns(new, ns);
115 return 0; 141 return 0;
142fail_keyring:
143#ifdef CONFIG_PERSISTENT_KEYRINGS
144 key_put(ns->persistent_keyring_register);
145#endif
146 ns_free_inum(&ns->ns);
147fail_free:
148 kmem_cache_free(user_ns_cachep, ns);
149fail_dec:
150 dec_user_namespaces(ucounts);
151fail:
152 return ret;
116} 153}
117 154
118int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) 155int unshare_userns(unsigned long unshare_flags, struct cred **new_cred)
@@ -135,21 +172,30 @@ int unshare_userns(unsigned long unshare_flags, struct cred **new_cred)
135 return err; 172 return err;
136} 173}
137 174
138void free_user_ns(struct user_namespace *ns) 175static void free_user_ns(struct work_struct *work)
139{ 176{
140 struct user_namespace *parent; 177 struct user_namespace *parent, *ns =
178 container_of(work, struct user_namespace, work);
141 179
142 do { 180 do {
181 struct ucounts *ucounts = ns->ucounts;
143 parent = ns->parent; 182 parent = ns->parent;
183 retire_userns_sysctls(ns);
144#ifdef CONFIG_PERSISTENT_KEYRINGS 184#ifdef CONFIG_PERSISTENT_KEYRINGS
145 key_put(ns->persistent_keyring_register); 185 key_put(ns->persistent_keyring_register);
146#endif 186#endif
147 ns_free_inum(&ns->ns); 187 ns_free_inum(&ns->ns);
148 kmem_cache_free(user_ns_cachep, ns); 188 kmem_cache_free(user_ns_cachep, ns);
189 dec_user_namespaces(ucounts);
149 ns = parent; 190 ns = parent;
150 } while (atomic_dec_and_test(&parent->count)); 191 } while (atomic_dec_and_test(&parent->count));
151} 192}
152EXPORT_SYMBOL(free_user_ns); 193
194void __put_user_ns(struct user_namespace *ns)
195{
196 schedule_work(&ns->work);
197}
198EXPORT_SYMBOL(__put_user_ns);
153 199
154static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) 200static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count)
155{ 201{
@@ -1004,12 +1050,37 @@ static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns)
1004 return commit_creds(cred); 1050 return commit_creds(cred);
1005} 1051}
1006 1052
1053struct ns_common *ns_get_owner(struct ns_common *ns)
1054{
1055 struct user_namespace *my_user_ns = current_user_ns();
1056 struct user_namespace *owner, *p;
1057
1058 /* See if the owner is in the current user namespace */
1059 owner = p = ns->ops->owner(ns);
1060 for (;;) {
1061 if (!p)
1062 return ERR_PTR(-EPERM);
1063 if (p == my_user_ns)
1064 break;
1065 p = p->parent;
1066 }
1067
1068 return &get_user_ns(owner)->ns;
1069}
1070
1071static struct user_namespace *userns_owner(struct ns_common *ns)
1072{
1073 return to_user_ns(ns)->parent;
1074}
1075
1007const struct proc_ns_operations userns_operations = { 1076const struct proc_ns_operations userns_operations = {
1008 .name = "user", 1077 .name = "user",
1009 .type = CLONE_NEWUSER, 1078 .type = CLONE_NEWUSER,
1010 .get = userns_get, 1079 .get = userns_get,
1011 .put = userns_put, 1080 .put = userns_put,
1012 .install = userns_install, 1081 .install = userns_install,
1082 .owner = userns_owner,
1083 .get_parent = ns_get_owner,
1013}; 1084};
1014 1085
1015static __init int user_namespaces_init(void) 1086static __init int user_namespaces_init(void)
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 831ea7108232..6976cd47dcf6 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -17,6 +17,16 @@
17#include <linux/user_namespace.h> 17#include <linux/user_namespace.h>
18#include <linux/proc_ns.h> 18#include <linux/proc_ns.h>
19 19
20static struct ucounts *inc_uts_namespaces(struct user_namespace *ns)
21{
22 return inc_ucount(ns, current_euid(), UCOUNT_UTS_NAMESPACES);
23}
24
25static void dec_uts_namespaces(struct ucounts *ucounts)
26{
27 dec_ucount(ucounts, UCOUNT_UTS_NAMESPACES);
28}
29
20static struct uts_namespace *create_uts_ns(void) 30static struct uts_namespace *create_uts_ns(void)
21{ 31{
22 struct uts_namespace *uts_ns; 32 struct uts_namespace *uts_ns;
@@ -36,18 +46,24 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns,
36 struct uts_namespace *old_ns) 46 struct uts_namespace *old_ns)
37{ 47{
38 struct uts_namespace *ns; 48 struct uts_namespace *ns;
49 struct ucounts *ucounts;
39 int err; 50 int err;
40 51
52 err = -ENOSPC;
53 ucounts = inc_uts_namespaces(user_ns);
54 if (!ucounts)
55 goto fail;
56
57 err = -ENOMEM;
41 ns = create_uts_ns(); 58 ns = create_uts_ns();
42 if (!ns) 59 if (!ns)
43 return ERR_PTR(-ENOMEM); 60 goto fail_dec;
44 61
45 err = ns_alloc_inum(&ns->ns); 62 err = ns_alloc_inum(&ns->ns);
46 if (err) { 63 if (err)
47 kfree(ns); 64 goto fail_free;
48 return ERR_PTR(err);
49 }
50 65
66 ns->ucounts = ucounts;
51 ns->ns.ops = &utsns_operations; 67 ns->ns.ops = &utsns_operations;
52 68
53 down_read(&uts_sem); 69 down_read(&uts_sem);
@@ -55,6 +71,13 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns,
55 ns->user_ns = get_user_ns(user_ns); 71 ns->user_ns = get_user_ns(user_ns);
56 up_read(&uts_sem); 72 up_read(&uts_sem);
57 return ns; 73 return ns;
74
75fail_free:
76 kfree(ns);
77fail_dec:
78 dec_uts_namespaces(ucounts);
79fail:
80 return ERR_PTR(err);
58} 81}
59 82
60/* 83/*
@@ -85,6 +108,7 @@ void free_uts_ns(struct kref *kref)
85 struct uts_namespace *ns; 108 struct uts_namespace *ns;
86 109
87 ns = container_of(kref, struct uts_namespace, kref); 110 ns = container_of(kref, struct uts_namespace, kref);
111 dec_uts_namespaces(ns->ucounts);
88 put_user_ns(ns->user_ns); 112 put_user_ns(ns->user_ns);
89 ns_free_inum(&ns->ns); 113 ns_free_inum(&ns->ns);
90 kfree(ns); 114 kfree(ns);
@@ -130,10 +154,16 @@ static int utsns_install(struct nsproxy *nsproxy, struct ns_common *new)
130 return 0; 154 return 0;
131} 155}
132 156
157static struct user_namespace *utsns_owner(struct ns_common *ns)
158{
159 return to_uts_ns(ns)->user_ns;
160}
161
133const struct proc_ns_operations utsns_operations = { 162const struct proc_ns_operations utsns_operations = {
134 .name = "uts", 163 .name = "uts",
135 .type = CLONE_NEWUTS, 164 .type = CLONE_NEWUTS,
136 .get = utsns_get, 165 .get = utsns_get,
137 .put = utsns_put, 166 .put = utsns_put,
138 .install = utsns_install, 167 .install = utsns_install,
168 .owner = utsns_owner,
139}; 169};
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ef071ca73fc3..479d840db286 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2974,6 +2974,31 @@ bool flush_delayed_work(struct delayed_work *dwork)
2974} 2974}
2975EXPORT_SYMBOL(flush_delayed_work); 2975EXPORT_SYMBOL(flush_delayed_work);
2976 2976
2977static bool __cancel_work(struct work_struct *work, bool is_dwork)
2978{
2979 unsigned long flags;
2980 int ret;
2981
2982 do {
2983 ret = try_to_grab_pending(work, is_dwork, &flags);
2984 } while (unlikely(ret == -EAGAIN));
2985
2986 if (unlikely(ret < 0))
2987 return false;
2988
2989 set_work_pool_and_clear_pending(work, get_work_pool_id(work));
2990 local_irq_restore(flags);
2991 return ret;
2992}
2993
2994/*
2995 * See cancel_delayed_work()
2996 */
2997bool cancel_work(struct work_struct *work)
2998{
2999 return __cancel_work(work, false);
3000}
3001
2977/** 3002/**
2978 * cancel_delayed_work - cancel a delayed work 3003 * cancel_delayed_work - cancel a delayed work
2979 * @dwork: delayed_work to cancel 3004 * @dwork: delayed_work to cancel
@@ -2992,20 +3017,7 @@ EXPORT_SYMBOL(flush_delayed_work);
2992 */ 3017 */
2993bool cancel_delayed_work(struct delayed_work *dwork) 3018bool cancel_delayed_work(struct delayed_work *dwork)
2994{ 3019{
2995 unsigned long flags; 3020 return __cancel_work(&dwork->work, true);
2996 int ret;
2997
2998 do {
2999 ret = try_to_grab_pending(&dwork->work, true, &flags);
3000 } while (unlikely(ret == -EAGAIN));
3001
3002 if (unlikely(ret < 0))
3003 return false;
3004
3005 set_work_pool_and_clear_pending(&dwork->work,
3006 get_work_pool_id(&dwork->work));
3007 local_irq_restore(flags);
3008 return ret;
3009} 3021}
3010EXPORT_SYMBOL(cancel_delayed_work); 3022EXPORT_SYMBOL(cancel_delayed_work);
3011 3023
@@ -4249,7 +4261,7 @@ void print_worker_info(const char *log_lvl, struct task_struct *task)
4249 * This function is called without any synchronization and @task 4261 * This function is called without any synchronization and @task
4250 * could be in any state. Be careful with dereferences. 4262 * could be in any state. Be careful with dereferences.
4251 */ 4263 */
4252 worker = probe_kthread_data(task); 4264 worker = kthread_probe_data(task);
4253 4265
4254 /* 4266 /*
4255 * Carefully copy the associated workqueue's workfn and name. Keep 4267 * Carefully copy the associated workqueue's workfn and name. Keep