Diffstat (limited to 'kernel')
122 files changed, 6346 insertions, 2580 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index e2ec54e2b952..eb26e12c6c2a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,7 +9,7 @@ obj-y = fork.o exec_domain.o panic.o \ | |||
9 | extable.o params.o \ | 9 | extable.o params.o \ |
10 | kthread.o sys_ni.o nsproxy.o \ | 10 | kthread.o sys_ni.o nsproxy.o \ |
11 | notifier.o ksysfs.o cred.o reboot.o \ | 11 | notifier.o ksysfs.o cred.o reboot.o \ |
12 | async.o range.o smpboot.o | 12 | async.o range.o smpboot.o ucount.o |
13 | 13 | ||
14 | obj-$(CONFIG_MULTIUSER) += groups.o | 14 | obj-$(CONFIG_MULTIUSER) += groups.o |
15 | 15 | ||
diff --git a/kernel/audit.c b/kernel/audit.c
index a8a91bd2b2a9..f1ca11613379 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -877,6 +877,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
877 | return err; | 877 | return err; |
878 | } | 878 | } |
879 | if (s.mask & AUDIT_STATUS_PID) { | 879 | if (s.mask & AUDIT_STATUS_PID) { |
880 | /* NOTE: we are using task_tgid_vnr() below because | ||
881 | * the s.pid value is relative to the namespace | ||
882 | * of the caller; at present this doesn't matter | ||
883 | * much since you can really only run auditd | ||
884 | * from the initial pid namespace, but something | ||
885 | * to keep in mind if this changes */ | ||
880 | int new_pid = s.pid; | 886 | int new_pid = s.pid; |
881 | pid_t requesting_pid = task_tgid_vnr(current); | 887 | pid_t requesting_pid = task_tgid_vnr(current); |
882 | 888 | ||
@@ -1917,7 +1923,7 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) | |||
1917 | " euid=%u suid=%u fsuid=%u" | 1923 | " euid=%u suid=%u fsuid=%u" |
1918 | " egid=%u sgid=%u fsgid=%u tty=%s ses=%u", | 1924 | " egid=%u sgid=%u fsgid=%u tty=%s ses=%u", |
1919 | task_ppid_nr(tsk), | 1925 | task_ppid_nr(tsk), |
1920 | task_pid_nr(tsk), | 1926 | task_tgid_nr(tsk), |
1921 | from_kuid(&init_user_ns, audit_get_loginuid(tsk)), | 1927 | from_kuid(&init_user_ns, audit_get_loginuid(tsk)), |
1922 | from_kuid(&init_user_ns, cred->uid), | 1928 | from_kuid(&init_user_ns, cred->uid), |
1923 | from_kgid(&init_user_ns, cred->gid), | 1929 | from_kgid(&init_user_ns, cred->gid), |
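The audit.c hunks above and the auditsc.c hunks that follow replace task_pid_nr() with task_tgid_nr(), so the logged pid= value is the thread-group id (what userspace calls the process id) rather than the id of the individual thread that generated the record. A minimal sketch of the difference, assuming the usual accessors from <linux/sched.h> (the sketch_ names are illustrative, not kernel functions):

    /* Sketch only: simplified forms of the two accessors. Every thread in a
     * process shares tsk->tgid, but each thread has its own tsk->pid.
     */
    static inline pid_t sketch_task_pid_nr(struct task_struct *tsk)
    {
            return tsk->pid;        /* per-thread id */
    }

    static inline pid_t sketch_task_tgid_nr(struct task_struct *tsk)
    {
            return tsk->tgid;       /* thread-group (process) id */
    }

For a single-threaded process the two values are identical; they only diverge for records generated by secondary threads.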
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 5abf1dc1f91c..2cd5256dbff7 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -457,7 +457,7 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
457 | 457 | ||
458 | switch (f->type) { | 458 | switch (f->type) { |
459 | case AUDIT_PID: | 459 | case AUDIT_PID: |
460 | pid = task_pid_nr(tsk); | 460 | pid = task_tgid_nr(tsk); |
461 | result = audit_comparator(pid, f->op, f->val); | 461 | result = audit_comparator(pid, f->op, f->val); |
462 | break; | 462 | break; |
463 | case AUDIT_PPID: | 463 | case AUDIT_PPID: |
@@ -1993,7 +1993,7 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, | |||
1993 | loginuid = from_kuid(&init_user_ns, kloginuid), | 1993 | loginuid = from_kuid(&init_user_ns, kloginuid), |
1994 | tty = audit_get_tty(current); | 1994 | tty = audit_get_tty(current); |
1995 | 1995 | ||
1996 | audit_log_format(ab, "pid=%d uid=%u", task_pid_nr(current), uid); | 1996 | audit_log_format(ab, "pid=%d uid=%u", task_tgid_nr(current), uid); |
1997 | audit_log_task_context(ab); | 1997 | audit_log_task_context(ab); |
1998 | audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d", | 1998 | audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d", |
1999 | oldloginuid, loginuid, tty ? tty_name(tty) : "(none)", | 1999 | oldloginuid, loginuid, tty ? tty_name(tty) : "(none)", |
@@ -2220,7 +2220,7 @@ void __audit_ptrace(struct task_struct *t) | |||
2220 | { | 2220 | { |
2221 | struct audit_context *context = current->audit_context; | 2221 | struct audit_context *context = current->audit_context; |
2222 | 2222 | ||
2223 | context->target_pid = task_pid_nr(t); | 2223 | context->target_pid = task_tgid_nr(t); |
2224 | context->target_auid = audit_get_loginuid(t); | 2224 | context->target_auid = audit_get_loginuid(t); |
2225 | context->target_uid = task_uid(t); | 2225 | context->target_uid = task_uid(t); |
2226 | context->target_sessionid = audit_get_sessionid(t); | 2226 | context->target_sessionid = audit_get_sessionid(t); |
@@ -2245,7 +2245,7 @@ int __audit_signal_info(int sig, struct task_struct *t) | |||
2245 | 2245 | ||
2246 | if (audit_pid && t->tgid == audit_pid) { | 2246 | if (audit_pid && t->tgid == audit_pid) { |
2247 | if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) { | 2247 | if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) { |
2248 | audit_sig_pid = task_pid_nr(tsk); | 2248 | audit_sig_pid = task_tgid_nr(tsk); |
2249 | if (uid_valid(tsk->loginuid)) | 2249 | if (uid_valid(tsk->loginuid)) |
2250 | audit_sig_uid = tsk->loginuid; | 2250 | audit_sig_uid = tsk->loginuid; |
2251 | else | 2251 | else |
@@ -2345,7 +2345,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm, | |||
2345 | void __audit_log_capset(const struct cred *new, const struct cred *old) | 2345 | void __audit_log_capset(const struct cred *new, const struct cred *old) |
2346 | { | 2346 | { |
2347 | struct audit_context *context = current->audit_context; | 2347 | struct audit_context *context = current->audit_context; |
2348 | context->capset.pid = task_pid_nr(current); | 2348 | context->capset.pid = task_tgid_nr(current); |
2349 | context->capset.cap.effective = new->cap_effective; | 2349 | context->capset.cap.effective = new->cap_effective; |
2350 | context->capset.cap.inheritable = new->cap_effective; | 2350 | context->capset.cap.inheritable = new->cap_effective; |
2351 | context->capset.cap.permitted = new->cap_permitted; | 2351 | context->capset.cap.permitted = new->cap_permitted; |
@@ -2377,7 +2377,7 @@ static void audit_log_task(struct audit_buffer *ab) | |||
2377 | from_kgid(&init_user_ns, gid), | 2377 | from_kgid(&init_user_ns, gid), |
2378 | sessionid); | 2378 | sessionid); |
2379 | audit_log_task_context(ab); | 2379 | audit_log_task_context(ab); |
2380 | audit_log_format(ab, " pid=%d comm=", task_pid_nr(current)); | 2380 | audit_log_format(ab, " pid=%d comm=", task_tgid_nr(current)); |
2381 | audit_log_untrustedstring(ab, get_task_comm(comm, current)); | 2381 | audit_log_untrustedstring(ab, get_task_comm(comm, current)); |
2382 | audit_log_d_path_exe(ab, current->mm); | 2382 | audit_log_d_path_exe(ab, current->mm); |
2383 | } | 2383 | } |
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 633a650d7aeb..a2ac051c342f 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -538,7 +538,7 @@ static int __init register_perf_event_array_map(void) | |||
538 | } | 538 | } |
539 | late_initcall(register_perf_event_array_map); | 539 | late_initcall(register_perf_event_array_map); |
540 | 540 | ||
541 | #ifdef CONFIG_SOCK_CGROUP_DATA | 541 | #ifdef CONFIG_CGROUPS |
542 | static void *cgroup_fd_array_get_ptr(struct bpf_map *map, | 542 | static void *cgroup_fd_array_get_ptr(struct bpf_map *map, |
543 | struct file *map_file /* not used */, | 543 | struct file *map_file /* not used */, |
544 | int fd) | 544 | int fd) |
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 03fd23d4d587..aa6d98154106 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1018,7 +1018,7 @@ void bpf_user_rnd_init_once(void) | |||
1018 | prandom_init_once(&bpf_user_rnd_state); | 1018 | prandom_init_once(&bpf_user_rnd_state); |
1019 | } | 1019 | } |
1020 | 1020 | ||
1021 | u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | 1021 | BPF_CALL_0(bpf_user_rnd_u32) |
1022 | { | 1022 | { |
1023 | /* Should someone ever have the rather unwise idea to use some | 1023 | /* Should someone ever have the rather unwise idea to use some |
1024 | * of the registers passed into this function, then note that | 1024 | * of the registers passed into this function, then note that |
@@ -1031,7 +1031,7 @@ u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | |||
1031 | 1031 | ||
1032 | state = &get_cpu_var(bpf_user_rnd_state); | 1032 | state = &get_cpu_var(bpf_user_rnd_state); |
1033 | res = prandom_u32_state(state); | 1033 | res = prandom_u32_state(state); |
1034 | put_cpu_var(state); | 1034 | put_cpu_var(bpf_user_rnd_state); |
1035 | 1035 | ||
1036 | return res; | 1036 | return res; |
1037 | } | 1037 | } |
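The second core.c hunk pairs put_cpu_var() with the per-CPU variable itself rather than with the local pointer obtained from get_cpu_var(); both macros take the name of the per-CPU variable. A minimal sketch of the pairing, with an illustrative variable name standing in for bpf_user_rnd_state:

    /* Sketch only: a per-CPU PRNG state accessed with preemption disabled. */
    static DEFINE_PER_CPU(struct rnd_state, sketch_rnd_state);

    static u32 sketch_user_rnd_u32(void)
    {
            struct rnd_state *state;
            u32 res;

            state = &get_cpu_var(sketch_rnd_state);  /* disables preemption */
            res = prandom_u32_state(state);
            put_cpu_var(sketch_rnd_state);           /* names the same variable, re-enables preemption */
            return res;
    }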
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 570eeca7bdfa..ad1bc67aff1b 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -687,7 +687,8 @@ static void delete_all_elements(struct bpf_htab *htab) | |||
687 | 687 | ||
688 | hlist_for_each_entry_safe(l, n, head, hash_node) { | 688 | hlist_for_each_entry_safe(l, n, head, hash_node) { |
689 | hlist_del_rcu(&l->hash_node); | 689 | hlist_del_rcu(&l->hash_node); |
690 | htab_elem_free(htab, l); | 690 | if (l->state != HTAB_EXTRA_ELEM_USED) |
691 | htab_elem_free(htab, l); | ||
691 | } | 692 | } |
692 | } | 693 | } |
693 | } | 694 | } |
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 1ea3afba1a4f..39918402e6e9 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -16,6 +16,7 @@ | |||
16 | #include <linux/ktime.h> | 16 | #include <linux/ktime.h> |
17 | #include <linux/sched.h> | 17 | #include <linux/sched.h> |
18 | #include <linux/uidgid.h> | 18 | #include <linux/uidgid.h> |
19 | #include <linux/filter.h> | ||
19 | 20 | ||
20 | /* If kernel subsystem is allowing eBPF programs to call this function, | 21 | /* If kernel subsystem is allowing eBPF programs to call this function, |
21 | * inside its own verifier_ops->get_func_proto() callback it should return | 22 | * inside its own verifier_ops->get_func_proto() callback it should return |
@@ -26,48 +27,32 @@ | |||
26 | * if program is allowed to access maps, so check rcu_read_lock_held in | 27 | * if program is allowed to access maps, so check rcu_read_lock_held in |
27 | * all three functions. | 28 | * all three functions. |
28 | */ | 29 | */ |
29 | static u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | 30 | BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key) |
30 | { | 31 | { |
31 | /* verifier checked that R1 contains a valid pointer to bpf_map | ||
32 | * and R2 points to a program stack and map->key_size bytes were | ||
33 | * initialized | ||
34 | */ | ||
35 | struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; | ||
36 | void *key = (void *) (unsigned long) r2; | ||
37 | void *value; | ||
38 | |||
39 | WARN_ON_ONCE(!rcu_read_lock_held()); | 32 | WARN_ON_ONCE(!rcu_read_lock_held()); |
40 | 33 | return (unsigned long) map->ops->map_lookup_elem(map, key); | |
41 | value = map->ops->map_lookup_elem(map, key); | ||
42 | |||
43 | /* lookup() returns either pointer to element value or NULL | ||
44 | * which is the meaning of PTR_TO_MAP_VALUE_OR_NULL type | ||
45 | */ | ||
46 | return (unsigned long) value; | ||
47 | } | 34 | } |
48 | 35 | ||
49 | const struct bpf_func_proto bpf_map_lookup_elem_proto = { | 36 | const struct bpf_func_proto bpf_map_lookup_elem_proto = { |
50 | .func = bpf_map_lookup_elem, | 37 | .func = bpf_map_lookup_elem, |
51 | .gpl_only = false, | 38 | .gpl_only = false, |
39 | .pkt_access = true, | ||
52 | .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, | 40 | .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, |
53 | .arg1_type = ARG_CONST_MAP_PTR, | 41 | .arg1_type = ARG_CONST_MAP_PTR, |
54 | .arg2_type = ARG_PTR_TO_MAP_KEY, | 42 | .arg2_type = ARG_PTR_TO_MAP_KEY, |
55 | }; | 43 | }; |
56 | 44 | ||
57 | static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | 45 | BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key, |
46 | void *, value, u64, flags) | ||
58 | { | 47 | { |
59 | struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; | ||
60 | void *key = (void *) (unsigned long) r2; | ||
61 | void *value = (void *) (unsigned long) r3; | ||
62 | |||
63 | WARN_ON_ONCE(!rcu_read_lock_held()); | 48 | WARN_ON_ONCE(!rcu_read_lock_held()); |
64 | 49 | return map->ops->map_update_elem(map, key, value, flags); | |
65 | return map->ops->map_update_elem(map, key, value, r4); | ||
66 | } | 50 | } |
67 | 51 | ||
68 | const struct bpf_func_proto bpf_map_update_elem_proto = { | 52 | const struct bpf_func_proto bpf_map_update_elem_proto = { |
69 | .func = bpf_map_update_elem, | 53 | .func = bpf_map_update_elem, |
70 | .gpl_only = false, | 54 | .gpl_only = false, |
55 | .pkt_access = true, | ||
71 | .ret_type = RET_INTEGER, | 56 | .ret_type = RET_INTEGER, |
72 | .arg1_type = ARG_CONST_MAP_PTR, | 57 | .arg1_type = ARG_CONST_MAP_PTR, |
73 | .arg2_type = ARG_PTR_TO_MAP_KEY, | 58 | .arg2_type = ARG_PTR_TO_MAP_KEY, |
@@ -75,19 +60,16 @@ const struct bpf_func_proto bpf_map_update_elem_proto = { | |||
75 | .arg4_type = ARG_ANYTHING, | 60 | .arg4_type = ARG_ANYTHING, |
76 | }; | 61 | }; |
77 | 62 | ||
78 | static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | 63 | BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key) |
79 | { | 64 | { |
80 | struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; | ||
81 | void *key = (void *) (unsigned long) r2; | ||
82 | |||
83 | WARN_ON_ONCE(!rcu_read_lock_held()); | 65 | WARN_ON_ONCE(!rcu_read_lock_held()); |
84 | |||
85 | return map->ops->map_delete_elem(map, key); | 66 | return map->ops->map_delete_elem(map, key); |
86 | } | 67 | } |
87 | 68 | ||
88 | const struct bpf_func_proto bpf_map_delete_elem_proto = { | 69 | const struct bpf_func_proto bpf_map_delete_elem_proto = { |
89 | .func = bpf_map_delete_elem, | 70 | .func = bpf_map_delete_elem, |
90 | .gpl_only = false, | 71 | .gpl_only = false, |
72 | .pkt_access = true, | ||
91 | .ret_type = RET_INTEGER, | 73 | .ret_type = RET_INTEGER, |
92 | .arg1_type = ARG_CONST_MAP_PTR, | 74 | .arg1_type = ARG_CONST_MAP_PTR, |
93 | .arg2_type = ARG_PTR_TO_MAP_KEY, | 75 | .arg2_type = ARG_PTR_TO_MAP_KEY, |
@@ -99,7 +81,7 @@ const struct bpf_func_proto bpf_get_prandom_u32_proto = { | |||
99 | .ret_type = RET_INTEGER, | 81 | .ret_type = RET_INTEGER, |
100 | }; | 82 | }; |
101 | 83 | ||
102 | static u64 bpf_get_smp_processor_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | 84 | BPF_CALL_0(bpf_get_smp_processor_id) |
103 | { | 85 | { |
104 | return smp_processor_id(); | 86 | return smp_processor_id(); |
105 | } | 87 | } |
@@ -110,7 +92,7 @@ const struct bpf_func_proto bpf_get_smp_processor_id_proto = { | |||
110 | .ret_type = RET_INTEGER, | 92 | .ret_type = RET_INTEGER, |
111 | }; | 93 | }; |
112 | 94 | ||
113 | static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | 95 | BPF_CALL_0(bpf_ktime_get_ns) |
114 | { | 96 | { |
115 | /* NMI safe access to clock monotonic */ | 97 | /* NMI safe access to clock monotonic */ |
116 | return ktime_get_mono_fast_ns(); | 98 | return ktime_get_mono_fast_ns(); |
@@ -122,11 +104,11 @@ const struct bpf_func_proto bpf_ktime_get_ns_proto = { | |||
122 | .ret_type = RET_INTEGER, | 104 | .ret_type = RET_INTEGER, |
123 | }; | 105 | }; |
124 | 106 | ||
125 | static u64 bpf_get_current_pid_tgid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | 107 | BPF_CALL_0(bpf_get_current_pid_tgid) |
126 | { | 108 | { |
127 | struct task_struct *task = current; | 109 | struct task_struct *task = current; |
128 | 110 | ||
129 | if (!task) | 111 | if (unlikely(!task)) |
130 | return -EINVAL; | 112 | return -EINVAL; |
131 | 113 | ||
132 | return (u64) task->tgid << 32 | task->pid; | 114 | return (u64) task->tgid << 32 | task->pid; |
@@ -138,18 +120,18 @@ const struct bpf_func_proto bpf_get_current_pid_tgid_proto = { | |||
138 | .ret_type = RET_INTEGER, | 120 | .ret_type = RET_INTEGER, |
139 | }; | 121 | }; |
140 | 122 | ||
141 | static u64 bpf_get_current_uid_gid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | 123 | BPF_CALL_0(bpf_get_current_uid_gid) |
142 | { | 124 | { |
143 | struct task_struct *task = current; | 125 | struct task_struct *task = current; |
144 | kuid_t uid; | 126 | kuid_t uid; |
145 | kgid_t gid; | 127 | kgid_t gid; |
146 | 128 | ||
147 | if (!task) | 129 | if (unlikely(!task)) |
148 | return -EINVAL; | 130 | return -EINVAL; |
149 | 131 | ||
150 | current_uid_gid(&uid, &gid); | 132 | current_uid_gid(&uid, &gid); |
151 | return (u64) from_kgid(&init_user_ns, gid) << 32 | | 133 | return (u64) from_kgid(&init_user_ns, gid) << 32 | |
152 | from_kuid(&init_user_ns, uid); | 134 | from_kuid(&init_user_ns, uid); |
153 | } | 135 | } |
154 | 136 | ||
155 | const struct bpf_func_proto bpf_get_current_uid_gid_proto = { | 137 | const struct bpf_func_proto bpf_get_current_uid_gid_proto = { |
@@ -158,10 +140,9 @@ const struct bpf_func_proto bpf_get_current_uid_gid_proto = { | |||
158 | .ret_type = RET_INTEGER, | 140 | .ret_type = RET_INTEGER, |
159 | }; | 141 | }; |
160 | 142 | ||
161 | static u64 bpf_get_current_comm(u64 r1, u64 size, u64 r3, u64 r4, u64 r5) | 143 | BPF_CALL_2(bpf_get_current_comm, char *, buf, u32, size) |
162 | { | 144 | { |
163 | struct task_struct *task = current; | 145 | struct task_struct *task = current; |
164 | char *buf = (char *) (long) r1; | ||
165 | 146 | ||
166 | if (unlikely(!task)) | 147 | if (unlikely(!task)) |
167 | goto err_clear; | 148 | goto err_clear; |
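All of the helpers above move from the open-coded u64 fn(u64 r1, ..., u64 r5) signature to the BPF_CALL_n() macros (hence the new <linux/filter.h> include), which generate the register-to-typed-argument casts each helper previously wrote by hand. Roughly, and glossing over the macro's extra type plumbing, BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key) corresponds to a pair of functions like the following hand-written approximation (not the exact macro expansion):

    /* Typed inner function: what the helper author actually writes. */
    static u64 ____bpf_map_delete_elem(struct bpf_map *map, void *key)
    {
            WARN_ON_ONCE(!rcu_read_lock_held());
            return map->ops->map_delete_elem(map, key);
    }

    /* u64 wrapper: keeps the calling convention the interpreter and JITs expect. */
    static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
    {
            return ____bpf_map_delete_elem((struct bpf_map *)(unsigned long)r1,
                                           (void *)(unsigned long)r2);
    }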
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 5967b870a895..1ed8473ec537 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -97,7 +97,7 @@ static struct inode *bpf_get_inode(struct super_block *sb, | |||
97 | return ERR_PTR(-ENOSPC); | 97 | return ERR_PTR(-ENOSPC); |
98 | 98 | ||
99 | inode->i_ino = get_next_ino(); | 99 | inode->i_ino = get_next_ino(); |
100 | inode->i_atime = CURRENT_TIME; | 100 | inode->i_atime = current_time(inode); |
101 | inode->i_mtime = inode->i_atime; | 101 | inode->i_mtime = inode->i_atime; |
102 | inode->i_ctime = inode->i_atime; | 102 | inode->i_ctime = inode->i_atime; |
103 | 103 | ||
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index bf4495fcd25d..732ae16d12b7 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -116,10 +116,9 @@ free_smap: | |||
116 | return ERR_PTR(err); | 116 | return ERR_PTR(err); |
117 | } | 117 | } |
118 | 118 | ||
119 | u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) | 119 | BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, |
120 | u64, flags) | ||
120 | { | 121 | { |
121 | struct pt_regs *regs = (struct pt_regs *) (long) r1; | ||
122 | struct bpf_map *map = (struct bpf_map *) (long) r2; | ||
123 | struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); | 122 | struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); |
124 | struct perf_callchain_entry *trace; | 123 | struct perf_callchain_entry *trace; |
125 | struct stack_map_bucket *bucket, *new_bucket, *old_bucket; | 124 | struct stack_map_bucket *bucket, *new_bucket, *old_bucket; |
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 228f962447a5..237f3d6a7ddc 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -194,7 +194,7 @@ static int map_create(union bpf_attr *attr) | |||
194 | 194 | ||
195 | err = bpf_map_charge_memlock(map); | 195 | err = bpf_map_charge_memlock(map); |
196 | if (err) | 196 | if (err) |
197 | goto free_map; | 197 | goto free_map_nouncharge; |
198 | 198 | ||
199 | err = bpf_map_new_fd(map); | 199 | err = bpf_map_new_fd(map); |
200 | if (err < 0) | 200 | if (err < 0) |
@@ -204,6 +204,8 @@ static int map_create(union bpf_attr *attr) | |||
204 | return err; | 204 | return err; |
205 | 205 | ||
206 | free_map: | 206 | free_map: |
207 | bpf_map_uncharge_memlock(map); | ||
208 | free_map_nouncharge: | ||
207 | map->ops->map_free(map); | 209 | map->ops->map_free(map); |
208 | return err; | 210 | return err; |
209 | } | 211 | } |
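The map_create() change above is the usual layered error path: bpf_map_charge_memlock() must only be undone once it has actually succeeded, so a failed charge now jumps to a label below the uncharge. A minimal sketch of the pattern with made-up helper names (nothing here is from the patch except the shape of the unwind):

    static int sketch_alloc(void), sketch_charge(void), sketch_install_fd(void);
    static void sketch_uncharge(void), sketch_free(void);

    /* Each label undoes exactly the steps that completed before the failure. */
    static int sketch_create(void)
    {
            int err;

            err = sketch_alloc();           /* like find_and_alloc_map() */
            if (err)
                    return err;

            err = sketch_charge();          /* like bpf_map_charge_memlock() */
            if (err)
                    goto free_nocharge;     /* nothing was charged yet */

            err = sketch_install_fd();      /* like bpf_map_new_fd() */
            if (err < 0)
                    goto free;

            return err;

    free:
            sketch_uncharge();              /* like bpf_map_uncharge_memlock() */
    free_nocharge:
            sketch_free();                  /* like map->ops->map_free() */
            return err;
    }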
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index daea765d72e6..6a936159c6e0 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -14,6 +14,7 @@ | |||
14 | #include <linux/types.h> | 14 | #include <linux/types.h> |
15 | #include <linux/slab.h> | 15 | #include <linux/slab.h> |
16 | #include <linux/bpf.h> | 16 | #include <linux/bpf.h> |
17 | #include <linux/bpf_verifier.h> | ||
17 | #include <linux/filter.h> | 18 | #include <linux/filter.h> |
18 | #include <net/netlink.h> | 19 | #include <net/netlink.h> |
19 | #include <linux/file.h> | 20 | #include <linux/file.h> |
@@ -126,76 +127,16 @@ | |||
126 | * are set to NOT_INIT to indicate that they are no longer readable. | 127 | * are set to NOT_INIT to indicate that they are no longer readable. |
127 | */ | 128 | */ |
128 | 129 | ||
129 | struct reg_state { | ||
130 | enum bpf_reg_type type; | ||
131 | union { | ||
132 | /* valid when type == CONST_IMM | PTR_TO_STACK | UNKNOWN_VALUE */ | ||
133 | s64 imm; | ||
134 | |||
135 | /* valid when type == PTR_TO_PACKET* */ | ||
136 | struct { | ||
137 | u32 id; | ||
138 | u16 off; | ||
139 | u16 range; | ||
140 | }; | ||
141 | |||
142 | /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE | | ||
143 | * PTR_TO_MAP_VALUE_OR_NULL | ||
144 | */ | ||
145 | struct bpf_map *map_ptr; | ||
146 | }; | ||
147 | }; | ||
148 | |||
149 | enum bpf_stack_slot_type { | ||
150 | STACK_INVALID, /* nothing was stored in this stack slot */ | ||
151 | STACK_SPILL, /* register spilled into stack */ | ||
152 | STACK_MISC /* BPF program wrote some data into this slot */ | ||
153 | }; | ||
154 | |||
155 | #define BPF_REG_SIZE 8 /* size of eBPF register in bytes */ | ||
156 | |||
157 | /* state of the program: | ||
158 | * type of all registers and stack info | ||
159 | */ | ||
160 | struct verifier_state { | ||
161 | struct reg_state regs[MAX_BPF_REG]; | ||
162 | u8 stack_slot_type[MAX_BPF_STACK]; | ||
163 | struct reg_state spilled_regs[MAX_BPF_STACK / BPF_REG_SIZE]; | ||
164 | }; | ||
165 | |||
166 | /* linked list of verifier states used to prune search */ | ||
167 | struct verifier_state_list { | ||
168 | struct verifier_state state; | ||
169 | struct verifier_state_list *next; | ||
170 | }; | ||
171 | |||
172 | /* verifier_state + insn_idx are pushed to stack when branch is encountered */ | 130 | /* verifier_state + insn_idx are pushed to stack when branch is encountered */ |
173 | struct verifier_stack_elem { | 131 | struct bpf_verifier_stack_elem { |
174 | /* verifer state is 'st' | 132 | /* verifer state is 'st' |
175 | * before processing instruction 'insn_idx' | 133 | * before processing instruction 'insn_idx' |
176 | * and after processing instruction 'prev_insn_idx' | 134 | * and after processing instruction 'prev_insn_idx' |
177 | */ | 135 | */ |
178 | struct verifier_state st; | 136 | struct bpf_verifier_state st; |
179 | int insn_idx; | 137 | int insn_idx; |
180 | int prev_insn_idx; | 138 | int prev_insn_idx; |
181 | struct verifier_stack_elem *next; | 139 | struct bpf_verifier_stack_elem *next; |
182 | }; | ||
183 | |||
184 | #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ | ||
185 | |||
186 | /* single container for all structs | ||
187 | * one verifier_env per bpf_check() call | ||
188 | */ | ||
189 | struct verifier_env { | ||
190 | struct bpf_prog *prog; /* eBPF program being verified */ | ||
191 | struct verifier_stack_elem *head; /* stack of verifier states to be processed */ | ||
192 | int stack_size; /* number of states to be processed */ | ||
193 | struct verifier_state cur_state; /* current verifier state */ | ||
194 | struct verifier_state_list **explored_states; /* search pruning optimization */ | ||
195 | struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ | ||
196 | u32 used_map_cnt; /* number of used maps */ | ||
197 | u32 id_gen; /* used to generate unique reg IDs */ | ||
198 | bool allow_ptr_leaks; | ||
199 | }; | 140 | }; |
200 | 141 | ||
201 | #define BPF_COMPLEXITY_LIMIT_INSNS 65536 | 142 | #define BPF_COMPLEXITY_LIMIT_INSNS 65536 |
@@ -204,6 +145,7 @@ struct verifier_env { | |||
204 | struct bpf_call_arg_meta { | 145 | struct bpf_call_arg_meta { |
205 | struct bpf_map *map_ptr; | 146 | struct bpf_map *map_ptr; |
206 | bool raw_mode; | 147 | bool raw_mode; |
148 | bool pkt_access; | ||
207 | int regno; | 149 | int regno; |
208 | int access_size; | 150 | int access_size; |
209 | }; | 151 | }; |
@@ -240,6 +182,7 @@ static const char * const reg_type_str[] = { | |||
240 | [CONST_PTR_TO_MAP] = "map_ptr", | 182 | [CONST_PTR_TO_MAP] = "map_ptr", |
241 | [PTR_TO_MAP_VALUE] = "map_value", | 183 | [PTR_TO_MAP_VALUE] = "map_value", |
242 | [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null", | 184 | [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null", |
185 | [PTR_TO_MAP_VALUE_ADJ] = "map_value_adj", | ||
243 | [FRAME_PTR] = "fp", | 186 | [FRAME_PTR] = "fp", |
244 | [PTR_TO_STACK] = "fp", | 187 | [PTR_TO_STACK] = "fp", |
245 | [CONST_IMM] = "imm", | 188 | [CONST_IMM] = "imm", |
@@ -247,9 +190,9 @@ static const char * const reg_type_str[] = { | |||
247 | [PTR_TO_PACKET_END] = "pkt_end", | 190 | [PTR_TO_PACKET_END] = "pkt_end", |
248 | }; | 191 | }; |
249 | 192 | ||
250 | static void print_verifier_state(struct verifier_state *state) | 193 | static void print_verifier_state(struct bpf_verifier_state *state) |
251 | { | 194 | { |
252 | struct reg_state *reg; | 195 | struct bpf_reg_state *reg; |
253 | enum bpf_reg_type t; | 196 | enum bpf_reg_type t; |
254 | int i; | 197 | int i; |
255 | 198 | ||
@@ -267,10 +210,17 @@ static void print_verifier_state(struct verifier_state *state) | |||
267 | else if (t == UNKNOWN_VALUE && reg->imm) | 210 | else if (t == UNKNOWN_VALUE && reg->imm) |
268 | verbose("%lld", reg->imm); | 211 | verbose("%lld", reg->imm); |
269 | else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE || | 212 | else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE || |
270 | t == PTR_TO_MAP_VALUE_OR_NULL) | 213 | t == PTR_TO_MAP_VALUE_OR_NULL || |
214 | t == PTR_TO_MAP_VALUE_ADJ) | ||
271 | verbose("(ks=%d,vs=%d)", | 215 | verbose("(ks=%d,vs=%d)", |
272 | reg->map_ptr->key_size, | 216 | reg->map_ptr->key_size, |
273 | reg->map_ptr->value_size); | 217 | reg->map_ptr->value_size); |
218 | if (reg->min_value != BPF_REGISTER_MIN_RANGE) | ||
219 | verbose(",min_value=%lld", | ||
220 | (long long)reg->min_value); | ||
221 | if (reg->max_value != BPF_REGISTER_MAX_RANGE) | ||
222 | verbose(",max_value=%llu", | ||
223 | (unsigned long long)reg->max_value); | ||
274 | } | 224 | } |
275 | for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { | 225 | for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { |
276 | if (state->stack_slot_type[i] == STACK_SPILL) | 226 | if (state->stack_slot_type[i] == STACK_SPILL) |
@@ -425,9 +375,9 @@ static void print_bpf_insn(struct bpf_insn *insn) | |||
425 | } | 375 | } |
426 | } | 376 | } |
427 | 377 | ||
428 | static int pop_stack(struct verifier_env *env, int *prev_insn_idx) | 378 | static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx) |
429 | { | 379 | { |
430 | struct verifier_stack_elem *elem; | 380 | struct bpf_verifier_stack_elem *elem; |
431 | int insn_idx; | 381 | int insn_idx; |
432 | 382 | ||
433 | if (env->head == NULL) | 383 | if (env->head == NULL) |
@@ -444,12 +394,12 @@ static int pop_stack(struct verifier_env *env, int *prev_insn_idx) | |||
444 | return insn_idx; | 394 | return insn_idx; |
445 | } | 395 | } |
446 | 396 | ||
447 | static struct verifier_state *push_stack(struct verifier_env *env, int insn_idx, | 397 | static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, |
448 | int prev_insn_idx) | 398 | int insn_idx, int prev_insn_idx) |
449 | { | 399 | { |
450 | struct verifier_stack_elem *elem; | 400 | struct bpf_verifier_stack_elem *elem; |
451 | 401 | ||
452 | elem = kmalloc(sizeof(struct verifier_stack_elem), GFP_KERNEL); | 402 | elem = kmalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL); |
453 | if (!elem) | 403 | if (!elem) |
454 | goto err; | 404 | goto err; |
455 | 405 | ||
@@ -475,13 +425,15 @@ static const int caller_saved[CALLER_SAVED_REGS] = { | |||
475 | BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 | 425 | BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 |
476 | }; | 426 | }; |
477 | 427 | ||
478 | static void init_reg_state(struct reg_state *regs) | 428 | static void init_reg_state(struct bpf_reg_state *regs) |
479 | { | 429 | { |
480 | int i; | 430 | int i; |
481 | 431 | ||
482 | for (i = 0; i < MAX_BPF_REG; i++) { | 432 | for (i = 0; i < MAX_BPF_REG; i++) { |
483 | regs[i].type = NOT_INIT; | 433 | regs[i].type = NOT_INIT; |
484 | regs[i].imm = 0; | 434 | regs[i].imm = 0; |
435 | regs[i].min_value = BPF_REGISTER_MIN_RANGE; | ||
436 | regs[i].max_value = BPF_REGISTER_MAX_RANGE; | ||
485 | } | 437 | } |
486 | 438 | ||
487 | /* frame pointer */ | 439 | /* frame pointer */ |
@@ -491,20 +443,26 @@ static void init_reg_state(struct reg_state *regs) | |||
491 | regs[BPF_REG_1].type = PTR_TO_CTX; | 443 | regs[BPF_REG_1].type = PTR_TO_CTX; |
492 | } | 444 | } |
493 | 445 | ||
494 | static void mark_reg_unknown_value(struct reg_state *regs, u32 regno) | 446 | static void mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno) |
495 | { | 447 | { |
496 | BUG_ON(regno >= MAX_BPF_REG); | 448 | BUG_ON(regno >= MAX_BPF_REG); |
497 | regs[regno].type = UNKNOWN_VALUE; | 449 | regs[regno].type = UNKNOWN_VALUE; |
498 | regs[regno].imm = 0; | 450 | regs[regno].imm = 0; |
499 | } | 451 | } |
500 | 452 | ||
453 | static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno) | ||
454 | { | ||
455 | regs[regno].min_value = BPF_REGISTER_MIN_RANGE; | ||
456 | regs[regno].max_value = BPF_REGISTER_MAX_RANGE; | ||
457 | } | ||
458 | |||
501 | enum reg_arg_type { | 459 | enum reg_arg_type { |
502 | SRC_OP, /* register is used as source operand */ | 460 | SRC_OP, /* register is used as source operand */ |
503 | DST_OP, /* register is used as destination operand */ | 461 | DST_OP, /* register is used as destination operand */ |
504 | DST_OP_NO_MARK /* same as above, check only, don't mark */ | 462 | DST_OP_NO_MARK /* same as above, check only, don't mark */ |
505 | }; | 463 | }; |
506 | 464 | ||
507 | static int check_reg_arg(struct reg_state *regs, u32 regno, | 465 | static int check_reg_arg(struct bpf_reg_state *regs, u32 regno, |
508 | enum reg_arg_type t) | 466 | enum reg_arg_type t) |
509 | { | 467 | { |
510 | if (regno >= MAX_BPF_REG) { | 468 | if (regno >= MAX_BPF_REG) { |
@@ -564,8 +522,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type) | |||
564 | /* check_stack_read/write functions track spill/fill of registers, | 522 | /* check_stack_read/write functions track spill/fill of registers, |
565 | * stack boundary and alignment are checked in check_mem_access() | 523 | * stack boundary and alignment are checked in check_mem_access() |
566 | */ | 524 | */ |
567 | static int check_stack_write(struct verifier_state *state, int off, int size, | 525 | static int check_stack_write(struct bpf_verifier_state *state, int off, |
568 | int value_regno) | 526 | int size, int value_regno) |
569 | { | 527 | { |
570 | int i; | 528 | int i; |
571 | /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, | 529 | /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, |
@@ -590,7 +548,7 @@ static int check_stack_write(struct verifier_state *state, int off, int size, | |||
590 | } else { | 548 | } else { |
591 | /* regular write of data into stack */ | 549 | /* regular write of data into stack */ |
592 | state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] = | 550 | state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] = |
593 | (struct reg_state) {}; | 551 | (struct bpf_reg_state) {}; |
594 | 552 | ||
595 | for (i = 0; i < size; i++) | 553 | for (i = 0; i < size; i++) |
596 | state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC; | 554 | state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC; |
@@ -598,7 +556,7 @@ static int check_stack_write(struct verifier_state *state, int off, int size, | |||
598 | return 0; | 556 | return 0; |
599 | } | 557 | } |
600 | 558 | ||
601 | static int check_stack_read(struct verifier_state *state, int off, int size, | 559 | static int check_stack_read(struct bpf_verifier_state *state, int off, int size, |
602 | int value_regno) | 560 | int value_regno) |
603 | { | 561 | { |
604 | u8 *slot_type; | 562 | u8 *slot_type; |
@@ -639,7 +597,7 @@ static int check_stack_read(struct verifier_state *state, int off, int size, | |||
639 | } | 597 | } |
640 | 598 | ||
641 | /* check read/write into map element returned by bpf_map_lookup_elem() */ | 599 | /* check read/write into map element returned by bpf_map_lookup_elem() */ |
642 | static int check_map_access(struct verifier_env *env, u32 regno, int off, | 600 | static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off, |
643 | int size) | 601 | int size) |
644 | { | 602 | { |
645 | struct bpf_map *map = env->cur_state.regs[regno].map_ptr; | 603 | struct bpf_map *map = env->cur_state.regs[regno].map_ptr; |
@@ -654,24 +612,31 @@ static int check_map_access(struct verifier_env *env, u32 regno, int off, | |||
654 | 612 | ||
655 | #define MAX_PACKET_OFF 0xffff | 613 | #define MAX_PACKET_OFF 0xffff |
656 | 614 | ||
657 | static bool may_write_pkt_data(enum bpf_prog_type type) | 615 | static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, |
616 | const struct bpf_call_arg_meta *meta) | ||
658 | { | 617 | { |
659 | switch (type) { | 618 | switch (env->prog->type) { |
619 | case BPF_PROG_TYPE_SCHED_CLS: | ||
620 | case BPF_PROG_TYPE_SCHED_ACT: | ||
660 | case BPF_PROG_TYPE_XDP: | 621 | case BPF_PROG_TYPE_XDP: |
622 | if (meta) | ||
623 | return meta->pkt_access; | ||
624 | |||
625 | env->seen_direct_write = true; | ||
661 | return true; | 626 | return true; |
662 | default: | 627 | default: |
663 | return false; | 628 | return false; |
664 | } | 629 | } |
665 | } | 630 | } |
666 | 631 | ||
667 | static int check_packet_access(struct verifier_env *env, u32 regno, int off, | 632 | static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, |
668 | int size) | 633 | int size) |
669 | { | 634 | { |
670 | struct reg_state *regs = env->cur_state.regs; | 635 | struct bpf_reg_state *regs = env->cur_state.regs; |
671 | struct reg_state *reg = ®s[regno]; | 636 | struct bpf_reg_state *reg = ®s[regno]; |
672 | 637 | ||
673 | off += reg->off; | 638 | off += reg->off; |
674 | if (off < 0 || off + size > reg->range) { | 639 | if (off < 0 || size <= 0 || off + size > reg->range) { |
675 | verbose("invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", | 640 | verbose("invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", |
676 | off, size, regno, reg->id, reg->off, reg->range); | 641 | off, size, regno, reg->id, reg->off, reg->range); |
677 | return -EACCES; | 642 | return -EACCES; |
@@ -680,9 +645,13 @@ static int check_packet_access(struct verifier_env *env, u32 regno, int off, | |||
680 | } | 645 | } |
681 | 646 | ||
682 | /* check access to 'struct bpf_context' fields */ | 647 | /* check access to 'struct bpf_context' fields */ |
683 | static int check_ctx_access(struct verifier_env *env, int off, int size, | 648 | static int check_ctx_access(struct bpf_verifier_env *env, int off, int size, |
684 | enum bpf_access_type t, enum bpf_reg_type *reg_type) | 649 | enum bpf_access_type t, enum bpf_reg_type *reg_type) |
685 | { | 650 | { |
651 | /* for analyzer ctx accesses are already validated and converted */ | ||
652 | if (env->analyzer_ops) | ||
653 | return 0; | ||
654 | |||
686 | if (env->prog->aux->ops->is_valid_access && | 655 | if (env->prog->aux->ops->is_valid_access && |
687 | env->prog->aux->ops->is_valid_access(off, size, t, reg_type)) { | 656 | env->prog->aux->ops->is_valid_access(off, size, t, reg_type)) { |
688 | /* remember the offset of last byte accessed in ctx */ | 657 | /* remember the offset of last byte accessed in ctx */ |
@@ -695,7 +664,7 @@ static int check_ctx_access(struct verifier_env *env, int off, int size, | |||
695 | return -EACCES; | 664 | return -EACCES; |
696 | } | 665 | } |
697 | 666 | ||
698 | static bool is_pointer_value(struct verifier_env *env, int regno) | 667 | static bool is_pointer_value(struct bpf_verifier_env *env, int regno) |
699 | { | 668 | { |
700 | if (env->allow_ptr_leaks) | 669 | if (env->allow_ptr_leaks) |
701 | return false; | 670 | return false; |
@@ -709,28 +678,19 @@ static bool is_pointer_value(struct verifier_env *env, int regno) | |||
709 | } | 678 | } |
710 | } | 679 | } |
711 | 680 | ||
712 | static int check_ptr_alignment(struct verifier_env *env, struct reg_state *reg, | 681 | static int check_ptr_alignment(struct bpf_verifier_env *env, |
713 | int off, int size) | 682 | struct bpf_reg_state *reg, int off, int size) |
714 | { | 683 | { |
715 | if (reg->type != PTR_TO_PACKET) { | 684 | if (reg->type != PTR_TO_PACKET && reg->type != PTR_TO_MAP_VALUE_ADJ) { |
716 | if (off % size != 0) { | 685 | if (off % size != 0) { |
717 | verbose("misaligned access off %d size %d\n", off, size); | 686 | verbose("misaligned access off %d size %d\n", |
687 | off, size); | ||
718 | return -EACCES; | 688 | return -EACCES; |
719 | } else { | 689 | } else { |
720 | return 0; | 690 | return 0; |
721 | } | 691 | } |
722 | } | 692 | } |
723 | 693 | ||
724 | switch (env->prog->type) { | ||
725 | case BPF_PROG_TYPE_SCHED_CLS: | ||
726 | case BPF_PROG_TYPE_SCHED_ACT: | ||
727 | case BPF_PROG_TYPE_XDP: | ||
728 | break; | ||
729 | default: | ||
730 | verbose("verifier is misconfigured\n"); | ||
731 | return -EACCES; | ||
732 | } | ||
733 | |||
734 | if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) | 694 | if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) |
735 | /* misaligned access to packet is ok on x86,arm,arm64 */ | 695 | /* misaligned access to packet is ok on x86,arm,arm64 */ |
736 | return 0; | 696 | return 0; |
@@ -741,7 +701,8 @@ static int check_ptr_alignment(struct verifier_env *env, struct reg_state *reg, | |||
741 | } | 701 | } |
742 | 702 | ||
743 | /* skb->data is NET_IP_ALIGN-ed */ | 703 | /* skb->data is NET_IP_ALIGN-ed */ |
744 | if ((NET_IP_ALIGN + reg->off + off) % size != 0) { | 704 | if (reg->type == PTR_TO_PACKET && |
705 | (NET_IP_ALIGN + reg->off + off) % size != 0) { | ||
745 | verbose("misaligned packet access off %d+%d+%d size %d\n", | 706 | verbose("misaligned packet access off %d+%d+%d size %d\n", |
746 | NET_IP_ALIGN, reg->off, off, size); | 707 | NET_IP_ALIGN, reg->off, off, size); |
747 | return -EACCES; | 708 | return -EACCES; |
@@ -755,12 +716,12 @@ static int check_ptr_alignment(struct verifier_env *env, struct reg_state *reg, | |||
755 | * if t==write && value_regno==-1, some unknown value is stored into memory | 716 | * if t==write && value_regno==-1, some unknown value is stored into memory |
756 | * if t==read && value_regno==-1, don't care what we read from memory | 717 | * if t==read && value_regno==-1, don't care what we read from memory |
757 | */ | 718 | */ |
758 | static int check_mem_access(struct verifier_env *env, u32 regno, int off, | 719 | static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, |
759 | int bpf_size, enum bpf_access_type t, | 720 | int bpf_size, enum bpf_access_type t, |
760 | int value_regno) | 721 | int value_regno) |
761 | { | 722 | { |
762 | struct verifier_state *state = &env->cur_state; | 723 | struct bpf_verifier_state *state = &env->cur_state; |
763 | struct reg_state *reg = &state->regs[regno]; | 724 | struct bpf_reg_state *reg = &state->regs[regno]; |
764 | int size, err = 0; | 725 | int size, err = 0; |
765 | 726 | ||
766 | if (reg->type == PTR_TO_STACK) | 727 | if (reg->type == PTR_TO_STACK) |
@@ -774,12 +735,52 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, | |||
774 | if (err) | 735 | if (err) |
775 | return err; | 736 | return err; |
776 | 737 | ||
777 | if (reg->type == PTR_TO_MAP_VALUE) { | 738 | if (reg->type == PTR_TO_MAP_VALUE || |
739 | reg->type == PTR_TO_MAP_VALUE_ADJ) { | ||
778 | if (t == BPF_WRITE && value_regno >= 0 && | 740 | if (t == BPF_WRITE && value_regno >= 0 && |
779 | is_pointer_value(env, value_regno)) { | 741 | is_pointer_value(env, value_regno)) { |
780 | verbose("R%d leaks addr into map\n", value_regno); | 742 | verbose("R%d leaks addr into map\n", value_regno); |
781 | return -EACCES; | 743 | return -EACCES; |
782 | } | 744 | } |
745 | |||
746 | /* If we adjusted the register to this map value at all then we | ||
747 | * need to change off and size to min_value and max_value | ||
748 | * respectively to make sure our theoretical access will be | ||
749 | * safe. | ||
750 | */ | ||
751 | if (reg->type == PTR_TO_MAP_VALUE_ADJ) { | ||
752 | if (log_level) | ||
753 | print_verifier_state(state); | ||
754 | env->varlen_map_value_access = true; | ||
755 | /* The minimum value is only important with signed | ||
756 | * comparisons where we can't assume the floor of a | ||
757 | * value is 0. If we are using signed variables for our | ||
758 | * index'es we need to make sure that whatever we use | ||
759 | * will have a set floor within our range. | ||
760 | */ | ||
761 | if (reg->min_value < 0) { | ||
762 | verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", | ||
763 | regno); | ||
764 | return -EACCES; | ||
765 | } | ||
766 | err = check_map_access(env, regno, reg->min_value + off, | ||
767 | size); | ||
768 | if (err) { | ||
769 | verbose("R%d min value is outside of the array range\n", | ||
770 | regno); | ||
771 | return err; | ||
772 | } | ||
773 | |||
774 | /* If we haven't set a max value then we need to bail | ||
775 | * since we can't be sure we won't do bad things. | ||
776 | */ | ||
777 | if (reg->max_value == BPF_REGISTER_MAX_RANGE) { | ||
778 | verbose("R%d unbounded memory access, make sure to bounds check any array access into a map\n", | ||
779 | regno); | ||
780 | return -EACCES; | ||
781 | } | ||
782 | off += reg->max_value; | ||
783 | } | ||
783 | err = check_map_access(env, regno, off, size); | 784 | err = check_map_access(env, regno, off, size); |
784 | if (!err && t == BPF_READ && value_regno >= 0) | 785 | if (!err && t == BPF_READ && value_regno >= 0) |
785 | mark_reg_unknown_value(state->regs, value_regno); | 786 | mark_reg_unknown_value(state->regs, value_regno); |
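The PTR_TO_MAP_VALUE_ADJ branch added above re-checks the access against the [min_value, max_value] range tracked for the adjusted pointer, and its error strings spell out what a program has to do: keep the index non-negative and give it a known upper bound before using it as an offset into a map value. An illustrative fragment of the kind of access this lets the verifier accept (not from this patch; names and sizes are made up):

    /* Illustrative only: a variable index into a map value is acceptable once
     * it has been bounds-checked, because idx then has min_value >= 0 (it is
     * unsigned) and a known max_value from the comparison below.
     */
    #define SKETCH_NR_SLOTS 64

    struct sketch_value {
            unsigned int slots[SKETCH_NR_SLOTS];
    };

    static unsigned int sketch_read_slot(struct sketch_value *val,
                                         unsigned int idx)
    {
            if (!val)
                    return 0;
            if (idx >= SKETCH_NR_SLOTS)     /* establishes the upper bound */
                    return 0;
            return val->slots[idx];
    }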
@@ -795,9 +796,8 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, | |||
795 | err = check_ctx_access(env, off, size, t, ®_type); | 796 | err = check_ctx_access(env, off, size, t, ®_type); |
796 | if (!err && t == BPF_READ && value_regno >= 0) { | 797 | if (!err && t == BPF_READ && value_regno >= 0) { |
797 | mark_reg_unknown_value(state->regs, value_regno); | 798 | mark_reg_unknown_value(state->regs, value_regno); |
798 | if (env->allow_ptr_leaks) | 799 | /* note that reg.[id|off|range] == 0 */ |
799 | /* note that reg.[id|off|range] == 0 */ | 800 | state->regs[value_regno].type = reg_type; |
800 | state->regs[value_regno].type = reg_type; | ||
801 | } | 801 | } |
802 | 802 | ||
803 | } else if (reg->type == FRAME_PTR || reg->type == PTR_TO_STACK) { | 803 | } else if (reg->type == FRAME_PTR || reg->type == PTR_TO_STACK) { |
@@ -817,7 +817,7 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, | |||
817 | err = check_stack_read(state, off, size, value_regno); | 817 | err = check_stack_read(state, off, size, value_regno); |
818 | } | 818 | } |
819 | } else if (state->regs[regno].type == PTR_TO_PACKET) { | 819 | } else if (state->regs[regno].type == PTR_TO_PACKET) { |
820 | if (t == BPF_WRITE && !may_write_pkt_data(env->prog->type)) { | 820 | if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL)) { |
821 | verbose("cannot write into packet\n"); | 821 | verbose("cannot write into packet\n"); |
822 | return -EACCES; | 822 | return -EACCES; |
823 | } | 823 | } |
@@ -846,9 +846,9 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, | |||
846 | return err; | 846 | return err; |
847 | } | 847 | } |
848 | 848 | ||
849 | static int check_xadd(struct verifier_env *env, struct bpf_insn *insn) | 849 | static int check_xadd(struct bpf_verifier_env *env, struct bpf_insn *insn) |
850 | { | 850 | { |
851 | struct reg_state *regs = env->cur_state.regs; | 851 | struct bpf_reg_state *regs = env->cur_state.regs; |
852 | int err; | 852 | int err; |
853 | 853 | ||
854 | if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) || | 854 | if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) || |
@@ -882,12 +882,12 @@ static int check_xadd(struct verifier_env *env, struct bpf_insn *insn) | |||
882 | * bytes from that pointer, make sure that it's within stack boundary | 882 | * bytes from that pointer, make sure that it's within stack boundary |
883 | * and all elements of stack are initialized | 883 | * and all elements of stack are initialized |
884 | */ | 884 | */ |
885 | static int check_stack_boundary(struct verifier_env *env, int regno, | 885 | static int check_stack_boundary(struct bpf_verifier_env *env, int regno, |
886 | int access_size, bool zero_size_allowed, | 886 | int access_size, bool zero_size_allowed, |
887 | struct bpf_call_arg_meta *meta) | 887 | struct bpf_call_arg_meta *meta) |
888 | { | 888 | { |
889 | struct verifier_state *state = &env->cur_state; | 889 | struct bpf_verifier_state *state = &env->cur_state; |
890 | struct reg_state *regs = state->regs; | 890 | struct bpf_reg_state *regs = state->regs; |
891 | int off, i; | 891 | int off, i; |
892 | 892 | ||
893 | if (regs[regno].type != PTR_TO_STACK) { | 893 | if (regs[regno].type != PTR_TO_STACK) { |
@@ -926,18 +926,18 @@ static int check_stack_boundary(struct verifier_env *env, int regno, | |||
926 | return 0; | 926 | return 0; |
927 | } | 927 | } |
928 | 928 | ||
929 | static int check_func_arg(struct verifier_env *env, u32 regno, | 929 | static int check_func_arg(struct bpf_verifier_env *env, u32 regno, |
930 | enum bpf_arg_type arg_type, | 930 | enum bpf_arg_type arg_type, |
931 | struct bpf_call_arg_meta *meta) | 931 | struct bpf_call_arg_meta *meta) |
932 | { | 932 | { |
933 | struct reg_state *reg = env->cur_state.regs + regno; | 933 | struct bpf_reg_state *regs = env->cur_state.regs, *reg = ®s[regno]; |
934 | enum bpf_reg_type expected_type; | 934 | enum bpf_reg_type expected_type, type = reg->type; |
935 | int err = 0; | 935 | int err = 0; |
936 | 936 | ||
937 | if (arg_type == ARG_DONTCARE) | 937 | if (arg_type == ARG_DONTCARE) |
938 | return 0; | 938 | return 0; |
939 | 939 | ||
940 | if (reg->type == NOT_INIT) { | 940 | if (type == NOT_INIT) { |
941 | verbose("R%d !read_ok\n", regno); | 941 | verbose("R%d !read_ok\n", regno); |
942 | return -EACCES; | 942 | return -EACCES; |
943 | } | 943 | } |
@@ -950,16 +950,29 @@ static int check_func_arg(struct verifier_env *env, u32 regno, | |||
950 | return 0; | 950 | return 0; |
951 | } | 951 | } |
952 | 952 | ||
953 | if (type == PTR_TO_PACKET && !may_access_direct_pkt_data(env, meta)) { | ||
954 | verbose("helper access to the packet is not allowed\n"); | ||
955 | return -EACCES; | ||
956 | } | ||
957 | |||
953 | if (arg_type == ARG_PTR_TO_MAP_KEY || | 958 | if (arg_type == ARG_PTR_TO_MAP_KEY || |
954 | arg_type == ARG_PTR_TO_MAP_VALUE) { | 959 | arg_type == ARG_PTR_TO_MAP_VALUE) { |
955 | expected_type = PTR_TO_STACK; | 960 | expected_type = PTR_TO_STACK; |
961 | if (type != PTR_TO_PACKET && type != expected_type) | ||
962 | goto err_type; | ||
956 | } else if (arg_type == ARG_CONST_STACK_SIZE || | 963 | } else if (arg_type == ARG_CONST_STACK_SIZE || |
957 | arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) { | 964 | arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) { |
958 | expected_type = CONST_IMM; | 965 | expected_type = CONST_IMM; |
966 | if (type != expected_type) | ||
967 | goto err_type; | ||
959 | } else if (arg_type == ARG_CONST_MAP_PTR) { | 968 | } else if (arg_type == ARG_CONST_MAP_PTR) { |
960 | expected_type = CONST_PTR_TO_MAP; | 969 | expected_type = CONST_PTR_TO_MAP; |
970 | if (type != expected_type) | ||
971 | goto err_type; | ||
961 | } else if (arg_type == ARG_PTR_TO_CTX) { | 972 | } else if (arg_type == ARG_PTR_TO_CTX) { |
962 | expected_type = PTR_TO_CTX; | 973 | expected_type = PTR_TO_CTX; |
974 | if (type != expected_type) | ||
975 | goto err_type; | ||
963 | } else if (arg_type == ARG_PTR_TO_STACK || | 976 | } else if (arg_type == ARG_PTR_TO_STACK || |
964 | arg_type == ARG_PTR_TO_RAW_STACK) { | 977 | arg_type == ARG_PTR_TO_RAW_STACK) { |
965 | expected_type = PTR_TO_STACK; | 978 | expected_type = PTR_TO_STACK; |
@@ -967,20 +980,16 @@ static int check_func_arg(struct verifier_env *env, u32 regno, | |||
967 | * passed in as argument, it's a CONST_IMM type. Final test | 980 | * passed in as argument, it's a CONST_IMM type. Final test |
968 | * happens during stack boundary checking. | 981 | * happens during stack boundary checking. |
969 | */ | 982 | */ |
970 | if (reg->type == CONST_IMM && reg->imm == 0) | 983 | if (type == CONST_IMM && reg->imm == 0) |
971 | expected_type = CONST_IMM; | 984 | /* final test in check_stack_boundary() */; |
985 | else if (type != PTR_TO_PACKET && type != expected_type) | ||
986 | goto err_type; | ||
972 | meta->raw_mode = arg_type == ARG_PTR_TO_RAW_STACK; | 987 | meta->raw_mode = arg_type == ARG_PTR_TO_RAW_STACK; |
973 | } else { | 988 | } else { |
974 | verbose("unsupported arg_type %d\n", arg_type); | 989 | verbose("unsupported arg_type %d\n", arg_type); |
975 | return -EFAULT; | 990 | return -EFAULT; |
976 | } | 991 | } |
977 | 992 | ||
978 | if (reg->type != expected_type) { | ||
979 | verbose("R%d type=%s expected=%s\n", regno, | ||
980 | reg_type_str[reg->type], reg_type_str[expected_type]); | ||
981 | return -EACCES; | ||
982 | } | ||
983 | |||
984 | if (arg_type == ARG_CONST_MAP_PTR) { | 993 | if (arg_type == ARG_CONST_MAP_PTR) { |
985 | /* bpf_map_xxx(map_ptr) call: remember that map_ptr */ | 994 | /* bpf_map_xxx(map_ptr) call: remember that map_ptr */ |
986 | meta->map_ptr = reg->map_ptr; | 995 | meta->map_ptr = reg->map_ptr; |
@@ -998,8 +1007,13 @@ static int check_func_arg(struct verifier_env *env, u32 regno, | |||
998 | verbose("invalid map_ptr to access map->key\n"); | 1007 | verbose("invalid map_ptr to access map->key\n"); |
999 | return -EACCES; | 1008 | return -EACCES; |
1000 | } | 1009 | } |
1001 | err = check_stack_boundary(env, regno, meta->map_ptr->key_size, | 1010 | if (type == PTR_TO_PACKET) |
1002 | false, NULL); | 1011 | err = check_packet_access(env, regno, 0, |
1012 | meta->map_ptr->key_size); | ||
1013 | else | ||
1014 | err = check_stack_boundary(env, regno, | ||
1015 | meta->map_ptr->key_size, | ||
1016 | false, NULL); | ||
1003 | } else if (arg_type == ARG_PTR_TO_MAP_VALUE) { | 1017 | } else if (arg_type == ARG_PTR_TO_MAP_VALUE) { |
1004 | /* bpf_map_xxx(..., map_ptr, ..., value) call: | 1018 | /* bpf_map_xxx(..., map_ptr, ..., value) call: |
1005 | * check [value, value + map->value_size) validity | 1019 | * check [value, value + map->value_size) validity |
@@ -1009,9 +1023,13 @@ static int check_func_arg(struct verifier_env *env, u32 regno, | |||
1009 | verbose("invalid map_ptr to access map->value\n"); | 1023 | verbose("invalid map_ptr to access map->value\n"); |
1010 | return -EACCES; | 1024 | return -EACCES; |
1011 | } | 1025 | } |
1012 | err = check_stack_boundary(env, regno, | 1026 | if (type == PTR_TO_PACKET) |
1013 | meta->map_ptr->value_size, | 1027 | err = check_packet_access(env, regno, 0, |
1014 | false, NULL); | 1028 | meta->map_ptr->value_size); |
1029 | else | ||
1030 | err = check_stack_boundary(env, regno, | ||
1031 | meta->map_ptr->value_size, | ||
1032 | false, NULL); | ||
1015 | } else if (arg_type == ARG_CONST_STACK_SIZE || | 1033 | } else if (arg_type == ARG_CONST_STACK_SIZE || |
1016 | arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) { | 1034 | arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) { |
1017 | bool zero_size_allowed = (arg_type == ARG_CONST_STACK_SIZE_OR_ZERO); | 1035 | bool zero_size_allowed = (arg_type == ARG_CONST_STACK_SIZE_OR_ZERO); |
@@ -1025,11 +1043,18 @@ static int check_func_arg(struct verifier_env *env, u32 regno, | |||
1025 | verbose("ARG_CONST_STACK_SIZE cannot be first argument\n"); | 1043 | verbose("ARG_CONST_STACK_SIZE cannot be first argument\n"); |
1026 | return -EACCES; | 1044 | return -EACCES; |
1027 | } | 1045 | } |
1028 | err = check_stack_boundary(env, regno - 1, reg->imm, | 1046 | if (regs[regno - 1].type == PTR_TO_PACKET) |
1029 | zero_size_allowed, meta); | 1047 | err = check_packet_access(env, regno - 1, 0, reg->imm); |
1048 | else | ||
1049 | err = check_stack_boundary(env, regno - 1, reg->imm, | ||
1050 | zero_size_allowed, meta); | ||
1030 | } | 1051 | } |
1031 | 1052 | ||
1032 | return err; | 1053 | return err; |
1054 | err_type: | ||
1055 | verbose("R%d type=%s expected=%s\n", regno, | ||
1056 | reg_type_str[type], reg_type_str[expected_type]); | ||
1057 | return -EACCES; | ||
1033 | } | 1058 | } |
1034 | 1059 | ||
1035 | static int check_map_func_compatibility(struct bpf_map *map, int func_id) | 1060 | static int check_map_func_compatibility(struct bpf_map *map, int func_id) |
@@ -1053,7 +1078,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) | |||
1053 | goto error; | 1078 | goto error; |
1054 | break; | 1079 | break; |
1055 | case BPF_MAP_TYPE_CGROUP_ARRAY: | 1080 | case BPF_MAP_TYPE_CGROUP_ARRAY: |
1056 | if (func_id != BPF_FUNC_skb_under_cgroup) | 1081 | if (func_id != BPF_FUNC_skb_under_cgroup && |
1082 | func_id != BPF_FUNC_current_task_under_cgroup) | ||
1057 | goto error; | 1083 | goto error; |
1058 | break; | 1084 | break; |
1059 | default: | 1085 | default: |
@@ -1075,6 +1101,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) | |||
1075 | if (map->map_type != BPF_MAP_TYPE_STACK_TRACE) | 1101 | if (map->map_type != BPF_MAP_TYPE_STACK_TRACE) |
1076 | goto error; | 1102 | goto error; |
1077 | break; | 1103 | break; |
1104 | case BPF_FUNC_current_task_under_cgroup: | ||
1078 | case BPF_FUNC_skb_under_cgroup: | 1105 | case BPF_FUNC_skb_under_cgroup: |
1079 | if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY) | 1106 | if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY) |
1080 | goto error; | 1107 | goto error; |
@@ -1108,10 +1135,10 @@ static int check_raw_mode(const struct bpf_func_proto *fn) | |||
1108 | return count > 1 ? -EINVAL : 0; | 1135 | return count > 1 ? -EINVAL : 0; |
1109 | } | 1136 | } |
1110 | 1137 | ||
1111 | static void clear_all_pkt_pointers(struct verifier_env *env) | 1138 | static void clear_all_pkt_pointers(struct bpf_verifier_env *env) |
1112 | { | 1139 | { |
1113 | struct verifier_state *state = &env->cur_state; | 1140 | struct bpf_verifier_state *state = &env->cur_state; |
1114 | struct reg_state *regs = state->regs, *reg; | 1141 | struct bpf_reg_state *regs = state->regs, *reg; |
1115 | int i; | 1142 | int i; |
1116 | 1143 | ||
1117 | for (i = 0; i < MAX_BPF_REG; i++) | 1144 | for (i = 0; i < MAX_BPF_REG; i++) |
@@ -1131,12 +1158,12 @@ static void clear_all_pkt_pointers(struct verifier_env *env) | |||
1131 | } | 1158 | } |
1132 | } | 1159 | } |
1133 | 1160 | ||
1134 | static int check_call(struct verifier_env *env, int func_id) | 1161 | static int check_call(struct bpf_verifier_env *env, int func_id) |
1135 | { | 1162 | { |
1136 | struct verifier_state *state = &env->cur_state; | 1163 | struct bpf_verifier_state *state = &env->cur_state; |
1137 | const struct bpf_func_proto *fn = NULL; | 1164 | const struct bpf_func_proto *fn = NULL; |
1138 | struct reg_state *regs = state->regs; | 1165 | struct bpf_reg_state *regs = state->regs; |
1139 | struct reg_state *reg; | 1166 | struct bpf_reg_state *reg; |
1140 | struct bpf_call_arg_meta meta; | 1167 | struct bpf_call_arg_meta meta; |
1141 | bool changes_data; | 1168 | bool changes_data; |
1142 | int i, err; | 1169 | int i, err; |
@@ -1164,6 +1191,7 @@ static int check_call(struct verifier_env *env, int func_id) | |||
1164 | changes_data = bpf_helper_changes_skb_data(fn->func); | 1191 | changes_data = bpf_helper_changes_skb_data(fn->func); |
1165 | 1192 | ||
1166 | memset(&meta, 0, sizeof(meta)); | 1193 | memset(&meta, 0, sizeof(meta)); |
1194 | meta.pkt_access = fn->pkt_access; | ||
1167 | 1195 | ||
1168 | /* We only support one arg being in raw mode at the moment, which | 1196 | /* We only support one arg being in raw mode at the moment, which |
1169 | * is sufficient for the helper functions we have right now. | 1197 | * is sufficient for the helper functions we have right now. |
@@ -1214,6 +1242,7 @@ static int check_call(struct verifier_env *env, int func_id) | |||
1214 | regs[BPF_REG_0].type = NOT_INIT; | 1242 | regs[BPF_REG_0].type = NOT_INIT; |
1215 | } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) { | 1243 | } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) { |
1216 | regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; | 1244 | regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; |
1245 | regs[BPF_REG_0].max_value = regs[BPF_REG_0].min_value = 0; | ||
1217 | /* remember map_ptr, so that check_map_access() | 1246 | /* remember map_ptr, so that check_map_access() |
1218 | * can check 'value_size' boundary of memory access | 1247 | * can check 'value_size' boundary of memory access |
1219 | * to map element returned from bpf_map_lookup_elem() | 1248 | * to map element returned from bpf_map_lookup_elem() |
@@ -1238,12 +1267,13 @@ static int check_call(struct verifier_env *env, int func_id) | |||
1238 | return 0; | 1267 | return 0; |
1239 | } | 1268 | } |
1240 | 1269 | ||
1241 | static int check_packet_ptr_add(struct verifier_env *env, struct bpf_insn *insn) | 1270 | static int check_packet_ptr_add(struct bpf_verifier_env *env, |
1271 | struct bpf_insn *insn) | ||
1242 | { | 1272 | { |
1243 | struct reg_state *regs = env->cur_state.regs; | 1273 | struct bpf_reg_state *regs = env->cur_state.regs; |
1244 | struct reg_state *dst_reg = ®s[insn->dst_reg]; | 1274 | struct bpf_reg_state *dst_reg = ®s[insn->dst_reg]; |
1245 | struct reg_state *src_reg = ®s[insn->src_reg]; | 1275 | struct bpf_reg_state *src_reg = ®s[insn->src_reg]; |
1246 | struct reg_state tmp_reg; | 1276 | struct bpf_reg_state tmp_reg; |
1247 | s32 imm; | 1277 | s32 imm; |
1248 | 1278 | ||
1249 | if (BPF_SRC(insn->code) == BPF_K) { | 1279 | if (BPF_SRC(insn->code) == BPF_K) { |
@@ -1311,10 +1341,10 @@ add_imm: | |||
1311 | return 0; | 1341 | return 0; |
1312 | } | 1342 | } |
1313 | 1343 | ||
1314 | static int evaluate_reg_alu(struct verifier_env *env, struct bpf_insn *insn) | 1344 | static int evaluate_reg_alu(struct bpf_verifier_env *env, struct bpf_insn *insn) |
1315 | { | 1345 | { |
1316 | struct reg_state *regs = env->cur_state.regs; | 1346 | struct bpf_reg_state *regs = env->cur_state.regs; |
1317 | struct reg_state *dst_reg = ®s[insn->dst_reg]; | 1347 | struct bpf_reg_state *dst_reg = ®s[insn->dst_reg]; |
1318 | u8 opcode = BPF_OP(insn->code); | 1348 | u8 opcode = BPF_OP(insn->code); |
1319 | s64 imm_log2; | 1349 | s64 imm_log2; |
1320 | 1350 | ||
@@ -1324,7 +1354,7 @@ static int evaluate_reg_alu(struct verifier_env *env, struct bpf_insn *insn) | |||
1324 | */ | 1354 | */ |
1325 | 1355 | ||
1326 | if (BPF_SRC(insn->code) == BPF_X) { | 1356 | if (BPF_SRC(insn->code) == BPF_X) { |
1327 | struct reg_state *src_reg = ®s[insn->src_reg]; | 1357 | struct bpf_reg_state *src_reg = ®s[insn->src_reg]; |
1328 | 1358 | ||
1329 | if (src_reg->type == UNKNOWN_VALUE && src_reg->imm > 0 && | 1359 | if (src_reg->type == UNKNOWN_VALUE && src_reg->imm > 0 && |
1330 | dst_reg->imm && opcode == BPF_ADD) { | 1360 | dst_reg->imm && opcode == BPF_ADD) { |
@@ -1413,11 +1443,12 @@ static int evaluate_reg_alu(struct verifier_env *env, struct bpf_insn *insn) | |||
1413 | return 0; | 1443 | return 0; |
1414 | } | 1444 | } |
1415 | 1445 | ||
1416 | static int evaluate_reg_imm_alu(struct verifier_env *env, struct bpf_insn *insn) | 1446 | static int evaluate_reg_imm_alu(struct bpf_verifier_env *env, |
1447 | struct bpf_insn *insn) | ||
1417 | { | 1448 | { |
1418 | struct reg_state *regs = env->cur_state.regs; | 1449 | struct bpf_reg_state *regs = env->cur_state.regs; |
1419 | struct reg_state *dst_reg = ®s[insn->dst_reg]; | 1450 | struct bpf_reg_state *dst_reg = ®s[insn->dst_reg]; |
1420 | struct reg_state *src_reg = ®s[insn->src_reg]; | 1451 | struct bpf_reg_state *src_reg = ®s[insn->src_reg]; |
1421 | u8 opcode = BPF_OP(insn->code); | 1452 | u8 opcode = BPF_OP(insn->code); |
1422 | 1453 | ||
1423 | /* dst_reg->type == CONST_IMM here, simulate execution of 'add' insn. | 1454 | /* dst_reg->type == CONST_IMM here, simulate execution of 'add' insn. |
@@ -1433,10 +1464,134 @@ static int evaluate_reg_imm_alu(struct verifier_env *env, struct bpf_insn *insn) | |||
1433 | return 0; | 1464 | return 0; |
1434 | } | 1465 | } |
1435 | 1466 | ||
1467 | static void check_reg_overflow(struct bpf_reg_state *reg) | ||
1468 | { | ||
1469 | if (reg->max_value > BPF_REGISTER_MAX_RANGE) | ||
1470 | reg->max_value = BPF_REGISTER_MAX_RANGE; | ||
1471 | if (reg->min_value < BPF_REGISTER_MIN_RANGE || | ||
1472 | reg->min_value > BPF_REGISTER_MAX_RANGE) | ||
1473 | reg->min_value = BPF_REGISTER_MIN_RANGE; | ||
1474 | } | ||
1475 | |||
1476 | static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, | ||
1477 | struct bpf_insn *insn) | ||
1478 | { | ||
1479 | struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg; | ||
1480 | s64 min_val = BPF_REGISTER_MIN_RANGE; | ||
1481 | u64 max_val = BPF_REGISTER_MAX_RANGE; | ||
1482 | bool min_set = false, max_set = false; | ||
1483 | u8 opcode = BPF_OP(insn->code); | ||
1484 | |||
1485 | dst_reg = ®s[insn->dst_reg]; | ||
1486 | if (BPF_SRC(insn->code) == BPF_X) { | ||
1487 | check_reg_overflow(®s[insn->src_reg]); | ||
1488 | min_val = regs[insn->src_reg].min_value; | ||
1489 | max_val = regs[insn->src_reg].max_value; | ||
1490 | |||
1491 | /* If the source register is a random pointer then the | ||
1492 | * min_value/max_value values represent the range of the known | ||
1493 | * accesses into that value, not the actual min/max value of the | ||
1494 | * register itself. In this case we have to reset the reg range | ||
1495 | * values so we know it is not safe to look at. | ||
1496 | */ | ||
1497 | if (regs[insn->src_reg].type != CONST_IMM && | ||
1498 | regs[insn->src_reg].type != UNKNOWN_VALUE) { | ||
1499 | min_val = BPF_REGISTER_MIN_RANGE; | ||
1500 | max_val = BPF_REGISTER_MAX_RANGE; | ||
1501 | } | ||
1502 | } else if (insn->imm < BPF_REGISTER_MAX_RANGE && | ||
1503 | (s64)insn->imm > BPF_REGISTER_MIN_RANGE) { | ||
1504 | min_val = max_val = insn->imm; | ||
1505 | min_set = max_set = true; | ||
1506 | } | ||
1507 | |||
1508 | /* We don't know anything about what was done to this register, mark it | ||
1509 | * as unknown. | ||
1510 | */ | ||
1511 | if (min_val == BPF_REGISTER_MIN_RANGE && | ||
1512 | max_val == BPF_REGISTER_MAX_RANGE) { | ||
1513 | reset_reg_range_values(regs, insn->dst_reg); | ||
1514 | return; | ||
1515 | } | ||
1516 | |||
1517 | /* If one of our values was at the end of our ranges then we can't just | ||
1518 | * do our normal operations to the register, we need to set the values | ||
1519 | * to the min/max since they are undefined. | ||
1520 | */ | ||
1521 | if (min_val == BPF_REGISTER_MIN_RANGE) | ||
1522 | dst_reg->min_value = BPF_REGISTER_MIN_RANGE; | ||
1523 | if (max_val == BPF_REGISTER_MAX_RANGE) | ||
1524 | dst_reg->max_value = BPF_REGISTER_MAX_RANGE; | ||
1525 | |||
1526 | switch (opcode) { | ||
1527 | case BPF_ADD: | ||
1528 | if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) | ||
1529 | dst_reg->min_value += min_val; | ||
1530 | if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) | ||
1531 | dst_reg->max_value += max_val; | ||
1532 | break; | ||
1533 | case BPF_SUB: | ||
1534 | if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) | ||
1535 | dst_reg->min_value -= min_val; | ||
1536 | if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) | ||
1537 | dst_reg->max_value -= max_val; | ||
1538 | break; | ||
1539 | case BPF_MUL: | ||
1540 | if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) | ||
1541 | dst_reg->min_value *= min_val; | ||
1542 | if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) | ||
1543 | dst_reg->max_value *= max_val; | ||
1544 | break; | ||
1545 | case BPF_AND: | ||
1546 | /* Disallow AND'ing of negative numbers, ain't nobody got time | ||
1547 | * for that. Otherwise the minimum is 0 and the max is the max | ||
1548 | * value we could AND against. | ||
1549 | */ | ||
1550 | if (min_val < 0) | ||
1551 | dst_reg->min_value = BPF_REGISTER_MIN_RANGE; | ||
1552 | else | ||
1553 | dst_reg->min_value = 0; | ||
1554 | dst_reg->max_value = max_val; | ||
1555 | break; | ||
1556 | case BPF_LSH: | ||
1557 | /* Gotta have special overflow logic here, if we're shifting | ||
1558 | * more than MAX_RANGE then just assume we have an invalid | ||
1559 | * range. | ||
1560 | */ | ||
1561 | if (min_val > ilog2(BPF_REGISTER_MAX_RANGE)) | ||
1562 | dst_reg->min_value = BPF_REGISTER_MIN_RANGE; | ||
1563 | else if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) | ||
1564 | dst_reg->min_value <<= min_val; | ||
1565 | |||
1566 | if (max_val > ilog2(BPF_REGISTER_MAX_RANGE)) | ||
1567 | dst_reg->max_value = BPF_REGISTER_MAX_RANGE; | ||
1568 | else if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) | ||
1569 | dst_reg->max_value <<= max_val; | ||
1570 | break; | ||
1571 | case BPF_RSH: | ||
1572 | /* RSH by a negative number is undefined, and the BPF_RSH is an | ||
1573 | * unsigned shift, so make the appropriate casts. | ||
1574 | */ | ||
1575 | if (min_val < 0 || dst_reg->min_value < 0) | ||
1576 | dst_reg->min_value = BPF_REGISTER_MIN_RANGE; | ||
1577 | else | ||
1578 | dst_reg->min_value = | ||
1579 | (u64)(dst_reg->min_value) >> min_val; | ||
1580 | if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) | ||
1581 | dst_reg->max_value >>= max_val; | ||
1582 | break; | ||
1583 | default: | ||
1584 | reset_reg_range_values(regs, insn->dst_reg); | ||
1585 | break; | ||
1586 | } | ||
1587 | |||
1588 | check_reg_overflow(dst_reg); | ||
1589 | } | ||
1590 | |||
1436 | /* check validity of 32-bit and 64-bit arithmetic operations */ | 1591 | /* check validity of 32-bit and 64-bit arithmetic operations */ |
1437 | static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) | 1592 | static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) |
1438 | { | 1593 | { |
1439 | struct reg_state *regs = env->cur_state.regs, *dst_reg; | 1594 | struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg; |
1440 | u8 opcode = BPF_OP(insn->code); | 1595 | u8 opcode = BPF_OP(insn->code); |
1441 | int err; | 1596 | int err; |
1442 | 1597 | ||
@@ -1496,6 +1651,11 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) | |||
1496 | if (err) | 1651 | if (err) |
1497 | return err; | 1652 | return err; |
1498 | 1653 | ||
1654 | /* we are setting our register to something new, we need to | ||
1655 | * reset its range values. | ||
1656 | */ | ||
1657 | reset_reg_range_values(regs, insn->dst_reg); | ||
1658 | |||
1499 | if (BPF_SRC(insn->code) == BPF_X) { | 1659 | if (BPF_SRC(insn->code) == BPF_X) { |
1500 | if (BPF_CLASS(insn->code) == BPF_ALU64) { | 1660 | if (BPF_CLASS(insn->code) == BPF_ALU64) { |
1501 | /* case: R1 = R2 | 1661 | /* case: R1 = R2 |
@@ -1517,6 +1677,8 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) | |||
1517 | */ | 1677 | */ |
1518 | regs[insn->dst_reg].type = CONST_IMM; | 1678 | regs[insn->dst_reg].type = CONST_IMM; |
1519 | regs[insn->dst_reg].imm = insn->imm; | 1679 | regs[insn->dst_reg].imm = insn->imm; |
1680 | regs[insn->dst_reg].max_value = insn->imm; | ||
1681 | regs[insn->dst_reg].min_value = insn->imm; | ||
1520 | } | 1682 | } |
1521 | 1683 | ||
1522 | } else if (opcode > BPF_END) { | 1684 | } else if (opcode > BPF_END) { |
@@ -1569,6 +1731,9 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) | |||
1569 | 1731 | ||
1570 | dst_reg = ®s[insn->dst_reg]; | 1732 | dst_reg = ®s[insn->dst_reg]; |
1571 | 1733 | ||
1734 | /* first we want to adjust our ranges. */ | ||
1735 | adjust_reg_min_max_vals(env, insn); | ||
1736 | |||
1572 | /* pattern match 'bpf_add Rx, imm' instruction */ | 1737 | /* pattern match 'bpf_add Rx, imm' instruction */ |
1573 | if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 && | 1738 | if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 && |
1574 | dst_reg->type == FRAME_PTR && BPF_SRC(insn->code) == BPF_K) { | 1739 | dst_reg->type == FRAME_PTR && BPF_SRC(insn->code) == BPF_K) { |
@@ -1603,28 +1768,58 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) | |||
1603 | return -EACCES; | 1768 | return -EACCES; |
1604 | } | 1769 | } |
1605 | 1770 | ||
1606 | /* mark dest operand */ | 1771 | /* If we did pointer math on a map value then just set it to our |
1607 | mark_reg_unknown_value(regs, insn->dst_reg); | 1772 | * PTR_TO_MAP_VALUE_ADJ type so we can deal with any stores or |
1773 | * loads to this register appropriately, otherwise just mark the | ||
1774 | * register as unknown. | ||
1775 | */ | ||
1776 | if (env->allow_ptr_leaks && | ||
1777 | (dst_reg->type == PTR_TO_MAP_VALUE || | ||
1778 | dst_reg->type == PTR_TO_MAP_VALUE_ADJ)) | ||
1779 | dst_reg->type = PTR_TO_MAP_VALUE_ADJ; | ||
1780 | else | ||
1781 | mark_reg_unknown_value(regs, insn->dst_reg); | ||
1608 | } | 1782 | } |
1609 | 1783 | ||
1610 | return 0; | 1784 | return 0; |
1611 | } | 1785 | } |
1612 | 1786 | ||
1613 | static void find_good_pkt_pointers(struct verifier_env *env, | 1787 | static void find_good_pkt_pointers(struct bpf_verifier_state *state, |
1614 | struct reg_state *dst_reg) | 1788 | struct bpf_reg_state *dst_reg) |
1615 | { | 1789 | { |
1616 | struct verifier_state *state = &env->cur_state; | 1790 | struct bpf_reg_state *regs = state->regs, *reg; |
1617 | struct reg_state *regs = state->regs, *reg; | ||
1618 | int i; | 1791 | int i; |
1619 | /* r2 = r3; | 1792 | |
1620 | * r2 += 8 | 1793 | /* LLVM can generate two kind of checks: |
1621 | * if (r2 > pkt_end) goto somewhere | 1794 | * |
1622 | * r2 == dst_reg, pkt_end == src_reg, | 1795 | * Type 1: |
1623 | * r2=pkt(id=n,off=8,r=0) | 1796 | * |
1624 | * r3=pkt(id=n,off=0,r=0) | 1797 | * r2 = r3; |
1625 | * find register r3 and mark its range as r3=pkt(id=n,off=0,r=8) | 1798 | * r2 += 8; |
1626 | * so that range of bytes [r3, r3 + 8) is safe to access | 1799 | * if (r2 > pkt_end) goto <handle exception> |
1800 | * <access okay> | ||
1801 | * | ||
1802 | * Where: | ||
1803 | * r2 == dst_reg, pkt_end == src_reg | ||
1804 | * r2=pkt(id=n,off=8,r=0) | ||
1805 | * r3=pkt(id=n,off=0,r=0) | ||
1806 | * | ||
1807 | * Type 2: | ||
1808 | * | ||
1809 | * r2 = r3; | ||
1810 | * r2 += 8; | ||
1811 | * if (pkt_end >= r2) goto <access okay> | ||
1812 | * <handle exception> | ||
1813 | * | ||
1814 | * Where: | ||
1815 | * pkt_end == dst_reg, r2 == src_reg | ||
1816 | * r2=pkt(id=n,off=8,r=0) | ||
1817 | * r3=pkt(id=n,off=0,r=0) | ||
1818 | * | ||
1819 | * Find register r3 and mark its range as r3=pkt(id=n,off=0,r=8) | ||
1820 | * so that range of bytes [r3, r3 + 8) is safe to access. | ||
1627 | */ | 1821 | */ |
1822 | |||
1628 | for (i = 0; i < MAX_BPF_REG; i++) | 1823 | for (i = 0; i < MAX_BPF_REG; i++) |
1629 | if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id) | 1824 | if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id) |
1630 | regs[i].range = dst_reg->off; | 1825 | regs[i].range = dst_reg->off; |
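The rewritten comment above spells out the second bounds-check shape that find_good_pkt_pointers() now recognizes. Below is a minimal sketch of C that tends to compile to the "Type 2" form (pkt_end compared on the left, access on the taken branch), assuming a tc classifier with direct packet access; the section name, header choices and the 8-byte window are illustrative, and the exact instruction LLVM emits still depends on the compiler version.

#include <uapi/linux/bpf.h>
#include "bpf_helpers.h"	/* SEC() macro (samples/bpf) */

SEC("classifier")
int parse_pkt(struct __sk_buff *skb)
{
	void *data     = (void *)(long)skb->data;
	void *data_end = (void *)(long)skb->data_end;
	unsigned char *p = data;

	/* Type 2 shape: "if (pkt_end >= r2) goto <access okay>" */
	if (data_end >= (void *)(p + 8))
		/* bytes [p, p + 8) are provably inside the packet here */
		return p[7];

	return 0;
}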
@@ -1638,11 +1833,109 @@ static void find_good_pkt_pointers(struct verifier_env *env, | |||
1638 | } | 1833 | } |
1639 | } | 1834 | } |
1640 | 1835 | ||
1641 | static int check_cond_jmp_op(struct verifier_env *env, | 1836 | /* Adjusts the register min/max values in the case that the dst_reg is the |
1837 | * variable register that we are working on, and src_reg is a constant or we're | ||
1838 | * simply doing a BPF_K check. | ||
1839 | */ | ||
1840 | static void reg_set_min_max(struct bpf_reg_state *true_reg, | ||
1841 | struct bpf_reg_state *false_reg, u64 val, | ||
1842 | u8 opcode) | ||
1843 | { | ||
1844 | switch (opcode) { | ||
1845 | case BPF_JEQ: | ||
1846 | /* If this is false then we know nothing Jon Snow, but if it is | ||
1847 | * true then we know for sure. | ||
1848 | */ | ||
1849 | true_reg->max_value = true_reg->min_value = val; | ||
1850 | break; | ||
1851 | case BPF_JNE: | ||
1852 | /* If this is true we know nothing Jon Snow, but if it is false | ||
1853 | * we know the value for sure; | ||
1854 | */ | ||
1855 | false_reg->max_value = false_reg->min_value = val; | ||
1856 | break; | ||
1857 | case BPF_JGT: | ||
1858 | /* Unsigned comparison, the minimum value is 0. */ | ||
1859 | false_reg->min_value = 0; | ||
1860 | case BPF_JSGT: | ||
1861 | /* If this is false then we know the maximum val is val, | ||
1862 | * otherwise we know the min val is val+1. | ||
1863 | */ | ||
1864 | false_reg->max_value = val; | ||
1865 | true_reg->min_value = val + 1; | ||
1866 | break; | ||
1867 | case BPF_JGE: | ||
1868 | /* Unsigned comparison, the minimum value is 0. */ | ||
1869 | false_reg->min_value = 0; | ||
1870 | case BPF_JSGE: | ||
1871 | /* If this is false then we know the maximum value is val - 1, | ||
1872 | * otherwise we know the minimum value is val. | ||
1873 | */ | ||
1874 | false_reg->max_value = val - 1; | ||
1875 | true_reg->min_value = val; | ||
1876 | break; | ||
1877 | default: | ||
1878 | break; | ||
1879 | } | ||
1880 | |||
1881 | check_reg_overflow(false_reg); | ||
1882 | check_reg_overflow(true_reg); | ||
1883 | } | ||
1884 | |||
1885 | /* Same as above, but for the case that dst_reg is a CONST_IMM reg and src_reg | ||
1886 | * is the variable reg. | ||
1887 | */ | ||
1888 | static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, | ||
1889 | struct bpf_reg_state *false_reg, u64 val, | ||
1890 | u8 opcode) | ||
1891 | { | ||
1892 | switch (opcode) { | ||
1893 | case BPF_JEQ: | ||
1894 | /* If this is false then we know nothing Jon Snow, but if it is | ||
1895 | * true then we know for sure. | ||
1896 | */ | ||
1897 | true_reg->max_value = true_reg->min_value = val; | ||
1898 | break; | ||
1899 | case BPF_JNE: | ||
1900 | /* If this is true we know nothing Jon Snow, but if it is false | ||
1901 | * we know the value for sure; | ||
1902 | */ | ||
1903 | false_reg->max_value = false_reg->min_value = val; | ||
1904 | break; | ||
1905 | case BPF_JGT: | ||
1906 | /* Unsigned comparison, the minimum value is 0. */ | ||
1907 | true_reg->min_value = 0; | ||
1908 | case BPF_JSGT: | ||
1909 | /* | ||
1910 | * If this is false, then the val is <= the register; if it is | ||
1911 | * true, the register is < the val. | ||
1912 | */ | ||
1913 | false_reg->min_value = val; | ||
1914 | true_reg->max_value = val - 1; | ||
1915 | break; | ||
1916 | case BPF_JGE: | ||
1917 | /* Unsigned comparison, the minimum value is 0. */ | ||
1918 | true_reg->min_value = 0; | ||
1919 | case BPF_JSGE: | ||
1920 | /* If this is false then constant < register, if it is true then | ||
1921 | * the register < constant. | ||
1922 | */ | ||
1923 | false_reg->min_value = val + 1; | ||
1924 | true_reg->max_value = val; | ||
1925 | break; | ||
1926 | default: | ||
1927 | break; | ||
1928 | } | ||
1929 | |||
1930 | check_reg_overflow(false_reg); | ||
1931 | check_reg_overflow(true_reg); | ||
1932 | } | ||
1933 | |||
1934 | static int check_cond_jmp_op(struct bpf_verifier_env *env, | ||
1642 | struct bpf_insn *insn, int *insn_idx) | 1935 | struct bpf_insn *insn, int *insn_idx) |
1643 | { | 1936 | { |
1644 | struct reg_state *regs = env->cur_state.regs, *dst_reg; | 1937 | struct bpf_verifier_state *other_branch, *this_branch = &env->cur_state; |
1645 | struct verifier_state *other_branch; | 1938 | struct bpf_reg_state *regs = this_branch->regs, *dst_reg; |
1646 | u8 opcode = BPF_OP(insn->code); | 1939 | u8 opcode = BPF_OP(insn->code); |
1647 | int err; | 1940 | int err; |
1648 | 1941 | ||
@@ -1704,7 +1997,24 @@ static int check_cond_jmp_op(struct verifier_env *env, | |||
1704 | if (!other_branch) | 1997 | if (!other_branch) |
1705 | return -EFAULT; | 1998 | return -EFAULT; |
1706 | 1999 | ||
1707 | /* detect if R == 0 where R is returned value from bpf_map_lookup_elem() */ | 2000 | /* detect if we are comparing against a constant value so we can adjust |
2001 | * our min/max values for our dst register. | ||
2002 | */ | ||
2003 | if (BPF_SRC(insn->code) == BPF_X) { | ||
2004 | if (regs[insn->src_reg].type == CONST_IMM) | ||
2005 | reg_set_min_max(&other_branch->regs[insn->dst_reg], | ||
2006 | dst_reg, regs[insn->src_reg].imm, | ||
2007 | opcode); | ||
2008 | else if (dst_reg->type == CONST_IMM) | ||
2009 | reg_set_min_max_inv(&other_branch->regs[insn->src_reg], | ||
2010 | ®s[insn->src_reg], dst_reg->imm, | ||
2011 | opcode); | ||
2012 | } else { | ||
2013 | reg_set_min_max(&other_branch->regs[insn->dst_reg], | ||
2014 | dst_reg, insn->imm, opcode); | ||
2015 | } | ||
2016 | |||
2017 | /* detect if R == 0 where R is returned from bpf_map_lookup_elem() */ | ||
1708 | if (BPF_SRC(insn->code) == BPF_K && | 2018 | if (BPF_SRC(insn->code) == BPF_K && |
1709 | insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) && | 2019 | insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) && |
1710 | dst_reg->type == PTR_TO_MAP_VALUE_OR_NULL) { | 2020 | dst_reg->type == PTR_TO_MAP_VALUE_OR_NULL) { |
@@ -1723,13 +2033,17 @@ static int check_cond_jmp_op(struct verifier_env *env, | |||
1723 | } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && | 2033 | } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && |
1724 | dst_reg->type == PTR_TO_PACKET && | 2034 | dst_reg->type == PTR_TO_PACKET && |
1725 | regs[insn->src_reg].type == PTR_TO_PACKET_END) { | 2035 | regs[insn->src_reg].type == PTR_TO_PACKET_END) { |
1726 | find_good_pkt_pointers(env, dst_reg); | 2036 | find_good_pkt_pointers(this_branch, dst_reg); |
2037 | } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE && | ||
2038 | dst_reg->type == PTR_TO_PACKET_END && | ||
2039 | regs[insn->src_reg].type == PTR_TO_PACKET) { | ||
2040 | find_good_pkt_pointers(other_branch, ®s[insn->src_reg]); | ||
1727 | } else if (is_pointer_value(env, insn->dst_reg)) { | 2041 | } else if (is_pointer_value(env, insn->dst_reg)) { |
1728 | verbose("R%d pointer comparison prohibited\n", insn->dst_reg); | 2042 | verbose("R%d pointer comparison prohibited\n", insn->dst_reg); |
1729 | return -EACCES; | 2043 | return -EACCES; |
1730 | } | 2044 | } |
1731 | if (log_level) | 2045 | if (log_level) |
1732 | print_verifier_state(&env->cur_state); | 2046 | print_verifier_state(this_branch); |
1733 | return 0; | 2047 | return 0; |
1734 | } | 2048 | } |
1735 | 2049 | ||
@@ -1742,9 +2056,9 @@ static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn) | |||
1742 | } | 2056 | } |
1743 | 2057 | ||
1744 | /* verify BPF_LD_IMM64 instruction */ | 2058 | /* verify BPF_LD_IMM64 instruction */ |
1745 | static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn) | 2059 | static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) |
1746 | { | 2060 | { |
1747 | struct reg_state *regs = env->cur_state.regs; | 2061 | struct bpf_reg_state *regs = env->cur_state.regs; |
1748 | int err; | 2062 | int err; |
1749 | 2063 | ||
1750 | if (BPF_SIZE(insn->code) != BPF_DW) { | 2064 | if (BPF_SIZE(insn->code) != BPF_DW) { |
@@ -1760,9 +2074,19 @@ static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn) | |||
1760 | if (err) | 2074 | if (err) |
1761 | return err; | 2075 | return err; |
1762 | 2076 | ||
1763 | if (insn->src_reg == 0) | 2077 | if (insn->src_reg == 0) { |
1764 | /* generic move 64-bit immediate into a register */ | 2078 | /* generic move 64-bit immediate into a register, |
2079 | * only analyzer needs to collect the ld_imm value. | ||
2080 | */ | ||
2081 | u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; | ||
2082 | |||
2083 | if (!env->analyzer_ops) | ||
2084 | return 0; | ||
2085 | |||
2086 | regs[insn->dst_reg].type = CONST_IMM; | ||
2087 | regs[insn->dst_reg].imm = imm; | ||
1765 | return 0; | 2088 | return 0; |
2089 | } | ||
1766 | 2090 | ||
1767 | /* replace_map_fd_with_map_ptr() should have caught bad ld_imm64 */ | 2091 | /* replace_map_fd_with_map_ptr() should have caught bad ld_imm64 */ |
1768 | BUG_ON(insn->src_reg != BPF_PSEUDO_MAP_FD); | 2092 | BUG_ON(insn->src_reg != BPF_PSEUDO_MAP_FD); |
@@ -1799,11 +2123,11 @@ static bool may_access_skb(enum bpf_prog_type type) | |||
1799 | * Output: | 2123 | * Output: |
1800 | * R0 - 8/16/32-bit skb data converted to cpu endianness | 2124 | * R0 - 8/16/32-bit skb data converted to cpu endianness |
1801 | */ | 2125 | */ |
1802 | static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn) | 2126 | static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) |
1803 | { | 2127 | { |
1804 | struct reg_state *regs = env->cur_state.regs; | 2128 | struct bpf_reg_state *regs = env->cur_state.regs; |
1805 | u8 mode = BPF_MODE(insn->code); | 2129 | u8 mode = BPF_MODE(insn->code); |
1806 | struct reg_state *reg; | 2130 | struct bpf_reg_state *reg; |
1807 | int i, err; | 2131 | int i, err; |
1808 | 2132 | ||
1809 | if (!may_access_skb(env->prog->type)) { | 2133 | if (!may_access_skb(env->prog->type)) { |
@@ -1889,7 +2213,7 @@ enum { | |||
1889 | BRANCH = 2, | 2213 | BRANCH = 2, |
1890 | }; | 2214 | }; |
1891 | 2215 | ||
1892 | #define STATE_LIST_MARK ((struct verifier_state_list *) -1L) | 2216 | #define STATE_LIST_MARK ((struct bpf_verifier_state_list *) -1L) |
1893 | 2217 | ||
1894 | static int *insn_stack; /* stack of insns to process */ | 2218 | static int *insn_stack; /* stack of insns to process */ |
1895 | static int cur_stack; /* current stack index */ | 2219 | static int cur_stack; /* current stack index */ |
@@ -1900,7 +2224,7 @@ static int *insn_state; | |||
1900 | * w - next instruction | 2224 | * w - next instruction |
1901 | * e - edge | 2225 | * e - edge |
1902 | */ | 2226 | */ |
1903 | static int push_insn(int t, int w, int e, struct verifier_env *env) | 2227 | static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) |
1904 | { | 2228 | { |
1905 | if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH)) | 2229 | if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH)) |
1906 | return 0; | 2230 | return 0; |
@@ -1941,7 +2265,7 @@ static int push_insn(int t, int w, int e, struct verifier_env *env) | |||
1941 | /* non-recursive depth-first-search to detect loops in BPF program | 2265 | /* non-recursive depth-first-search to detect loops in BPF program |
1942 | * loop == back-edge in directed graph | 2266 | * loop == back-edge in directed graph |
1943 | */ | 2267 | */ |
1944 | static int check_cfg(struct verifier_env *env) | 2268 | static int check_cfg(struct bpf_verifier_env *env) |
1945 | { | 2269 | { |
1946 | struct bpf_insn *insns = env->prog->insnsi; | 2270 | struct bpf_insn *insns = env->prog->insnsi; |
1947 | int insn_cnt = env->prog->len; | 2271 | int insn_cnt = env->prog->len; |
@@ -2050,7 +2374,8 @@ err_free: | |||
2050 | /* the following conditions reduce the number of explored insns | 2374 | /* the following conditions reduce the number of explored insns |
2051 | * from ~140k to ~80k for ultra large programs that use a lot of ptr_to_packet | 2375 | * from ~140k to ~80k for ultra large programs that use a lot of ptr_to_packet |
2052 | */ | 2376 | */ |
2053 | static bool compare_ptrs_to_packet(struct reg_state *old, struct reg_state *cur) | 2377 | static bool compare_ptrs_to_packet(struct bpf_reg_state *old, |
2378 | struct bpf_reg_state *cur) | ||
2054 | { | 2379 | { |
2055 | if (old->id != cur->id) | 2380 | if (old->id != cur->id) |
2056 | return false; | 2381 | return false; |
@@ -2125,9 +2450,11 @@ static bool compare_ptrs_to_packet(struct reg_state *old, struct reg_state *cur) | |||
2125 | * whereas register type in current state is meaningful, it means that | 2450 | * whereas register type in current state is meaningful, it means that |
2126 | * the current state will reach 'bpf_exit' instruction safely | 2451 | * the current state will reach 'bpf_exit' instruction safely |
2127 | */ | 2452 | */ |
2128 | static bool states_equal(struct verifier_state *old, struct verifier_state *cur) | 2453 | static bool states_equal(struct bpf_verifier_env *env, |
2454 | struct bpf_verifier_state *old, | ||
2455 | struct bpf_verifier_state *cur) | ||
2129 | { | 2456 | { |
2130 | struct reg_state *rold, *rcur; | 2457 | struct bpf_reg_state *rold, *rcur; |
2131 | int i; | 2458 | int i; |
2132 | 2459 | ||
2133 | for (i = 0; i < MAX_BPF_REG; i++) { | 2460 | for (i = 0; i < MAX_BPF_REG; i++) { |
@@ -2137,6 +2464,13 @@ static bool states_equal(struct verifier_state *old, struct verifier_state *cur) | |||
2137 | if (memcmp(rold, rcur, sizeof(*rold)) == 0) | 2464 | if (memcmp(rold, rcur, sizeof(*rold)) == 0) |
2138 | continue; | 2465 | continue; |
2139 | 2466 | ||
2467 | /* If the ranges were not the same, but everything else was and | ||
2468 | * we didn't do a variable access into a map then we are a-ok. | ||
2469 | */ | ||
2470 | if (!env->varlen_map_value_access && | ||
2471 | rold->type == rcur->type && rold->imm == rcur->imm) | ||
2472 | continue; | ||
2473 | |||
2140 | if (rold->type == NOT_INIT || | 2474 | if (rold->type == NOT_INIT || |
2141 | (rold->type == UNKNOWN_VALUE && rcur->type != NOT_INIT)) | 2475 | (rold->type == UNKNOWN_VALUE && rcur->type != NOT_INIT)) |
2142 | continue; | 2476 | continue; |
@@ -2167,9 +2501,9 @@ static bool states_equal(struct verifier_state *old, struct verifier_state *cur) | |||
2167 | * the same, check that stored pointers types | 2501 | * the same, check that stored pointers types |
2168 | * are the same as well. | 2502 | * are the same as well. |
2169 | * Ex: explored safe path could have stored | 2503 | * Ex: explored safe path could have stored |
2170 | * (struct reg_state) {.type = PTR_TO_STACK, .imm = -8} | 2504 | * (bpf_reg_state) {.type = PTR_TO_STACK, .imm = -8} |
2171 | * but current path has stored: | 2505 | * but current path has stored: |
2172 | * (struct reg_state) {.type = PTR_TO_STACK, .imm = -16} | 2506 | * (bpf_reg_state) {.type = PTR_TO_STACK, .imm = -16} |
2173 | * such verifier states are not equivalent. | 2507 | * such verifier states are not equivalent. |
2174 | * return false to continue verification of this path | 2508 | * return false to continue verification of this path |
2175 | */ | 2509 | */ |
@@ -2180,10 +2514,10 @@ static bool states_equal(struct verifier_state *old, struct verifier_state *cur) | |||
2180 | return true; | 2514 | return true; |
2181 | } | 2515 | } |
2182 | 2516 | ||
2183 | static int is_state_visited(struct verifier_env *env, int insn_idx) | 2517 | static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) |
2184 | { | 2518 | { |
2185 | struct verifier_state_list *new_sl; | 2519 | struct bpf_verifier_state_list *new_sl; |
2186 | struct verifier_state_list *sl; | 2520 | struct bpf_verifier_state_list *sl; |
2187 | 2521 | ||
2188 | sl = env->explored_states[insn_idx]; | 2522 | sl = env->explored_states[insn_idx]; |
2189 | if (!sl) | 2523 | if (!sl) |
@@ -2193,7 +2527,7 @@ static int is_state_visited(struct verifier_env *env, int insn_idx) | |||
2193 | return 0; | 2527 | return 0; |
2194 | 2528 | ||
2195 | while (sl != STATE_LIST_MARK) { | 2529 | while (sl != STATE_LIST_MARK) { |
2196 | if (states_equal(&sl->state, &env->cur_state)) | 2530 | if (states_equal(env, &sl->state, &env->cur_state)) |
2197 | /* reached equivalent register/stack state, | 2531 | /* reached equivalent register/stack state, |
2198 | * prune the search | 2532 | * prune the search |
2199 | */ | 2533 | */ |
@@ -2207,7 +2541,7 @@ static int is_state_visited(struct verifier_env *env, int insn_idx) | |||
2207 | * it will be rejected. Since there are no loops, we won't be | 2541 | * it will be rejected. Since there are no loops, we won't be |
2208 | * seeing this 'insn_idx' instruction again on the way to bpf_exit | 2542 | * seeing this 'insn_idx' instruction again on the way to bpf_exit |
2209 | */ | 2543 | */ |
2210 | new_sl = kmalloc(sizeof(struct verifier_state_list), GFP_USER); | 2544 | new_sl = kmalloc(sizeof(struct bpf_verifier_state_list), GFP_USER); |
2211 | if (!new_sl) | 2545 | if (!new_sl) |
2212 | return -ENOMEM; | 2546 | return -ENOMEM; |
2213 | 2547 | ||
@@ -2218,11 +2552,20 @@ static int is_state_visited(struct verifier_env *env, int insn_idx) | |||
2218 | return 0; | 2552 | return 0; |
2219 | } | 2553 | } |
2220 | 2554 | ||
2221 | static int do_check(struct verifier_env *env) | 2555 | static int ext_analyzer_insn_hook(struct bpf_verifier_env *env, |
2556 | int insn_idx, int prev_insn_idx) | ||
2557 | { | ||
2558 | if (!env->analyzer_ops || !env->analyzer_ops->insn_hook) | ||
2559 | return 0; | ||
2560 | |||
2561 | return env->analyzer_ops->insn_hook(env, insn_idx, prev_insn_idx); | ||
2562 | } | ||
2563 | |||
2564 | static int do_check(struct bpf_verifier_env *env) | ||
2222 | { | 2565 | { |
2223 | struct verifier_state *state = &env->cur_state; | 2566 | struct bpf_verifier_state *state = &env->cur_state; |
2224 | struct bpf_insn *insns = env->prog->insnsi; | 2567 | struct bpf_insn *insns = env->prog->insnsi; |
2225 | struct reg_state *regs = state->regs; | 2568 | struct bpf_reg_state *regs = state->regs; |
2226 | int insn_cnt = env->prog->len; | 2569 | int insn_cnt = env->prog->len; |
2227 | int insn_idx, prev_insn_idx = 0; | 2570 | int insn_idx, prev_insn_idx = 0; |
2228 | int insn_processed = 0; | 2571 | int insn_processed = 0; |
@@ -2230,6 +2573,7 @@ static int do_check(struct verifier_env *env) | |||
2230 | 2573 | ||
2231 | init_reg_state(regs); | 2574 | init_reg_state(regs); |
2232 | insn_idx = 0; | 2575 | insn_idx = 0; |
2576 | env->varlen_map_value_access = false; | ||
2233 | for (;;) { | 2577 | for (;;) { |
2234 | struct bpf_insn *insn; | 2578 | struct bpf_insn *insn; |
2235 | u8 class; | 2579 | u8 class; |
@@ -2276,13 +2620,17 @@ static int do_check(struct verifier_env *env) | |||
2276 | print_bpf_insn(insn); | 2620 | print_bpf_insn(insn); |
2277 | } | 2621 | } |
2278 | 2622 | ||
2623 | err = ext_analyzer_insn_hook(env, insn_idx, prev_insn_idx); | ||
2624 | if (err) | ||
2625 | return err; | ||
2626 | |||
2279 | if (class == BPF_ALU || class == BPF_ALU64) { | 2627 | if (class == BPF_ALU || class == BPF_ALU64) { |
2280 | err = check_alu_op(env, insn); | 2628 | err = check_alu_op(env, insn); |
2281 | if (err) | 2629 | if (err) |
2282 | return err; | 2630 | return err; |
2283 | 2631 | ||
2284 | } else if (class == BPF_LDX) { | 2632 | } else if (class == BPF_LDX) { |
2285 | enum bpf_reg_type src_reg_type; | 2633 | enum bpf_reg_type *prev_src_type, src_reg_type; |
2286 | 2634 | ||
2287 | /* check for reserved fields is already done */ | 2635 | /* check for reserved fields is already done */ |
2288 | 2636 | ||
@@ -2306,21 +2654,25 @@ static int do_check(struct verifier_env *env) | |||
2306 | if (err) | 2654 | if (err) |
2307 | return err; | 2655 | return err; |
2308 | 2656 | ||
2309 | if (BPF_SIZE(insn->code) != BPF_W) { | 2657 | reset_reg_range_values(regs, insn->dst_reg); |
2658 | if (BPF_SIZE(insn->code) != BPF_W && | ||
2659 | BPF_SIZE(insn->code) != BPF_DW) { | ||
2310 | insn_idx++; | 2660 | insn_idx++; |
2311 | continue; | 2661 | continue; |
2312 | } | 2662 | } |
2313 | 2663 | ||
2314 | if (insn->imm == 0) { | 2664 | prev_src_type = &env->insn_aux_data[insn_idx].ptr_type; |
2665 | |||
2666 | if (*prev_src_type == NOT_INIT) { | ||
2315 | /* saw a valid insn | 2667 | /* saw a valid insn |
2316 | * dst_reg = *(u32 *)(src_reg + off) | 2668 | * dst_reg = *(u32 *)(src_reg + off) |
2317 | * use reserved 'imm' field to mark this insn | 2669 | * save type to validate intersecting paths |
2318 | */ | 2670 | */ |
2319 | insn->imm = src_reg_type; | 2671 | *prev_src_type = src_reg_type; |
2320 | 2672 | ||
2321 | } else if (src_reg_type != insn->imm && | 2673 | } else if (src_reg_type != *prev_src_type && |
2322 | (src_reg_type == PTR_TO_CTX || | 2674 | (src_reg_type == PTR_TO_CTX || |
2323 | insn->imm == PTR_TO_CTX)) { | 2675 | *prev_src_type == PTR_TO_CTX)) { |
2324 | /* ABuser program is trying to use the same insn | 2676 | /* ABuser program is trying to use the same insn |
2325 | * dst_reg = *(u32*) (src_reg + off) | 2677 | * dst_reg = *(u32*) (src_reg + off) |
2326 | * with different pointer types: | 2678 | * with different pointer types: |
@@ -2333,7 +2685,7 @@ static int do_check(struct verifier_env *env) | |||
2333 | } | 2685 | } |
2334 | 2686 | ||
2335 | } else if (class == BPF_STX) { | 2687 | } else if (class == BPF_STX) { |
2336 | enum bpf_reg_type dst_reg_type; | 2688 | enum bpf_reg_type *prev_dst_type, dst_reg_type; |
2337 | 2689 | ||
2338 | if (BPF_MODE(insn->code) == BPF_XADD) { | 2690 | if (BPF_MODE(insn->code) == BPF_XADD) { |
2339 | err = check_xadd(env, insn); | 2691 | err = check_xadd(env, insn); |
@@ -2361,11 +2713,13 @@ static int do_check(struct verifier_env *env) | |||
2361 | if (err) | 2713 | if (err) |
2362 | return err; | 2714 | return err; |
2363 | 2715 | ||
2364 | if (insn->imm == 0) { | 2716 | prev_dst_type = &env->insn_aux_data[insn_idx].ptr_type; |
2365 | insn->imm = dst_reg_type; | 2717 | |
2366 | } else if (dst_reg_type != insn->imm && | 2718 | if (*prev_dst_type == NOT_INIT) { |
2719 | *prev_dst_type = dst_reg_type; | ||
2720 | } else if (dst_reg_type != *prev_dst_type && | ||
2367 | (dst_reg_type == PTR_TO_CTX || | 2721 | (dst_reg_type == PTR_TO_CTX || |
2368 | insn->imm == PTR_TO_CTX)) { | 2722 | *prev_dst_type == PTR_TO_CTX)) { |
2369 | verbose("same insn cannot be used with different pointers\n"); | 2723 | verbose("same insn cannot be used with different pointers\n"); |
2370 | return -EINVAL; | 2724 | return -EINVAL; |
2371 | } | 2725 | } |
@@ -2471,6 +2825,7 @@ process_bpf_exit: | |||
2471 | verbose("invalid BPF_LD mode\n"); | 2825 | verbose("invalid BPF_LD mode\n"); |
2472 | return -EINVAL; | 2826 | return -EINVAL; |
2473 | } | 2827 | } |
2828 | reset_reg_range_values(regs, insn->dst_reg); | ||
2474 | } else { | 2829 | } else { |
2475 | verbose("unknown insn class %d\n", class); | 2830 | verbose("unknown insn class %d\n", class); |
2476 | return -EINVAL; | 2831 | return -EINVAL; |
@@ -2483,14 +2838,28 @@ process_bpf_exit: | |||
2483 | return 0; | 2838 | return 0; |
2484 | } | 2839 | } |
2485 | 2840 | ||
2841 | static int check_map_prog_compatibility(struct bpf_map *map, | ||
2842 | struct bpf_prog *prog) | ||
2843 | |||
2844 | { | ||
2845 | if (prog->type == BPF_PROG_TYPE_PERF_EVENT && | ||
2846 | (map->map_type == BPF_MAP_TYPE_HASH || | ||
2847 | map->map_type == BPF_MAP_TYPE_PERCPU_HASH) && | ||
2848 | (map->map_flags & BPF_F_NO_PREALLOC)) { | ||
2849 | verbose("perf_event programs can only use preallocated hash map\n"); | ||
2850 | return -EINVAL; | ||
2851 | } | ||
2852 | return 0; | ||
2853 | } | ||
2854 | |||
2486 | /* look for pseudo eBPF instructions that access map FDs and | 2855 | /* look for pseudo eBPF instructions that access map FDs and |
2487 | * replace them with actual map pointers | 2856 | * replace them with actual map pointers |
2488 | */ | 2857 | */ |
2489 | static int replace_map_fd_with_map_ptr(struct verifier_env *env) | 2858 | static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) |
2490 | { | 2859 | { |
2491 | struct bpf_insn *insn = env->prog->insnsi; | 2860 | struct bpf_insn *insn = env->prog->insnsi; |
2492 | int insn_cnt = env->prog->len; | 2861 | int insn_cnt = env->prog->len; |
2493 | int i, j; | 2862 | int i, j, err; |
2494 | 2863 | ||
2495 | for (i = 0; i < insn_cnt; i++, insn++) { | 2864 | for (i = 0; i < insn_cnt; i++, insn++) { |
2496 | if (BPF_CLASS(insn->code) == BPF_LDX && | 2865 | if (BPF_CLASS(insn->code) == BPF_LDX && |
@@ -2534,6 +2903,12 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env) | |||
2534 | return PTR_ERR(map); | 2903 | return PTR_ERR(map); |
2535 | } | 2904 | } |
2536 | 2905 | ||
2906 | err = check_map_prog_compatibility(map, env->prog); | ||
2907 | if (err) { | ||
2908 | fdput(f); | ||
2909 | return err; | ||
2910 | } | ||
2911 | |||
2537 | /* store map pointer inside BPF_LD_IMM64 instruction */ | 2912 | /* store map pointer inside BPF_LD_IMM64 instruction */ |
2538 | insn[0].imm = (u32) (unsigned long) map; | 2913 | insn[0].imm = (u32) (unsigned long) map; |
2539 | insn[1].imm = ((u64) (unsigned long) map) >> 32; | 2914 | insn[1].imm = ((u64) (unsigned long) map) >> 32; |
@@ -2577,7 +2952,7 @@ next_insn: | |||
2577 | } | 2952 | } |
2578 | 2953 | ||
2579 | /* drop refcnt of maps used by the rejected program */ | 2954 | /* drop refcnt of maps used by the rejected program */ |
2580 | static void release_maps(struct verifier_env *env) | 2955 | static void release_maps(struct bpf_verifier_env *env) |
2581 | { | 2956 | { |
2582 | int i; | 2957 | int i; |
2583 | 2958 | ||
@@ -2586,7 +2961,7 @@ static void release_maps(struct verifier_env *env) | |||
2586 | } | 2961 | } |
2587 | 2962 | ||
2588 | /* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */ | 2963 | /* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */ |
2589 | static void convert_pseudo_ld_imm64(struct verifier_env *env) | 2964 | static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env) |
2590 | { | 2965 | { |
2591 | struct bpf_insn *insn = env->prog->insnsi; | 2966 | struct bpf_insn *insn = env->prog->insnsi; |
2592 | int insn_cnt = env->prog->len; | 2967 | int insn_cnt = env->prog->len; |
@@ -2600,62 +2975,74 @@ static void convert_pseudo_ld_imm64(struct verifier_env *env) | |||
2600 | /* convert load instructions that access fields of 'struct __sk_buff' | 2975 | /* convert load instructions that access fields of 'struct __sk_buff' |
2601 | * into sequence of instructions that access fields of 'struct sk_buff' | 2976 | * into sequence of instructions that access fields of 'struct sk_buff' |
2602 | */ | 2977 | */ |
2603 | static int convert_ctx_accesses(struct verifier_env *env) | 2978 | static int convert_ctx_accesses(struct bpf_verifier_env *env) |
2604 | { | 2979 | { |
2605 | struct bpf_insn *insn = env->prog->insnsi; | 2980 | const struct bpf_verifier_ops *ops = env->prog->aux->ops; |
2606 | int insn_cnt = env->prog->len; | 2981 | const int insn_cnt = env->prog->len; |
2607 | struct bpf_insn insn_buf[16]; | 2982 | struct bpf_insn insn_buf[16], *insn; |
2608 | struct bpf_prog *new_prog; | 2983 | struct bpf_prog *new_prog; |
2609 | enum bpf_access_type type; | 2984 | enum bpf_access_type type; |
2610 | int i; | 2985 | int i, cnt, delta = 0; |
2611 | 2986 | ||
2612 | if (!env->prog->aux->ops->convert_ctx_access) | 2987 | if (ops->gen_prologue) { |
2988 | cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, | ||
2989 | env->prog); | ||
2990 | if (cnt >= ARRAY_SIZE(insn_buf)) { | ||
2991 | verbose("bpf verifier is misconfigured\n"); | ||
2992 | return -EINVAL; | ||
2993 | } else if (cnt) { | ||
2994 | new_prog = bpf_patch_insn_single(env->prog, 0, | ||
2995 | insn_buf, cnt); | ||
2996 | if (!new_prog) | ||
2997 | return -ENOMEM; | ||
2998 | env->prog = new_prog; | ||
2999 | delta += cnt - 1; | ||
3000 | } | ||
3001 | } | ||
3002 | |||
3003 | if (!ops->convert_ctx_access) | ||
2613 | return 0; | 3004 | return 0; |
2614 | 3005 | ||
2615 | for (i = 0; i < insn_cnt; i++, insn++) { | 3006 | insn = env->prog->insnsi + delta; |
2616 | u32 insn_delta, cnt; | ||
2617 | 3007 | ||
2618 | if (insn->code == (BPF_LDX | BPF_MEM | BPF_W)) | 3008 | for (i = 0; i < insn_cnt; i++, insn++) { |
3009 | if (insn->code == (BPF_LDX | BPF_MEM | BPF_W) || | ||
3010 | insn->code == (BPF_LDX | BPF_MEM | BPF_DW)) | ||
2619 | type = BPF_READ; | 3011 | type = BPF_READ; |
2620 | else if (insn->code == (BPF_STX | BPF_MEM | BPF_W)) | 3012 | else if (insn->code == (BPF_STX | BPF_MEM | BPF_W) || |
3013 | insn->code == (BPF_STX | BPF_MEM | BPF_DW)) | ||
2621 | type = BPF_WRITE; | 3014 | type = BPF_WRITE; |
2622 | else | 3015 | else |
2623 | continue; | 3016 | continue; |
2624 | 3017 | ||
2625 | if (insn->imm != PTR_TO_CTX) { | 3018 | if (env->insn_aux_data[i].ptr_type != PTR_TO_CTX) |
2626 | /* clear internal mark */ | ||
2627 | insn->imm = 0; | ||
2628 | continue; | 3019 | continue; |
2629 | } | ||
2630 | 3020 | ||
2631 | cnt = env->prog->aux->ops-> | 3021 | cnt = ops->convert_ctx_access(type, insn->dst_reg, insn->src_reg, |
2632 | convert_ctx_access(type, insn->dst_reg, insn->src_reg, | 3022 | insn->off, insn_buf, env->prog); |
2633 | insn->off, insn_buf, env->prog); | ||
2634 | if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { | 3023 | if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { |
2635 | verbose("bpf verifier is misconfigured\n"); | 3024 | verbose("bpf verifier is misconfigured\n"); |
2636 | return -EINVAL; | 3025 | return -EINVAL; |
2637 | } | 3026 | } |
2638 | 3027 | ||
2639 | new_prog = bpf_patch_insn_single(env->prog, i, insn_buf, cnt); | 3028 | new_prog = bpf_patch_insn_single(env->prog, i + delta, insn_buf, |
3029 | cnt); | ||
2640 | if (!new_prog) | 3030 | if (!new_prog) |
2641 | return -ENOMEM; | 3031 | return -ENOMEM; |
2642 | 3032 | ||
2643 | insn_delta = cnt - 1; | 3033 | delta += cnt - 1; |
2644 | 3034 | ||
2645 | /* keep walking new program and skip insns we just inserted */ | 3035 | /* keep walking new program and skip insns we just inserted */ |
2646 | env->prog = new_prog; | 3036 | env->prog = new_prog; |
2647 | insn = new_prog->insnsi + i + insn_delta; | 3037 | insn = new_prog->insnsi + i + delta; |
2648 | |||
2649 | insn_cnt += insn_delta; | ||
2650 | i += insn_delta; | ||
2651 | } | 3038 | } |
2652 | 3039 | ||
2653 | return 0; | 3040 | return 0; |
2654 | } | 3041 | } |
2655 | 3042 | ||
2656 | static void free_states(struct verifier_env *env) | 3043 | static void free_states(struct bpf_verifier_env *env) |
2657 | { | 3044 | { |
2658 | struct verifier_state_list *sl, *sln; | 3045 | struct bpf_verifier_state_list *sl, *sln; |
2659 | int i; | 3046 | int i; |
2660 | 3047 | ||
2661 | if (!env->explored_states) | 3048 | if (!env->explored_states) |
@@ -2678,19 +3065,24 @@ static void free_states(struct verifier_env *env) | |||
2678 | int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) | 3065 | int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) |
2679 | { | 3066 | { |
2680 | char __user *log_ubuf = NULL; | 3067 | char __user *log_ubuf = NULL; |
2681 | struct verifier_env *env; | 3068 | struct bpf_verifier_env *env; |
2682 | int ret = -EINVAL; | 3069 | int ret = -EINVAL; |
2683 | 3070 | ||
2684 | if ((*prog)->len <= 0 || (*prog)->len > BPF_MAXINSNS) | 3071 | if ((*prog)->len <= 0 || (*prog)->len > BPF_MAXINSNS) |
2685 | return -E2BIG; | 3072 | return -E2BIG; |
2686 | 3073 | ||
2687 | /* 'struct verifier_env' can be global, but since it's not small, | 3074 | /* 'struct bpf_verifier_env' can be global, but since it's not small, |
2688 | * allocate/free it every time bpf_check() is called | 3075 | * allocate/free it every time bpf_check() is called |
2689 | */ | 3076 | */ |
2690 | env = kzalloc(sizeof(struct verifier_env), GFP_KERNEL); | 3077 | env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); |
2691 | if (!env) | 3078 | if (!env) |
2692 | return -ENOMEM; | 3079 | return -ENOMEM; |
2693 | 3080 | ||
3081 | env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) * | ||
3082 | (*prog)->len); | ||
3083 | ret = -ENOMEM; | ||
3084 | if (!env->insn_aux_data) | ||
3085 | goto err_free_env; | ||
2694 | env->prog = *prog; | 3086 | env->prog = *prog; |
2695 | 3087 | ||
2696 | /* grab the mutex to protect few globals used by verifier */ | 3088 | /* grab the mutex to protect few globals used by verifier */ |
@@ -2709,12 +3101,12 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) | |||
2709 | /* log_* values have to be sane */ | 3101 | /* log_* values have to be sane */ |
2710 | if (log_size < 128 || log_size > UINT_MAX >> 8 || | 3102 | if (log_size < 128 || log_size > UINT_MAX >> 8 || |
2711 | log_level == 0 || log_ubuf == NULL) | 3103 | log_level == 0 || log_ubuf == NULL) |
2712 | goto free_env; | 3104 | goto err_unlock; |
2713 | 3105 | ||
2714 | ret = -ENOMEM; | 3106 | ret = -ENOMEM; |
2715 | log_buf = vmalloc(log_size); | 3107 | log_buf = vmalloc(log_size); |
2716 | if (!log_buf) | 3108 | if (!log_buf) |
2717 | goto free_env; | 3109 | goto err_unlock; |
2718 | } else { | 3110 | } else { |
2719 | log_level = 0; | 3111 | log_level = 0; |
2720 | } | 3112 | } |
@@ -2724,7 +3116,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) | |||
2724 | goto skip_full_check; | 3116 | goto skip_full_check; |
2725 | 3117 | ||
2726 | env->explored_states = kcalloc(env->prog->len, | 3118 | env->explored_states = kcalloc(env->prog->len, |
2727 | sizeof(struct verifier_state_list *), | 3119 | sizeof(struct bpf_verifier_state_list *), |
2728 | GFP_USER); | 3120 | GFP_USER); |
2729 | ret = -ENOMEM; | 3121 | ret = -ENOMEM; |
2730 | if (!env->explored_states) | 3122 | if (!env->explored_states) |
@@ -2783,14 +3175,67 @@ skip_full_check: | |||
2783 | free_log_buf: | 3175 | free_log_buf: |
2784 | if (log_level) | 3176 | if (log_level) |
2785 | vfree(log_buf); | 3177 | vfree(log_buf); |
2786 | free_env: | ||
2787 | if (!env->prog->aux->used_maps) | 3178 | if (!env->prog->aux->used_maps) |
2788 | /* if we didn't copy map pointers into bpf_prog_info, release | 3179 | /* if we didn't copy map pointers into bpf_prog_info, release |
2789 | * them now. Otherwise free_bpf_prog_info() will release them. | 3180 | * them now. Otherwise free_bpf_prog_info() will release them. |
2790 | */ | 3181 | */ |
2791 | release_maps(env); | 3182 | release_maps(env); |
2792 | *prog = env->prog; | 3183 | *prog = env->prog; |
3184 | err_unlock: | ||
3185 | mutex_unlock(&bpf_verifier_lock); | ||
3186 | vfree(env->insn_aux_data); | ||
3187 | err_free_env: | ||
2793 | kfree(env); | 3188 | kfree(env); |
3189 | return ret; | ||
3190 | } | ||
3191 | |||
3192 | int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, | ||
3193 | void *priv) | ||
3194 | { | ||
3195 | struct bpf_verifier_env *env; | ||
3196 | int ret; | ||
3197 | |||
3198 | env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); | ||
3199 | if (!env) | ||
3200 | return -ENOMEM; | ||
3201 | |||
3202 | env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) * | ||
3203 | prog->len); | ||
3204 | ret = -ENOMEM; | ||
3205 | if (!env->insn_aux_data) | ||
3206 | goto err_free_env; | ||
3207 | env->prog = prog; | ||
3208 | env->analyzer_ops = ops; | ||
3209 | env->analyzer_priv = priv; | ||
3210 | |||
3211 | /* grab the mutex to protect few globals used by verifier */ | ||
3212 | mutex_lock(&bpf_verifier_lock); | ||
3213 | |||
3214 | log_level = 0; | ||
3215 | |||
3216 | env->explored_states = kcalloc(env->prog->len, | ||
3217 | sizeof(struct bpf_verifier_state_list *), | ||
3218 | GFP_KERNEL); | ||
3219 | ret = -ENOMEM; | ||
3220 | if (!env->explored_states) | ||
3221 | goto skip_full_check; | ||
3222 | |||
3223 | ret = check_cfg(env); | ||
3224 | if (ret < 0) | ||
3225 | goto skip_full_check; | ||
3226 | |||
3227 | env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); | ||
3228 | |||
3229 | ret = do_check(env); | ||
3230 | |||
3231 | skip_full_check: | ||
3232 | while (pop_stack(env, NULL) >= 0); | ||
3233 | free_states(env); | ||
3234 | |||
2794 | mutex_unlock(&bpf_verifier_lock); | 3235 | mutex_unlock(&bpf_verifier_lock); |
3236 | vfree(env->insn_aux_data); | ||
3237 | err_free_env: | ||
3238 | kfree(env); | ||
2795 | return ret; | 3239 | return ret; |
2796 | } | 3240 | } |
3241 | EXPORT_SYMBOL_GPL(bpf_analyzer); | ||
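The bulk of the verifier changes above (check_reg_overflow(), adjust_reg_min_max_vals(), the reg_set_min_max*() pair and the PTR_TO_MAP_VALUE_ADJ handling) exist to track min/max bounds on register values across ALU ops and conditional jumps. A minimal sketch of the access pattern this is meant to accept is shown below: a value read from a map element, bounded by a comparison, then used as a variable offset back into the same element. The map layout, names, include paths and the SEC()/bpf_map_def/bpf_map_lookup_elem declarations follow the kernel samples' headers and are illustrative only.

#include <uapi/linux/bpf.h>
#include <uapi/linux/ptrace.h>
#include "bpf_helpers.h"	/* SEC(), bpf_map_def, bpf_map_lookup_elem stub */

struct bpf_map_def SEC("maps") scratch_map = {
	.type		= BPF_MAP_TYPE_ARRAY,
	.key_size	= sizeof(__u32),
	.value_size	= 64,
	.max_entries	= 1,
};

SEC("kprobe/sys_write")
int bounded_map_access(struct pt_regs *ctx)
{
	__u32 key = 0, idx;
	char *value;

	value = bpf_map_lookup_elem(&scratch_map, &key);
	if (!value)
		return 0;

	idx = value[0];		/* UNKNOWN_VALUE: the load resets the range */
	if (idx < 64)		/* the JGE edge narrows max_value to 63 here */
		return value[idx];	/* in-bounds variable offset into the value */

	return 0;
}

Without the bounds tracking the variable offset would have been rejected outright; with it, the fall-through branch carries min_value = 0 and max_value = 63, which fits the 64-byte value_size.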
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d1c51b7f5221..85bc9beb046d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
@@ -64,6 +64,9 @@ | |||
64 | #include <linux/file.h> | 64 | #include <linux/file.h> |
65 | #include <net/sock.h> | 65 | #include <net/sock.h> |
66 | 66 | ||
67 | #define CREATE_TRACE_POINTS | ||
68 | #include <trace/events/cgroup.h> | ||
69 | |||
67 | /* | 70 | /* |
68 | * pidlists linger the following amount before being destroyed. The goal | 71 | * pidlists linger the following amount before being destroyed. The goal |
69 | * is avoiding frequent destruction in the middle of consecutive read calls | 72 | * is avoiding frequent destruction in the middle of consecutive read calls |
@@ -1176,6 +1179,8 @@ static void cgroup_destroy_root(struct cgroup_root *root) | |||
1176 | struct cgroup *cgrp = &root->cgrp; | 1179 | struct cgroup *cgrp = &root->cgrp; |
1177 | struct cgrp_cset_link *link, *tmp_link; | 1180 | struct cgrp_cset_link *link, *tmp_link; |
1178 | 1181 | ||
1182 | trace_cgroup_destroy_root(root); | ||
1183 | |||
1179 | cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); | 1184 | cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); |
1180 | 1185 | ||
1181 | BUG_ON(atomic_read(&root->nr_cgrps)); | 1186 | BUG_ON(atomic_read(&root->nr_cgrps)); |
@@ -1874,6 +1879,9 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) | |||
1874 | strcpy(root->release_agent_path, opts.release_agent); | 1879 | strcpy(root->release_agent_path, opts.release_agent); |
1875 | spin_unlock(&release_agent_path_lock); | 1880 | spin_unlock(&release_agent_path_lock); |
1876 | } | 1881 | } |
1882 | |||
1883 | trace_cgroup_remount(root); | ||
1884 | |||
1877 | out_unlock: | 1885 | out_unlock: |
1878 | kfree(opts.release_agent); | 1886 | kfree(opts.release_agent); |
1879 | kfree(opts.name); | 1887 | kfree(opts.name); |
@@ -2031,6 +2039,8 @@ static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) | |||
2031 | if (ret) | 2039 | if (ret) |
2032 | goto destroy_root; | 2040 | goto destroy_root; |
2033 | 2041 | ||
2042 | trace_cgroup_setup_root(root); | ||
2043 | |||
2034 | /* | 2044 | /* |
2035 | * There must be no failure case after here, since rebinding takes | 2045 | * There must be no failure case after here, since rebinding takes |
2036 | * care of subsystems' refcounts, which are explicitly dropped in | 2046 | * care of subsystems' refcounts, which are explicitly dropped in |
@@ -2315,22 +2325,18 @@ static struct file_system_type cgroup2_fs_type = { | |||
2315 | .fs_flags = FS_USERNS_MOUNT, | 2325 | .fs_flags = FS_USERNS_MOUNT, |
2316 | }; | 2326 | }; |
2317 | 2327 | ||
2318 | static char *cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, | 2328 | static int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, |
2319 | struct cgroup_namespace *ns) | 2329 | struct cgroup_namespace *ns) |
2320 | { | 2330 | { |
2321 | struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root); | 2331 | struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root); |
2322 | int ret; | ||
2323 | 2332 | ||
2324 | ret = kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen); | 2333 | return kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen); |
2325 | if (ret < 0 || ret >= buflen) | ||
2326 | return NULL; | ||
2327 | return buf; | ||
2328 | } | 2334 | } |
2329 | 2335 | ||
2330 | char *cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, | 2336 | int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, |
2331 | struct cgroup_namespace *ns) | 2337 | struct cgroup_namespace *ns) |
2332 | { | 2338 | { |
2333 | char *ret; | 2339 | int ret; |
2334 | 2340 | ||
2335 | mutex_lock(&cgroup_mutex); | 2341 | mutex_lock(&cgroup_mutex); |
2336 | spin_lock_irq(&css_set_lock); | 2342 | spin_lock_irq(&css_set_lock); |
@@ -2357,12 +2363,12 @@ EXPORT_SYMBOL_GPL(cgroup_path_ns); | |||
2357 | * | 2363 | * |
2358 | * Return value is the same as kernfs_path(). | 2364 | * Return value is the same as kernfs_path(). |
2359 | */ | 2365 | */ |
2360 | char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) | 2366 | int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) |
2361 | { | 2367 | { |
2362 | struct cgroup_root *root; | 2368 | struct cgroup_root *root; |
2363 | struct cgroup *cgrp; | 2369 | struct cgroup *cgrp; |
2364 | int hierarchy_id = 1; | 2370 | int hierarchy_id = 1; |
2365 | char *path = NULL; | 2371 | int ret; |
2366 | 2372 | ||
2367 | mutex_lock(&cgroup_mutex); | 2373 | mutex_lock(&cgroup_mutex); |
2368 | spin_lock_irq(&css_set_lock); | 2374 | spin_lock_irq(&css_set_lock); |
@@ -2371,16 +2377,15 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) | |||
2371 | 2377 | ||
2372 | if (root) { | 2378 | if (root) { |
2373 | cgrp = task_cgroup_from_root(task, root); | 2379 | cgrp = task_cgroup_from_root(task, root); |
2374 | path = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns); | 2380 | ret = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns); |
2375 | } else { | 2381 | } else { |
2376 | /* if no hierarchy exists, everyone is in "/" */ | 2382 | /* if no hierarchy exists, everyone is in "/" */ |
2377 | if (strlcpy(buf, "/", buflen) < buflen) | 2383 | ret = strlcpy(buf, "/", buflen); |
2378 | path = buf; | ||
2379 | } | 2384 | } |
2380 | 2385 | ||
2381 | spin_unlock_irq(&css_set_lock); | 2386 | spin_unlock_irq(&css_set_lock); |
2382 | mutex_unlock(&cgroup_mutex); | 2387 | mutex_unlock(&cgroup_mutex); |
2383 | return path; | 2388 | return ret; |
2384 | } | 2389 | } |
2385 | EXPORT_SYMBOL_GPL(task_cgroup_path); | 2390 | EXPORT_SYMBOL_GPL(task_cgroup_path); |
2386 | 2391 | ||
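With cgroup_path_ns() and task_cgroup_path() above converted from returning a char * to returning an int in the style of kernfs_path() (negative on error, otherwise the length of the full path, with truncated output when that length is >= buflen), in-kernel callers now check the return value instead of testing for NULL. A minimal caller sketch under those assumptions; the wrapper function name and buffer size are illustrative, not from the patch.

#include <linux/cgroup.h>
#include <linux/sched.h>
#include <linux/printk.h>

static void report_task_cgroup(struct task_struct *task)
{
	char buf[256];
	int len;

	len = task_cgroup_path(task, buf, sizeof(buf));
	if (len < 0)
		return;				/* lookup failed */
	if (len >= (int)sizeof(buf))
		return;				/* path did not fit, output truncated */
	pr_debug("cgroup path of %s: %s\n", task->comm, buf);
}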
@@ -2830,6 +2835,10 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, | |||
2830 | ret = cgroup_migrate(leader, threadgroup, dst_cgrp->root); | 2835 | ret = cgroup_migrate(leader, threadgroup, dst_cgrp->root); |
2831 | 2836 | ||
2832 | cgroup_migrate_finish(&preloaded_csets); | 2837 | cgroup_migrate_finish(&preloaded_csets); |
2838 | |||
2839 | if (!ret) | ||
2840 | trace_cgroup_attach_task(dst_cgrp, leader, threadgroup); | ||
2841 | |||
2833 | return ret; | 2842 | return ret; |
2834 | } | 2843 | } |
2835 | 2844 | ||
@@ -3446,9 +3455,28 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, | |||
3446 | * Except for the root, subtree_control must be zero for a cgroup | 3455 | * Except for the root, subtree_control must be zero for a cgroup |
3447 | * with tasks so that child cgroups don't compete against tasks. | 3456 | * with tasks so that child cgroups don't compete against tasks. |
3448 | */ | 3457 | */ |
3449 | if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) { | 3458 | if (enable && cgroup_parent(cgrp)) { |
3450 | ret = -EBUSY; | 3459 | struct cgrp_cset_link *link; |
3451 | goto out_unlock; | 3460 | |
3461 | /* | ||
3462 | * Because namespaces pin csets too, @cgrp->cset_links | ||
3463 | * might not be empty even when @cgrp is empty. Walk and | ||
3464 | * verify each cset. | ||
3465 | */ | ||
3466 | spin_lock_irq(&css_set_lock); | ||
3467 | |||
3468 | ret = 0; | ||
3469 | list_for_each_entry(link, &cgrp->cset_links, cset_link) { | ||
3470 | if (css_set_populated(link->cset)) { | ||
3471 | ret = -EBUSY; | ||
3472 | break; | ||
3473 | } | ||
3474 | } | ||
3475 | |||
3476 | spin_unlock_irq(&css_set_lock); | ||
3477 | |||
3478 | if (ret) | ||
3479 | goto out_unlock; | ||
3452 | } | 3480 | } |
3453 | 3481 | ||
3454 | /* save and update control masks and prepare csses */ | 3482 | /* save and update control masks and prepare csses */ |
@@ -3592,6 +3620,8 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, | |||
3592 | mutex_lock(&cgroup_mutex); | 3620 | mutex_lock(&cgroup_mutex); |
3593 | 3621 | ||
3594 | ret = kernfs_rename(kn, new_parent, new_name_str); | 3622 | ret = kernfs_rename(kn, new_parent, new_name_str); |
3623 | if (!ret) | ||
3624 | trace_cgroup_rename(cgrp); | ||
3595 | 3625 | ||
3596 | mutex_unlock(&cgroup_mutex); | 3626 | mutex_unlock(&cgroup_mutex); |
3597 | 3627 | ||
@@ -3899,7 +3929,9 @@ void cgroup_file_notify(struct cgroup_file *cfile) | |||
3899 | * cgroup_task_count - count the number of tasks in a cgroup. | 3929 | * cgroup_task_count - count the number of tasks in a cgroup. |
3900 | * @cgrp: the cgroup in question | 3930 | * @cgrp: the cgroup in question |
3901 | * | 3931 | * |
3902 | * Return the number of tasks in the cgroup. | 3932 | * Return the number of tasks in the cgroup. The returned number can be |
3933 | * higher than the actual number of tasks due to css_set references from | ||
3934 | * namespace roots and temporary usages. | ||
3903 | */ | 3935 | */ |
3904 | static int cgroup_task_count(const struct cgroup *cgrp) | 3936 | static int cgroup_task_count(const struct cgroup *cgrp) |
3905 | { | 3937 | { |
@@ -4360,6 +4392,8 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) | |||
4360 | 4392 | ||
4361 | if (task) { | 4393 | if (task) { |
4362 | ret = cgroup_migrate(task, false, to->root); | 4394 | ret = cgroup_migrate(task, false, to->root); |
4395 | if (!ret) | ||
4396 | trace_cgroup_transfer_tasks(to, task, false); | ||
4363 | put_task_struct(task); | 4397 | put_task_struct(task); |
4364 | } | 4398 | } |
4365 | } while (task && !ret); | 4399 | } while (task && !ret); |
@@ -5025,6 +5059,8 @@ static void css_release_work_fn(struct work_struct *work) | |||
5025 | ss->css_released(css); | 5059 | ss->css_released(css); |
5026 | } else { | 5060 | } else { |
5027 | /* cgroup release path */ | 5061 | /* cgroup release path */ |
5062 | trace_cgroup_release(cgrp); | ||
5063 | |||
5028 | cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); | 5064 | cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); |
5029 | cgrp->id = -1; | 5065 | cgrp->id = -1; |
5030 | 5066 | ||
@@ -5311,6 +5347,8 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, | |||
5311 | if (ret) | 5347 | if (ret) |
5312 | goto out_destroy; | 5348 | goto out_destroy; |
5313 | 5349 | ||
5350 | trace_cgroup_mkdir(cgrp); | ||
5351 | |||
5314 | /* let's create and online css's */ | 5352 | /* let's create and online css's */ |
5315 | kernfs_activate(kn); | 5353 | kernfs_activate(kn); |
5316 | 5354 | ||
@@ -5486,6 +5524,9 @@ static int cgroup_rmdir(struct kernfs_node *kn) | |||
5486 | 5524 | ||
5487 | ret = cgroup_destroy_locked(cgrp); | 5525 | ret = cgroup_destroy_locked(cgrp); |
5488 | 5526 | ||
5527 | if (!ret) | ||
5528 | trace_cgroup_rmdir(cgrp); | ||
5529 | |||
5489 | cgroup_kn_unlock(kn); | 5530 | cgroup_kn_unlock(kn); |
5490 | return ret; | 5531 | return ret; |
5491 | } | 5532 | } |
@@ -5606,6 +5647,12 @@ int __init cgroup_init(void) | |||
5606 | BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); | 5647 | BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); |
5607 | BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); | 5648 | BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); |
5608 | 5649 | ||
5650 | /* | ||
5651 | * The latency of the synchronize_sched() is too high for cgroups, | ||
5652 | * avoid it at the cost of forcing all readers into the slow path. | ||
5653 | */ | ||
5654 | rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss); | ||
5655 | |||
5609 | get_user_ns(init_cgroup_ns.user_ns); | 5656 | get_user_ns(init_cgroup_ns.user_ns); |
5610 | 5657 | ||
5611 | mutex_lock(&cgroup_mutex); | 5658 | mutex_lock(&cgroup_mutex); |
@@ -5716,7 +5763,7 @@ core_initcall(cgroup_wq_init); | |||
5716 | int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, | 5763 | int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, |
5717 | struct pid *pid, struct task_struct *tsk) | 5764 | struct pid *pid, struct task_struct *tsk) |
5718 | { | 5765 | { |
5719 | char *buf, *path; | 5766 | char *buf; |
5720 | int retval; | 5767 | int retval; |
5721 | struct cgroup_root *root; | 5768 | struct cgroup_root *root; |
5722 | 5769 | ||
@@ -5759,18 +5806,18 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, | |||
5759 | * " (deleted)" is appended to the cgroup path. | 5806 | * " (deleted)" is appended to the cgroup path. |
5760 | */ | 5807 | */ |
5761 | if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) { | 5808 | if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) { |
5762 | path = cgroup_path_ns_locked(cgrp, buf, PATH_MAX, | 5809 | retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX, |
5763 | current->nsproxy->cgroup_ns); | 5810 | current->nsproxy->cgroup_ns); |
5764 | if (!path) { | 5811 | if (retval >= PATH_MAX) |
5765 | retval = -ENAMETOOLONG; | 5812 | retval = -ENAMETOOLONG; |
5813 | if (retval < 0) | ||
5766 | goto out_unlock; | 5814 | goto out_unlock; |
5767 | } | 5815 | |
5816 | seq_puts(m, buf); | ||
5768 | } else { | 5817 | } else { |
5769 | path = "/"; | 5818 | seq_puts(m, "/"); |
5770 | } | 5819 | } |
5771 | 5820 | ||
5772 | seq_puts(m, path); | ||
5773 | |||
5774 | if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp)) | 5821 | if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp)) |
5775 | seq_puts(m, " (deleted)\n"); | 5822 | seq_puts(m, " (deleted)\n"); |
5776 | else | 5823 | else |
@@ -6035,8 +6082,9 @@ static void cgroup_release_agent(struct work_struct *work) | |||
6035 | { | 6082 | { |
6036 | struct cgroup *cgrp = | 6083 | struct cgroup *cgrp = |
6037 | container_of(work, struct cgroup, release_agent_work); | 6084 | container_of(work, struct cgroup, release_agent_work); |
6038 | char *pathbuf = NULL, *agentbuf = NULL, *path; | 6085 | char *pathbuf = NULL, *agentbuf = NULL; |
6039 | char *argv[3], *envp[3]; | 6086 | char *argv[3], *envp[3]; |
6087 | int ret; | ||
6040 | 6088 | ||
6041 | mutex_lock(&cgroup_mutex); | 6089 | mutex_lock(&cgroup_mutex); |
6042 | 6090 | ||
@@ -6046,13 +6094,13 @@ static void cgroup_release_agent(struct work_struct *work) | |||
6046 | goto out; | 6094 | goto out; |
6047 | 6095 | ||
6048 | spin_lock_irq(&css_set_lock); | 6096 | spin_lock_irq(&css_set_lock); |
6049 | path = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); | 6097 | ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); |
6050 | spin_unlock_irq(&css_set_lock); | 6098 | spin_unlock_irq(&css_set_lock); |
6051 | if (!path) | 6099 | if (ret < 0 || ret >= PATH_MAX) |
6052 | goto out; | 6100 | goto out; |
6053 | 6101 | ||
6054 | argv[0] = agentbuf; | 6102 | argv[0] = agentbuf; |
6055 | argv[1] = path; | 6103 | argv[1] = pathbuf; |
6056 | argv[2] = NULL; | 6104 | argv[2] = NULL; |
6057 | 6105 | ||
6058 | /* minimal command environment */ | 6106 | /* minimal command environment */ |
@@ -6270,6 +6318,12 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd) | |||
6270 | if (cgroup_sk_alloc_disabled) | 6318 | if (cgroup_sk_alloc_disabled) |
6271 | return; | 6319 | return; |
6272 | 6320 | ||
6321 | /* Socket clone path */ | ||
6322 | if (skcd->val) { | ||
6323 | cgroup_get(sock_cgroup_ptr(skcd)); | ||
6324 | return; | ||
6325 | } | ||
6326 | |||
6273 | rcu_read_lock(); | 6327 | rcu_read_lock(); |
6274 | 6328 | ||
6275 | while (true) { | 6329 | while (true) { |
@@ -6295,6 +6349,16 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd) | |||
6295 | 6349 | ||
6296 | /* cgroup namespaces */ | 6350 | /* cgroup namespaces */ |
6297 | 6351 | ||
6352 | static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns) | ||
6353 | { | ||
6354 | return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES); | ||
6355 | } | ||
6356 | |||
6357 | static void dec_cgroup_namespaces(struct ucounts *ucounts) | ||
6358 | { | ||
6359 | dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES); | ||
6360 | } | ||
6361 | |||
6298 | static struct cgroup_namespace *alloc_cgroup_ns(void) | 6362 | static struct cgroup_namespace *alloc_cgroup_ns(void) |
6299 | { | 6363 | { |
6300 | struct cgroup_namespace *new_ns; | 6364 | struct cgroup_namespace *new_ns; |
@@ -6316,6 +6380,7 @@ static struct cgroup_namespace *alloc_cgroup_ns(void) | |||
6316 | void free_cgroup_ns(struct cgroup_namespace *ns) | 6380 | void free_cgroup_ns(struct cgroup_namespace *ns) |
6317 | { | 6381 | { |
6318 | put_css_set(ns->root_cset); | 6382 | put_css_set(ns->root_cset); |
6383 | dec_cgroup_namespaces(ns->ucounts); | ||
6319 | put_user_ns(ns->user_ns); | 6384 | put_user_ns(ns->user_ns); |
6320 | ns_free_inum(&ns->ns); | 6385 | ns_free_inum(&ns->ns); |
6321 | kfree(ns); | 6386 | kfree(ns); |
@@ -6327,6 +6392,7 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, | |||
6327 | struct cgroup_namespace *old_ns) | 6392 | struct cgroup_namespace *old_ns) |
6328 | { | 6393 | { |
6329 | struct cgroup_namespace *new_ns; | 6394 | struct cgroup_namespace *new_ns; |
6395 | struct ucounts *ucounts; | ||
6330 | struct css_set *cset; | 6396 | struct css_set *cset; |
6331 | 6397 | ||
6332 | BUG_ON(!old_ns); | 6398 | BUG_ON(!old_ns); |
@@ -6340,6 +6406,10 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, | |||
6340 | if (!ns_capable(user_ns, CAP_SYS_ADMIN)) | 6406 | if (!ns_capable(user_ns, CAP_SYS_ADMIN)) |
6341 | return ERR_PTR(-EPERM); | 6407 | return ERR_PTR(-EPERM); |
6342 | 6408 | ||
6409 | ucounts = inc_cgroup_namespaces(user_ns); | ||
6410 | if (!ucounts) | ||
6411 | return ERR_PTR(-ENOSPC); | ||
6412 | |||
6343 | /* It is not safe to take cgroup_mutex here */ | 6413 | /* It is not safe to take cgroup_mutex here */ |
6344 | spin_lock_irq(&css_set_lock); | 6414 | spin_lock_irq(&css_set_lock); |
6345 | cset = task_css_set(current); | 6415 | cset = task_css_set(current); |
@@ -6349,10 +6419,12 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, | |||
6349 | new_ns = alloc_cgroup_ns(); | 6419 | new_ns = alloc_cgroup_ns(); |
6350 | if (IS_ERR(new_ns)) { | 6420 | if (IS_ERR(new_ns)) { |
6351 | put_css_set(cset); | 6421 | put_css_set(cset); |
6422 | dec_cgroup_namespaces(ucounts); | ||
6352 | return new_ns; | 6423 | return new_ns; |
6353 | } | 6424 | } |
6354 | 6425 | ||
6355 | new_ns->user_ns = get_user_ns(user_ns); | 6426 | new_ns->user_ns = get_user_ns(user_ns); |
6427 | new_ns->ucounts = ucounts; | ||
6356 | new_ns->root_cset = cset; | 6428 | new_ns->root_cset = cset; |
6357 | 6429 | ||
6358 | return new_ns; | 6430 | return new_ns; |
@@ -6403,12 +6475,18 @@ static void cgroupns_put(struct ns_common *ns) | |||
6403 | put_cgroup_ns(to_cg_ns(ns)); | 6475 | put_cgroup_ns(to_cg_ns(ns)); |
6404 | } | 6476 | } |
6405 | 6477 | ||
6478 | static struct user_namespace *cgroupns_owner(struct ns_common *ns) | ||
6479 | { | ||
6480 | return to_cg_ns(ns)->user_ns; | ||
6481 | } | ||
6482 | |||
6406 | const struct proc_ns_operations cgroupns_operations = { | 6483 | const struct proc_ns_operations cgroupns_operations = { |
6407 | .name = "cgroup", | 6484 | .name = "cgroup", |
6408 | .type = CLONE_NEWCGROUP, | 6485 | .type = CLONE_NEWCGROUP, |
6409 | .get = cgroupns_get, | 6486 | .get = cgroupns_get, |
6410 | .put = cgroupns_put, | 6487 | .put = cgroupns_put, |
6411 | .install = cgroupns_install, | 6488 | .install = cgroupns_install, |
6489 | .owner = cgroupns_owner, | ||
6412 | }; | 6490 | }; |
6413 | 6491 | ||
6414 | static __init int cgroup_namespaces_init(void) | 6492 | static __init int cgroup_namespaces_init(void) |
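Taken together, the inc_cgroup_namespaces()/dec_cgroup_namespaces() hunks above make cgroup namespaces a counted per-user resource: copy_cgroup_ns() charges UCOUNT_CGROUP_NAMESPACES against the owning user namespace and returns -ENOSPC when the limit is exhausted, and free_cgroup_ns() drops the charge. From userspace this surfaces as unshare(CLONE_NEWCGROUP) failing with ENOSPC; a minimal probe is sketched below (the /proc/sys/user/max_cgroup_namespaces knob named in the comment belongs to the wider ucount series and is an assumption here, not something this diff shows):

/* Userspace sketch: exercise the per-user cgroup namespace limit.
 * Run as root (CLONE_NEWCGROUP needs CAP_SYS_ADMIN); the limit is assumed
 * to be tunable via /proc/sys/user/max_cgroup_namespaces.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <errno.h>
#include <stdio.h>

int main(void)
{
	if (unshare(CLONE_NEWCGROUP) == 0)
		puts("new cgroup namespace created");
	else if (errno == ENOSPC)
		puts("per-user cgroup namespace limit reached");
	else
		perror("unshare(CLONE_NEWCGROUP)");
	return 0;
}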
diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config index 9f748ed7bea8..1a8f34f63601 100644 --- a/kernel/configs/android-base.config +++ b/kernel/configs/android-base.config | |||
@@ -11,7 +11,6 @@ CONFIG_ANDROID_LOW_MEMORY_KILLER=y | |||
11 | CONFIG_ARMV8_DEPRECATED=y | 11 | CONFIG_ARMV8_DEPRECATED=y |
12 | CONFIG_ASHMEM=y | 12 | CONFIG_ASHMEM=y |
13 | CONFIG_AUDIT=y | 13 | CONFIG_AUDIT=y |
14 | CONFIG_BLK_DEV_DM=y | ||
15 | CONFIG_BLK_DEV_INITRD=y | 14 | CONFIG_BLK_DEV_INITRD=y |
16 | CONFIG_CGROUPS=y | 15 | CONFIG_CGROUPS=y |
17 | CONFIG_CGROUP_CPUACCT=y | 16 | CONFIG_CGROUP_CPUACCT=y |
@@ -19,9 +18,7 @@ CONFIG_CGROUP_DEBUG=y | |||
19 | CONFIG_CGROUP_FREEZER=y | 18 | CONFIG_CGROUP_FREEZER=y |
20 | CONFIG_CGROUP_SCHED=y | 19 | CONFIG_CGROUP_SCHED=y |
21 | CONFIG_CP15_BARRIER_EMULATION=y | 20 | CONFIG_CP15_BARRIER_EMULATION=y |
22 | CONFIG_DM_CRYPT=y | 21 | CONFIG_DEFAULT_SECURITY_SELINUX=y |
23 | CONFIG_DM_VERITY=y | ||
24 | CONFIG_DM_VERITY_FEC=y | ||
25 | CONFIG_EMBEDDED=y | 22 | CONFIG_EMBEDDED=y |
26 | CONFIG_FB=y | 23 | CONFIG_FB=y |
27 | CONFIG_HIGH_RES_TIMERS=y | 24 | CONFIG_HIGH_RES_TIMERS=y |
@@ -41,7 +38,6 @@ CONFIG_IPV6=y | |||
41 | CONFIG_IPV6_MIP6=y | 38 | CONFIG_IPV6_MIP6=y |
42 | CONFIG_IPV6_MULTIPLE_TABLES=y | 39 | CONFIG_IPV6_MULTIPLE_TABLES=y |
43 | CONFIG_IPV6_OPTIMISTIC_DAD=y | 40 | CONFIG_IPV6_OPTIMISTIC_DAD=y |
44 | CONFIG_IPV6_PRIVACY=y | ||
45 | CONFIG_IPV6_ROUTER_PREF=y | 41 | CONFIG_IPV6_ROUTER_PREF=y |
46 | CONFIG_IPV6_ROUTE_INFO=y | 42 | CONFIG_IPV6_ROUTE_INFO=y |
47 | CONFIG_IP_ADVANCED_ROUTER=y | 43 | CONFIG_IP_ADVANCED_ROUTER=y |
@@ -135,6 +131,7 @@ CONFIG_PREEMPT=y | |||
135 | CONFIG_QUOTA=y | 131 | CONFIG_QUOTA=y |
136 | CONFIG_RTC_CLASS=y | 132 | CONFIG_RTC_CLASS=y |
137 | CONFIG_RT_GROUP_SCHED=y | 133 | CONFIG_RT_GROUP_SCHED=y |
134 | CONFIG_SECCOMP=y | ||
138 | CONFIG_SECURITY=y | 135 | CONFIG_SECURITY=y |
139 | CONFIG_SECURITY_NETWORK=y | 136 | CONFIG_SECURITY_NETWORK=y |
140 | CONFIG_SECURITY_SELINUX=y | 137 | CONFIG_SECURITY_SELINUX=y |
diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config index e3b953e966d2..297756be369c 100644 --- a/kernel/configs/android-recommended.config +++ b/kernel/configs/android-recommended.config | |||
@@ -6,12 +6,16 @@ | |||
6 | # CONFIG_PM_WAKELOCKS_GC is not set | 6 | # CONFIG_PM_WAKELOCKS_GC is not set |
7 | # CONFIG_VT is not set | 7 | # CONFIG_VT is not set |
8 | CONFIG_BACKLIGHT_LCD_SUPPORT=y | 8 | CONFIG_BACKLIGHT_LCD_SUPPORT=y |
9 | CONFIG_BLK_DEV_DM=y | ||
9 | CONFIG_BLK_DEV_LOOP=y | 10 | CONFIG_BLK_DEV_LOOP=y |
10 | CONFIG_BLK_DEV_RAM=y | 11 | CONFIG_BLK_DEV_RAM=y |
11 | CONFIG_BLK_DEV_RAM_SIZE=8192 | 12 | CONFIG_BLK_DEV_RAM_SIZE=8192 |
12 | CONFIG_COMPACTION=y | 13 | CONFIG_COMPACTION=y |
13 | CONFIG_DEBUG_RODATA=y | 14 | CONFIG_DEBUG_RODATA=y |
15 | CONFIG_DM_CRYPT=y | ||
14 | CONFIG_DM_UEVENT=y | 16 | CONFIG_DM_UEVENT=y |
17 | CONFIG_DM_VERITY=y | ||
18 | CONFIG_DM_VERITY_FEC=y | ||
15 | CONFIG_DRAGONRISE_FF=y | 19 | CONFIG_DRAGONRISE_FF=y |
16 | CONFIG_ENABLE_DEFAULT_TRACERS=y | 20 | CONFIG_ENABLE_DEFAULT_TRACERS=y |
17 | CONFIG_EXT4_FS=y | 21 | CONFIG_EXT4_FS=y |
diff --git a/kernel/configs/kvm_guest.config b/kernel/configs/kvm_guest.config new file mode 100644 index 000000000000..8d9643767142 --- /dev/null +++ b/kernel/configs/kvm_guest.config | |||
@@ -0,0 +1,32 @@ | |||
1 | CONFIG_NET=y | ||
2 | CONFIG_NET_CORE=y | ||
3 | CONFIG_NETDEVICES=y | ||
4 | CONFIG_BLOCK=y | ||
5 | CONFIG_BLK_DEV=y | ||
6 | CONFIG_NETWORK_FILESYSTEMS=y | ||
7 | CONFIG_INET=y | ||
8 | CONFIG_TTY=y | ||
9 | CONFIG_SERIAL_8250=y | ||
10 | CONFIG_SERIAL_8250_CONSOLE=y | ||
11 | CONFIG_IP_PNP=y | ||
12 | CONFIG_IP_PNP_DHCP=y | ||
13 | CONFIG_BINFMT_ELF=y | ||
14 | CONFIG_PCI=y | ||
15 | CONFIG_PCI_MSI=y | ||
16 | CONFIG_DEBUG_KERNEL=y | ||
17 | CONFIG_VIRTUALIZATION=y | ||
18 | CONFIG_HYPERVISOR_GUEST=y | ||
19 | CONFIG_PARAVIRT=y | ||
20 | CONFIG_KVM_GUEST=y | ||
21 | CONFIG_VIRTIO=y | ||
22 | CONFIG_VIRTIO_PCI=y | ||
23 | CONFIG_VIRTIO_BLK=y | ||
24 | CONFIG_VIRTIO_CONSOLE=y | ||
25 | CONFIG_VIRTIO_NET=y | ||
26 | CONFIG_9P_FS=y | ||
27 | CONFIG_NET_9P=y | ||
28 | CONFIG_NET_9P_VIRTIO=y | ||
29 | CONFIG_SCSI_LOWLEVEL=y | ||
30 | CONFIG_SCSI_VIRTIO=y | ||
31 | CONFIG_VIRTIO_INPUT=y | ||
32 | CONFIG_DRM_VIRTIO_GPU=y | ||
diff --git a/kernel/cpu.c b/kernel/cpu.c index 341bf80f80bd..29de1a9352c0 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
@@ -23,6 +23,8 @@ | |||
23 | #include <linux/tick.h> | 23 | #include <linux/tick.h> |
24 | #include <linux/irq.h> | 24 | #include <linux/irq.h> |
25 | #include <linux/smpboot.h> | 25 | #include <linux/smpboot.h> |
26 | #include <linux/relay.h> | ||
27 | #include <linux/slab.h> | ||
26 | 28 | ||
27 | #include <trace/events/power.h> | 29 | #include <trace/events/power.h> |
28 | #define CREATE_TRACE_POINTS | 30 | #define CREATE_TRACE_POINTS |
@@ -37,8 +39,9 @@ | |||
37 | * @thread: Pointer to the hotplug thread | 39 | * @thread: Pointer to the hotplug thread |
38 | * @should_run: Thread should execute | 40 | * @should_run: Thread should execute |
39 | * @rollback: Perform a rollback | 41 | * @rollback: Perform a rollback |
40 | * @cb_stat: The state for a single callback (install/uninstall) | 42 | * @single: Single callback invocation |
41 | * @cb: Single callback function (install/uninstall) | 43 | * @bringup: Single callback bringup or teardown selector |
44 | * @cb_state: The state for a single callback (install/uninstall) | ||
42 | * @result: Result of the operation | 45 | * @result: Result of the operation |
43 | * @done: Signal completion to the issuer of the task | 46 | * @done: Signal completion to the issuer of the task |
44 | */ | 47 | */ |
@@ -49,8 +52,10 @@ struct cpuhp_cpu_state { | |||
49 | struct task_struct *thread; | 52 | struct task_struct *thread; |
50 | bool should_run; | 53 | bool should_run; |
51 | bool rollback; | 54 | bool rollback; |
55 | bool single; | ||
56 | bool bringup; | ||
57 | struct hlist_node *node; | ||
52 | enum cpuhp_state cb_state; | 58 | enum cpuhp_state cb_state; |
53 | int (*cb)(unsigned int cpu); | ||
54 | int result; | 59 | int result; |
55 | struct completion done; | 60 | struct completion done; |
56 | #endif | 61 | #endif |
@@ -68,35 +73,103 @@ static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state); | |||
68 | * @cant_stop: Bringup/teardown can't be stopped at this step | 73 | * @cant_stop: Bringup/teardown can't be stopped at this step |
69 | */ | 74 | */ |
70 | struct cpuhp_step { | 75 | struct cpuhp_step { |
71 | const char *name; | 76 | const char *name; |
72 | int (*startup)(unsigned int cpu); | 77 | union { |
73 | int (*teardown)(unsigned int cpu); | 78 | int (*single)(unsigned int cpu); |
74 | bool skip_onerr; | 79 | int (*multi)(unsigned int cpu, |
75 | bool cant_stop; | 80 | struct hlist_node *node); |
81 | } startup; | ||
82 | union { | ||
83 | int (*single)(unsigned int cpu); | ||
84 | int (*multi)(unsigned int cpu, | ||
85 | struct hlist_node *node); | ||
86 | } teardown; | ||
87 | struct hlist_head list; | ||
88 | bool skip_onerr; | ||
89 | bool cant_stop; | ||
90 | bool multi_instance; | ||
76 | }; | 91 | }; |
77 | 92 | ||
78 | static DEFINE_MUTEX(cpuhp_state_mutex); | 93 | static DEFINE_MUTEX(cpuhp_state_mutex); |
79 | static struct cpuhp_step cpuhp_bp_states[]; | 94 | static struct cpuhp_step cpuhp_bp_states[]; |
80 | static struct cpuhp_step cpuhp_ap_states[]; | 95 | static struct cpuhp_step cpuhp_ap_states[]; |
81 | 96 | ||
97 | static bool cpuhp_is_ap_state(enum cpuhp_state state) | ||
98 | { | ||
99 | /* | ||
100 | * The extra check for CPUHP_TEARDOWN_CPU is only for documentation | ||
101 | * purposes as that state is handled explicitly in cpu_down. | ||
102 | */ | ||
103 | return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU; | ||
104 | } | ||
105 | |||
106 | static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) | ||
107 | { | ||
108 | struct cpuhp_step *sp; | ||
109 | |||
110 | sp = cpuhp_is_ap_state(state) ? cpuhp_ap_states : cpuhp_bp_states; | ||
111 | return sp + state; | ||
112 | } | ||
113 | |||
82 | /** | 114 | /** |
83 | * cpuhp_invoke_callback _ Invoke the callbacks for a given state | 115 | * cpuhp_invoke_callback _ Invoke the callbacks for a given state |
84 | * @cpu: The cpu for which the callback should be invoked | 116 | * @cpu: The cpu for which the callback should be invoked |
85 | * @step: The step in the state machine | 117 | * @step: The step in the state machine |
86 | * @cb: The callback function to invoke | 118 | * @bringup: True if the bringup callback should be invoked |
87 | * | 119 | * |
88 | * Called from cpu hotplug and from the state register machinery | 120 | * Called from cpu hotplug and from the state register machinery. |
89 | */ | 121 | */ |
90 | static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state step, | 122 | static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state, |
91 | int (*cb)(unsigned int)) | 123 | bool bringup, struct hlist_node *node) |
92 | { | 124 | { |
93 | struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); | 125 | struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); |
94 | int ret = 0; | 126 | struct cpuhp_step *step = cpuhp_get_step(state); |
95 | 127 | int (*cbm)(unsigned int cpu, struct hlist_node *node); | |
96 | if (cb) { | 128 | int (*cb)(unsigned int cpu); |
97 | trace_cpuhp_enter(cpu, st->target, step, cb); | 129 | int ret, cnt; |
130 | |||
131 | if (!step->multi_instance) { | ||
132 | cb = bringup ? step->startup.single : step->teardown.single; | ||
133 | if (!cb) | ||
134 | return 0; | ||
135 | trace_cpuhp_enter(cpu, st->target, state, cb); | ||
98 | ret = cb(cpu); | 136 | ret = cb(cpu); |
99 | trace_cpuhp_exit(cpu, st->state, step, ret); | 137 | trace_cpuhp_exit(cpu, st->state, state, ret); |
138 | return ret; | ||
139 | } | ||
140 | cbm = bringup ? step->startup.multi : step->teardown.multi; | ||
141 | if (!cbm) | ||
142 | return 0; | ||
143 | |||
144 | /* Single invocation for instance add/remove */ | ||
145 | if (node) { | ||
146 | trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node); | ||
147 | ret = cbm(cpu, node); | ||
148 | trace_cpuhp_exit(cpu, st->state, state, ret); | ||
149 | return ret; | ||
150 | } | ||
151 | |||
152 | /* State transition. Invoke on all instances */ | ||
153 | cnt = 0; | ||
154 | hlist_for_each(node, &step->list) { | ||
155 | trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node); | ||
156 | ret = cbm(cpu, node); | ||
157 | trace_cpuhp_exit(cpu, st->state, state, ret); | ||
158 | if (ret) | ||
159 | goto err; | ||
160 | cnt++; | ||
161 | } | ||
162 | return 0; | ||
163 | err: | ||
164 | /* Rollback the instances if one failed */ | ||
165 | cbm = !bringup ? step->startup.multi : step->teardown.multi; | ||
166 | if (!cbm) | ||
167 | return ret; | ||
168 | |||
169 | hlist_for_each(node, &step->list) { | ||
170 | if (!cnt--) | ||
171 | break; | ||
172 | cbm(cpu, node); | ||
100 | } | 173 | } |
101 | return ret; | 174 | return ret; |
102 | } | 175 | } |
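cpuhp_invoke_callback() above now handles two shapes of callback: the classic per-state single callback, and, for multi_instance states, a per-instance callback that also receives the instance's struct hlist_node; a full state transition walks every registered instance and unwinds the already-completed ones if a later one fails. Callbacks written against the multi-instance form therefore look roughly like the following (the my_dev structure and names are invented for illustration):

#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/printk.h>

/* Hypothetical per-device instance; the hlist_node is what the cpuhp core
 * hands back to the callbacks, so container_of() recovers the device. */
struct my_dev {
	struct hlist_node node;		/* linked into the cpuhp step's list */
	/* ... per-device state ... */
};

static int my_dev_cpu_online(unsigned int cpu, struct hlist_node *node)
{
	struct my_dev *dev = container_of(node, struct my_dev, node);

	/* set up this device's per-CPU resources for @cpu */
	pr_debug("my_dev %p: cpu%u online\n", dev, cpu);
	return 0;
}

static int my_dev_cpu_dead(unsigned int cpu, struct hlist_node *node)
{
	struct my_dev *dev = container_of(node, struct my_dev, node);

	/* undo my_dev_cpu_online() for @cpu; must not fail during rollback */
	pr_debug("my_dev %p: cpu%u dead\n", dev, cpu);
	return 0;
}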
@@ -155,7 +228,7 @@ static struct { | |||
155 | .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq), | 228 | .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq), |
156 | .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock), | 229 | .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock), |
157 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 230 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
158 | .dep_map = {.name = "cpu_hotplug.lock" }, | 231 | .dep_map = STATIC_LOCKDEP_MAP_INIT("cpu_hotplug.dep_map", &cpu_hotplug.dep_map), |
159 | #endif | 232 | #endif |
160 | }; | 233 | }; |
161 | 234 | ||
@@ -260,10 +333,17 @@ void cpu_hotplug_disable(void) | |||
260 | } | 333 | } |
261 | EXPORT_SYMBOL_GPL(cpu_hotplug_disable); | 334 | EXPORT_SYMBOL_GPL(cpu_hotplug_disable); |
262 | 335 | ||
336 | static void __cpu_hotplug_enable(void) | ||
337 | { | ||
338 | if (WARN_ONCE(!cpu_hotplug_disabled, "Unbalanced cpu hotplug enable\n")) | ||
339 | return; | ||
340 | cpu_hotplug_disabled--; | ||
341 | } | ||
342 | |||
263 | void cpu_hotplug_enable(void) | 343 | void cpu_hotplug_enable(void) |
264 | { | 344 | { |
265 | cpu_maps_update_begin(); | 345 | cpu_maps_update_begin(); |
266 | WARN_ON(--cpu_hotplug_disabled < 0); | 346 | __cpu_hotplug_enable(); |
267 | cpu_maps_update_done(); | 347 | cpu_maps_update_done(); |
268 | } | 348 | } |
269 | EXPORT_SYMBOL_GPL(cpu_hotplug_enable); | 349 | EXPORT_SYMBOL_GPL(cpu_hotplug_enable); |
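The new __cpu_hotplug_enable() helper lets both cpu_hotplug_enable() here and enable_nonboot_cpus() further down warn once and bail out on an unbalanced enable instead of driving cpu_hotplug_disabled negative. Callers are still expected to pair the calls strictly; a minimal sketch (the function name is illustrative):

#include <linux/cpu.h>

static void my_work_without_hotplug(void)
{
	cpu_hotplug_disable();	/* keep cpu_up()/cpu_down() from racing with us */
	/* ... operate on the currently online CPUs ... */
	cpu_hotplug_enable();	/* exactly one enable per disable; extras now WARN once */
}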
@@ -330,12 +410,6 @@ static int notify_online(unsigned int cpu) | |||
330 | return 0; | 410 | return 0; |
331 | } | 411 | } |
332 | 412 | ||
333 | static int notify_starting(unsigned int cpu) | ||
334 | { | ||
335 | cpu_notify(CPU_STARTING, cpu); | ||
336 | return 0; | ||
337 | } | ||
338 | |||
339 | static int bringup_wait_for_ap(unsigned int cpu) | 413 | static int bringup_wait_for_ap(unsigned int cpu) |
340 | { | 414 | { |
341 | struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); | 415 | struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); |
@@ -349,8 +423,16 @@ static int bringup_cpu(unsigned int cpu) | |||
349 | struct task_struct *idle = idle_thread_get(cpu); | 423 | struct task_struct *idle = idle_thread_get(cpu); |
350 | int ret; | 424 | int ret; |
351 | 425 | ||
426 | /* | ||
427 | * Some architectures have to walk the irq descriptors to | ||
428 | * setup the vector space for the cpu which comes online. | ||
429 | * Prevent irq alloc/free across the bringup. | ||
430 | */ | ||
431 | irq_lock_sparse(); | ||
432 | |||
352 | /* Arch-specific enabling code. */ | 433 | /* Arch-specific enabling code. */ |
353 | ret = __cpu_up(cpu, idle); | 434 | ret = __cpu_up(cpu, idle); |
435 | irq_unlock_sparse(); | ||
354 | if (ret) { | 436 | if (ret) { |
355 | cpu_notify(CPU_UP_CANCELED, cpu); | 437 | cpu_notify(CPU_UP_CANCELED, cpu); |
356 | return ret; | 438 | return ret; |
@@ -363,62 +445,55 @@ static int bringup_cpu(unsigned int cpu) | |||
363 | /* | 445 | /* |
364 | * Hotplug state machine related functions | 446 | * Hotplug state machine related functions |
365 | */ | 447 | */ |
366 | static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st, | 448 | static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st) |
367 | struct cpuhp_step *steps) | ||
368 | { | 449 | { |
369 | for (st->state++; st->state < st->target; st->state++) { | 450 | for (st->state++; st->state < st->target; st->state++) { |
370 | struct cpuhp_step *step = steps + st->state; | 451 | struct cpuhp_step *step = cpuhp_get_step(st->state); |
371 | 452 | ||
372 | if (!step->skip_onerr) | 453 | if (!step->skip_onerr) |
373 | cpuhp_invoke_callback(cpu, st->state, step->startup); | 454 | cpuhp_invoke_callback(cpu, st->state, true, NULL); |
374 | } | 455 | } |
375 | } | 456 | } |
376 | 457 | ||
377 | static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, | 458 | static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, |
378 | struct cpuhp_step *steps, enum cpuhp_state target) | 459 | enum cpuhp_state target) |
379 | { | 460 | { |
380 | enum cpuhp_state prev_state = st->state; | 461 | enum cpuhp_state prev_state = st->state; |
381 | int ret = 0; | 462 | int ret = 0; |
382 | 463 | ||
383 | for (; st->state > target; st->state--) { | 464 | for (; st->state > target; st->state--) { |
384 | struct cpuhp_step *step = steps + st->state; | 465 | ret = cpuhp_invoke_callback(cpu, st->state, false, NULL); |
385 | |||
386 | ret = cpuhp_invoke_callback(cpu, st->state, step->teardown); | ||
387 | if (ret) { | 466 | if (ret) { |
388 | st->target = prev_state; | 467 | st->target = prev_state; |
389 | undo_cpu_down(cpu, st, steps); | 468 | undo_cpu_down(cpu, st); |
390 | break; | 469 | break; |
391 | } | 470 | } |
392 | } | 471 | } |
393 | return ret; | 472 | return ret; |
394 | } | 473 | } |
395 | 474 | ||
396 | static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st, | 475 | static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st) |
397 | struct cpuhp_step *steps) | ||
398 | { | 476 | { |
399 | for (st->state--; st->state > st->target; st->state--) { | 477 | for (st->state--; st->state > st->target; st->state--) { |
400 | struct cpuhp_step *step = steps + st->state; | 478 | struct cpuhp_step *step = cpuhp_get_step(st->state); |
401 | 479 | ||
402 | if (!step->skip_onerr) | 480 | if (!step->skip_onerr) |
403 | cpuhp_invoke_callback(cpu, st->state, step->teardown); | 481 | cpuhp_invoke_callback(cpu, st->state, false, NULL); |
404 | } | 482 | } |
405 | } | 483 | } |
406 | 484 | ||
407 | static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, | 485 | static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, |
408 | struct cpuhp_step *steps, enum cpuhp_state target) | 486 | enum cpuhp_state target) |
409 | { | 487 | { |
410 | enum cpuhp_state prev_state = st->state; | 488 | enum cpuhp_state prev_state = st->state; |
411 | int ret = 0; | 489 | int ret = 0; |
412 | 490 | ||
413 | while (st->state < target) { | 491 | while (st->state < target) { |
414 | struct cpuhp_step *step; | ||
415 | |||
416 | st->state++; | 492 | st->state++; |
417 | step = steps + st->state; | 493 | ret = cpuhp_invoke_callback(cpu, st->state, true, NULL); |
418 | ret = cpuhp_invoke_callback(cpu, st->state, step->startup); | ||
419 | if (ret) { | 494 | if (ret) { |
420 | st->target = prev_state; | 495 | st->target = prev_state; |
421 | undo_cpu_up(cpu, st, steps); | 496 | undo_cpu_up(cpu, st); |
422 | break; | 497 | break; |
423 | } | 498 | } |
424 | } | 499 | } |
@@ -447,13 +522,13 @@ static int cpuhp_ap_offline(unsigned int cpu, struct cpuhp_cpu_state *st) | |||
447 | { | 522 | { |
448 | enum cpuhp_state target = max((int)st->target, CPUHP_TEARDOWN_CPU); | 523 | enum cpuhp_state target = max((int)st->target, CPUHP_TEARDOWN_CPU); |
449 | 524 | ||
450 | return cpuhp_down_callbacks(cpu, st, cpuhp_ap_states, target); | 525 | return cpuhp_down_callbacks(cpu, st, target); |
451 | } | 526 | } |
452 | 527 | ||
453 | /* Execute the online startup callbacks. Used to be CPU_ONLINE */ | 528 | /* Execute the online startup callbacks. Used to be CPU_ONLINE */ |
454 | static int cpuhp_ap_online(unsigned int cpu, struct cpuhp_cpu_state *st) | 529 | static int cpuhp_ap_online(unsigned int cpu, struct cpuhp_cpu_state *st) |
455 | { | 530 | { |
456 | return cpuhp_up_callbacks(cpu, st, cpuhp_ap_states, st->target); | 531 | return cpuhp_up_callbacks(cpu, st, st->target); |
457 | } | 532 | } |
458 | 533 | ||
459 | /* | 534 | /* |
@@ -476,18 +551,20 @@ static void cpuhp_thread_fun(unsigned int cpu) | |||
476 | st->should_run = false; | 551 | st->should_run = false; |
477 | 552 | ||
478 | /* Single callback invocation for [un]install ? */ | 553 | /* Single callback invocation for [un]install ? */ |
479 | if (st->cb) { | 554 | if (st->single) { |
480 | if (st->cb_state < CPUHP_AP_ONLINE) { | 555 | if (st->cb_state < CPUHP_AP_ONLINE) { |
481 | local_irq_disable(); | 556 | local_irq_disable(); |
482 | ret = cpuhp_invoke_callback(cpu, st->cb_state, st->cb); | 557 | ret = cpuhp_invoke_callback(cpu, st->cb_state, |
558 | st->bringup, st->node); | ||
483 | local_irq_enable(); | 559 | local_irq_enable(); |
484 | } else { | 560 | } else { |
485 | ret = cpuhp_invoke_callback(cpu, st->cb_state, st->cb); | 561 | ret = cpuhp_invoke_callback(cpu, st->cb_state, |
562 | st->bringup, st->node); | ||
486 | } | 563 | } |
487 | } else if (st->rollback) { | 564 | } else if (st->rollback) { |
488 | BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE); | 565 | BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE); |
489 | 566 | ||
490 | undo_cpu_down(cpu, st, cpuhp_ap_states); | 567 | undo_cpu_down(cpu, st); |
491 | /* | 568 | /* |
492 | * This is a momentary workaround to keep the notifier users | 569 | * This is a momentary workaround to keep the notifier users |
493 | * happy. Will go away once we got rid of the notifiers. | 570 | * happy. Will go away once we got rid of the notifiers. |
@@ -509,8 +586,9 @@ static void cpuhp_thread_fun(unsigned int cpu) | |||
509 | } | 586 | } |
510 | 587 | ||
511 | /* Invoke a single callback on a remote cpu */ | 588 | /* Invoke a single callback on a remote cpu */ |
512 | static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, | 589 | static int |
513 | int (*cb)(unsigned int)) | 590 | cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup, |
591 | struct hlist_node *node) | ||
514 | { | 592 | { |
515 | struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); | 593 | struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); |
516 | 594 | ||
@@ -522,10 +600,13 @@ static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, | |||
522 | * we invoke the thread function directly. | 600 | * we invoke the thread function directly. |
523 | */ | 601 | */ |
524 | if (!st->thread) | 602 | if (!st->thread) |
525 | return cpuhp_invoke_callback(cpu, state, cb); | 603 | return cpuhp_invoke_callback(cpu, state, bringup, node); |
526 | 604 | ||
527 | st->cb_state = state; | 605 | st->cb_state = state; |
528 | st->cb = cb; | 606 | st->single = true; |
607 | st->bringup = bringup; | ||
608 | st->node = node; | ||
609 | |||
529 | /* | 610 | /* |
530 | * Make sure the above stores are visible before should_run becomes | 611 | * Make sure the above stores are visible before should_run becomes |
531 | * true. Paired with the mb() above in cpuhp_thread_fun() | 612 | * true. Paired with the mb() above in cpuhp_thread_fun() |
@@ -541,7 +622,7 @@ static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, | |||
541 | static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st) | 622 | static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st) |
542 | { | 623 | { |
543 | st->result = 0; | 624 | st->result = 0; |
544 | st->cb = NULL; | 625 | st->single = false; |
545 | /* | 626 | /* |
546 | * Make sure the above stores are visible before should_run becomes | 627 | * Make sure the above stores are visible before should_run becomes |
547 | * true. Paired with the mb() above in cpuhp_thread_fun() | 628 | * true. Paired with the mb() above in cpuhp_thread_fun() |
@@ -674,12 +755,6 @@ static int notify_down_prepare(unsigned int cpu) | |||
674 | return err; | 755 | return err; |
675 | } | 756 | } |
676 | 757 | ||
677 | static int notify_dying(unsigned int cpu) | ||
678 | { | ||
679 | cpu_notify(CPU_DYING, cpu); | ||
680 | return 0; | ||
681 | } | ||
682 | |||
683 | /* Take this CPU down. */ | 758 | /* Take this CPU down. */ |
684 | static int take_cpu_down(void *_param) | 759 | static int take_cpu_down(void *_param) |
685 | { | 760 | { |
@@ -692,12 +767,16 @@ static int take_cpu_down(void *_param) | |||
692 | if (err < 0) | 767 | if (err < 0) |
693 | return err; | 768 | return err; |
694 | 769 | ||
770 | /* | ||
771 | * We get here while we are in CPUHP_TEARDOWN_CPU state and we must not | ||
772 | * do this step again. | ||
773 | */ | ||
774 | WARN_ON(st->state != CPUHP_TEARDOWN_CPU); | ||
775 | st->state--; | ||
695 | /* Invoke the former CPU_DYING callbacks */ | 776 | /* Invoke the former CPU_DYING callbacks */ |
696 | for (; st->state > target; st->state--) { | 777 | for (; st->state > target; st->state--) |
697 | struct cpuhp_step *step = cpuhp_ap_states + st->state; | 778 | cpuhp_invoke_callback(cpu, st->state, false, NULL); |
698 | 779 | ||
699 | cpuhp_invoke_callback(cpu, st->state, step->teardown); | ||
700 | } | ||
701 | /* Give up timekeeping duties */ | 780 | /* Give up timekeeping duties */ |
702 | tick_handover_do_timer(); | 781 | tick_handover_do_timer(); |
703 | /* Park the stopper thread */ | 782 | /* Park the stopper thread */ |
@@ -734,7 +813,7 @@ static int takedown_cpu(unsigned int cpu) | |||
734 | BUG_ON(cpu_online(cpu)); | 813 | BUG_ON(cpu_online(cpu)); |
735 | 814 | ||
736 | /* | 815 | /* |
737 | * The migration_call() CPU_DYING callback will have removed all | 816 | * The CPUHP_AP_SCHED_MIGRATE_DYING callback will have removed all |
738 | * runnable tasks from the cpu, there's only the idle task left now | 817 | * runnable tasks from the cpu, there's only the idle task left now |
739 | * that the migration thread is done doing the stop_machine thing. | 818 | * that the migration thread is done doing the stop_machine thing. |
740 | * | 819 | * |
@@ -787,7 +866,6 @@ void cpuhp_report_idle_dead(void) | |||
787 | #define notify_down_prepare NULL | 866 | #define notify_down_prepare NULL |
788 | #define takedown_cpu NULL | 867 | #define takedown_cpu NULL |
789 | #define notify_dead NULL | 868 | #define notify_dead NULL |
790 | #define notify_dying NULL | ||
791 | #endif | 869 | #endif |
792 | 870 | ||
793 | #ifdef CONFIG_HOTPLUG_CPU | 871 | #ifdef CONFIG_HOTPLUG_CPU |
@@ -836,7 +914,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, | |||
836 | * The AP brought itself down to CPUHP_TEARDOWN_CPU. So we need | 914 | * The AP brought itself down to CPUHP_TEARDOWN_CPU. So we need |
837 | * to do the further cleanups. | 915 | * to do the further cleanups. |
838 | */ | 916 | */ |
839 | ret = cpuhp_down_callbacks(cpu, st, cpuhp_bp_states, target); | 917 | ret = cpuhp_down_callbacks(cpu, st, target); |
840 | if (ret && st->state > CPUHP_TEARDOWN_CPU && st->state < prev_state) { | 918 | if (ret && st->state > CPUHP_TEARDOWN_CPU && st->state < prev_state) { |
841 | st->target = prev_state; | 919 | st->target = prev_state; |
842 | st->rollback = true; | 920 | st->rollback = true; |
@@ -877,10 +955,9 @@ EXPORT_SYMBOL(cpu_down); | |||
877 | #endif /*CONFIG_HOTPLUG_CPU*/ | 955 | #endif /*CONFIG_HOTPLUG_CPU*/ |
878 | 956 | ||
879 | /** | 957 | /** |
880 | * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers | 958 | * notify_cpu_starting(cpu) - Invoke the callbacks on the starting CPU |
881 | * @cpu: cpu that just started | 959 | * @cpu: cpu that just started |
882 | * | 960 | * |
883 | * This function calls the cpu_chain notifiers with CPU_STARTING. | ||
884 | * It must be called by the arch code on the new cpu, before the new cpu | 961 | * It must be called by the arch code on the new cpu, before the new cpu |
885 | * enables interrupts and before the "boot" cpu returns from __cpu_up(). | 962 | * enables interrupts and before the "boot" cpu returns from __cpu_up(). |
886 | */ | 963 | */ |
@@ -889,12 +966,10 @@ void notify_cpu_starting(unsigned int cpu) | |||
889 | struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); | 966 | struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); |
890 | enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE); | 967 | enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE); |
891 | 968 | ||
969 | rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. */ | ||
892 | while (st->state < target) { | 970 | while (st->state < target) { |
893 | struct cpuhp_step *step; | ||
894 | |||
895 | st->state++; | 971 | st->state++; |
896 | step = cpuhp_ap_states + st->state; | 972 | cpuhp_invoke_callback(cpu, st->state, true, NULL); |
897 | cpuhp_invoke_callback(cpu, st->state, step->startup); | ||
898 | } | 973 | } |
899 | } | 974 | } |
900 | 975 | ||
@@ -979,7 +1054,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) | |||
979 | * responsible for bringing it up to the target state. | 1054 | * responsible for bringing it up to the target state. |
980 | */ | 1055 | */ |
981 | target = min((int)target, CPUHP_BRINGUP_CPU); | 1056 | target = min((int)target, CPUHP_BRINGUP_CPU); |
982 | ret = cpuhp_up_callbacks(cpu, st, cpuhp_bp_states, target); | 1057 | ret = cpuhp_up_callbacks(cpu, st, target); |
983 | out: | 1058 | out: |
984 | cpu_hotplug_done(); | 1059 | cpu_hotplug_done(); |
985 | return ret; | 1060 | return ret; |
@@ -1024,12 +1099,13 @@ EXPORT_SYMBOL_GPL(cpu_up); | |||
1024 | #ifdef CONFIG_PM_SLEEP_SMP | 1099 | #ifdef CONFIG_PM_SLEEP_SMP |
1025 | static cpumask_var_t frozen_cpus; | 1100 | static cpumask_var_t frozen_cpus; |
1026 | 1101 | ||
1027 | int disable_nonboot_cpus(void) | 1102 | int freeze_secondary_cpus(int primary) |
1028 | { | 1103 | { |
1029 | int cpu, first_cpu, error = 0; | 1104 | int cpu, error = 0; |
1030 | 1105 | ||
1031 | cpu_maps_update_begin(); | 1106 | cpu_maps_update_begin(); |
1032 | first_cpu = cpumask_first(cpu_online_mask); | 1107 | if (!cpu_online(primary)) |
1108 | primary = cpumask_first(cpu_online_mask); | ||
1033 | /* | 1109 | /* |
1034 | * We take down all of the non-boot CPUs in one shot to avoid races | 1110 | * We take down all of the non-boot CPUs in one shot to avoid races |
1035 | * with the userspace trying to use the CPU hotplug at the same time | 1111 | * with the userspace trying to use the CPU hotplug at the same time |
@@ -1038,7 +1114,7 @@ int disable_nonboot_cpus(void) | |||
1038 | 1114 | ||
1039 | pr_info("Disabling non-boot CPUs ...\n"); | 1115 | pr_info("Disabling non-boot CPUs ...\n"); |
1040 | for_each_online_cpu(cpu) { | 1116 | for_each_online_cpu(cpu) { |
1041 | if (cpu == first_cpu) | 1117 | if (cpu == primary) |
1042 | continue; | 1118 | continue; |
1043 | trace_suspend_resume(TPS("CPU_OFF"), cpu, true); | 1119 | trace_suspend_resume(TPS("CPU_OFF"), cpu, true); |
1044 | error = _cpu_down(cpu, 1, CPUHP_OFFLINE); | 1120 | error = _cpu_down(cpu, 1, CPUHP_OFFLINE); |
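Renaming disable_nonboot_cpus() to freeze_secondary_cpus(primary) lets the caller choose which CPU stays online, with a fallback to the first online CPU if the requested one is already down. Presumably the old entry point survives as a thin wrapper in the header so existing callers keep their behaviour, roughly as below (an assumption about include/linux/cpu.h, not shown in this diff):

/* Assumed compatibility wrapper, not part of this hunk. */
static inline int disable_nonboot_cpus(void)
{
	return freeze_secondary_cpus(0);	/* keep the boot CPU, CPU 0 */
}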
@@ -1081,7 +1157,7 @@ void enable_nonboot_cpus(void) | |||
1081 | 1157 | ||
1082 | /* Allow everyone to use the CPU hotplug again */ | 1158 | /* Allow everyone to use the CPU hotplug again */ |
1083 | cpu_maps_update_begin(); | 1159 | cpu_maps_update_begin(); |
1084 | WARN_ON(--cpu_hotplug_disabled < 0); | 1160 | __cpu_hotplug_enable(); |
1085 | if (cpumask_empty(frozen_cpus)) | 1161 | if (cpumask_empty(frozen_cpus)) |
1086 | goto out; | 1162 | goto out; |
1087 | 1163 | ||
@@ -1170,40 +1246,50 @@ core_initcall(cpu_hotplug_pm_sync_init); | |||
1170 | static struct cpuhp_step cpuhp_bp_states[] = { | 1246 | static struct cpuhp_step cpuhp_bp_states[] = { |
1171 | [CPUHP_OFFLINE] = { | 1247 | [CPUHP_OFFLINE] = { |
1172 | .name = "offline", | 1248 | .name = "offline", |
1173 | .startup = NULL, | 1249 | .startup.single = NULL, |
1174 | .teardown = NULL, | 1250 | .teardown.single = NULL, |
1175 | }, | 1251 | }, |
1176 | #ifdef CONFIG_SMP | 1252 | #ifdef CONFIG_SMP |
1177 | [CPUHP_CREATE_THREADS]= { | 1253 | [CPUHP_CREATE_THREADS]= { |
1178 | .name = "threads:create", | 1254 | .name = "threads:prepare", |
1179 | .startup = smpboot_create_threads, | 1255 | .startup.single = smpboot_create_threads, |
1180 | .teardown = NULL, | 1256 | .teardown.single = NULL, |
1181 | .cant_stop = true, | 1257 | .cant_stop = true, |
1182 | }, | 1258 | }, |
1183 | [CPUHP_PERF_PREPARE] = { | 1259 | [CPUHP_PERF_PREPARE] = { |
1184 | .name = "perf prepare", | 1260 | .name = "perf:prepare", |
1185 | .startup = perf_event_init_cpu, | 1261 | .startup.single = perf_event_init_cpu, |
1186 | .teardown = perf_event_exit_cpu, | 1262 | .teardown.single = perf_event_exit_cpu, |
1187 | }, | 1263 | }, |
1188 | [CPUHP_WORKQUEUE_PREP] = { | 1264 | [CPUHP_WORKQUEUE_PREP] = { |
1189 | .name = "workqueue prepare", | 1265 | .name = "workqueue:prepare", |
1190 | .startup = workqueue_prepare_cpu, | 1266 | .startup.single = workqueue_prepare_cpu, |
1191 | .teardown = NULL, | 1267 | .teardown.single = NULL, |
1192 | }, | 1268 | }, |
1193 | [CPUHP_HRTIMERS_PREPARE] = { | 1269 | [CPUHP_HRTIMERS_PREPARE] = { |
1194 | .name = "hrtimers prepare", | 1270 | .name = "hrtimers:prepare", |
1195 | .startup = hrtimers_prepare_cpu, | 1271 | .startup.single = hrtimers_prepare_cpu, |
1196 | .teardown = hrtimers_dead_cpu, | 1272 | .teardown.single = hrtimers_dead_cpu, |
1197 | }, | 1273 | }, |
1198 | [CPUHP_SMPCFD_PREPARE] = { | 1274 | [CPUHP_SMPCFD_PREPARE] = { |
1199 | .name = "SMPCFD prepare", | 1275 | .name = "smpcfd:prepare", |
1200 | .startup = smpcfd_prepare_cpu, | 1276 | .startup.single = smpcfd_prepare_cpu, |
1201 | .teardown = smpcfd_dead_cpu, | 1277 | .teardown.single = smpcfd_dead_cpu, |
1278 | }, | ||
1279 | [CPUHP_RELAY_PREPARE] = { | ||
1280 | .name = "relay:prepare", | ||
1281 | .startup.single = relay_prepare_cpu, | ||
1282 | .teardown.single = NULL, | ||
1283 | }, | ||
1284 | [CPUHP_SLAB_PREPARE] = { | ||
1285 | .name = "slab:prepare", | ||
1286 | .startup.single = slab_prepare_cpu, | ||
1287 | .teardown.single = slab_dead_cpu, | ||
1202 | }, | 1288 | }, |
1203 | [CPUHP_RCUTREE_PREP] = { | 1289 | [CPUHP_RCUTREE_PREP] = { |
1204 | .name = "RCU-tree prepare", | 1290 | .name = "RCU/tree:prepare", |
1205 | .startup = rcutree_prepare_cpu, | 1291 | .startup.single = rcutree_prepare_cpu, |
1206 | .teardown = rcutree_dead_cpu, | 1292 | .teardown.single = rcutree_dead_cpu, |
1207 | }, | 1293 | }, |
1208 | /* | 1294 | /* |
1209 | * Preparatory and dead notifiers. Will be replaced once the notifiers | 1295 | * Preparatory and dead notifiers. Will be replaced once the notifiers |
@@ -1211,8 +1297,8 @@ static struct cpuhp_step cpuhp_bp_states[] = { | |||
1211 | */ | 1297 | */ |
1212 | [CPUHP_NOTIFY_PREPARE] = { | 1298 | [CPUHP_NOTIFY_PREPARE] = { |
1213 | .name = "notify:prepare", | 1299 | .name = "notify:prepare", |
1214 | .startup = notify_prepare, | 1300 | .startup.single = notify_prepare, |
1215 | .teardown = notify_dead, | 1301 | .teardown.single = notify_dead, |
1216 | .skip_onerr = true, | 1302 | .skip_onerr = true, |
1217 | .cant_stop = true, | 1303 | .cant_stop = true, |
1218 | }, | 1304 | }, |
@@ -1222,20 +1308,21 @@ static struct cpuhp_step cpuhp_bp_states[] = { | |||
1222 | * otherwise a RCU stall occurs. | 1308 | * otherwise a RCU stall occurs. |
1223 | */ | 1309 | */ |
1224 | [CPUHP_TIMERS_DEAD] = { | 1310 | [CPUHP_TIMERS_DEAD] = { |
1225 | .name = "timers dead", | 1311 | .name = "timers:dead", |
1226 | .startup = NULL, | 1312 | .startup.single = NULL, |
1227 | .teardown = timers_dead_cpu, | 1313 | .teardown.single = timers_dead_cpu, |
1228 | }, | 1314 | }, |
1229 | /* Kicks the plugged cpu into life */ | 1315 | /* Kicks the plugged cpu into life */ |
1230 | [CPUHP_BRINGUP_CPU] = { | 1316 | [CPUHP_BRINGUP_CPU] = { |
1231 | .name = "cpu:bringup", | 1317 | .name = "cpu:bringup", |
1232 | .startup = bringup_cpu, | 1318 | .startup.single = bringup_cpu, |
1233 | .teardown = NULL, | 1319 | .teardown.single = NULL, |
1234 | .cant_stop = true, | 1320 | .cant_stop = true, |
1235 | }, | 1321 | }, |
1236 | [CPUHP_AP_SMPCFD_DYING] = { | 1322 | [CPUHP_AP_SMPCFD_DYING] = { |
1237 | .startup = NULL, | 1323 | .name = "smpcfd:dying", |
1238 | .teardown = smpcfd_dying_cpu, | 1324 | .startup.single = NULL, |
1325 | .teardown.single = smpcfd_dying_cpu, | ||
1239 | }, | 1326 | }, |
1240 | /* | 1327 | /* |
1241 | * Handled on control processor until the plugged processor manages | 1328 |
@@ -1243,8 +1330,8 @@ static struct cpuhp_step cpuhp_bp_states[] = { | |||
1243 | */ | 1330 | */ |
1244 | [CPUHP_TEARDOWN_CPU] = { | 1331 | [CPUHP_TEARDOWN_CPU] = { |
1245 | .name = "cpu:teardown", | 1332 | .name = "cpu:teardown", |
1246 | .startup = NULL, | 1333 | .startup.single = NULL, |
1247 | .teardown = takedown_cpu, | 1334 | .teardown.single = takedown_cpu, |
1248 | .cant_stop = true, | 1335 | .cant_stop = true, |
1249 | }, | 1336 | }, |
1250 | #else | 1337 | #else |
@@ -1270,24 +1357,13 @@ static struct cpuhp_step cpuhp_ap_states[] = { | |||
1270 | /* First state is scheduler control. Interrupts are disabled */ | 1357 | /* First state is scheduler control. Interrupts are disabled */ |
1271 | [CPUHP_AP_SCHED_STARTING] = { | 1358 | [CPUHP_AP_SCHED_STARTING] = { |
1272 | .name = "sched:starting", | 1359 | .name = "sched:starting", |
1273 | .startup = sched_cpu_starting, | 1360 | .startup.single = sched_cpu_starting, |
1274 | .teardown = sched_cpu_dying, | 1361 | .teardown.single = sched_cpu_dying, |
1275 | }, | 1362 | }, |
1276 | [CPUHP_AP_RCUTREE_DYING] = { | 1363 | [CPUHP_AP_RCUTREE_DYING] = { |
1277 | .startup = NULL, | 1364 | .name = "RCU/tree:dying", |
1278 | .teardown = rcutree_dying_cpu, | 1365 | .startup.single = NULL, |
1279 | }, | 1366 | .teardown.single = rcutree_dying_cpu, |
1280 | /* | ||
1281 | * Low level startup/teardown notifiers. Run with interrupts | ||
1282 | * disabled. Will be removed once the notifiers are converted to | ||
1283 | * states. | ||
1284 | */ | ||
1285 | [CPUHP_AP_NOTIFY_STARTING] = { | ||
1286 | .name = "notify:starting", | ||
1287 | .startup = notify_starting, | ||
1288 | .teardown = notify_dying, | ||
1289 | .skip_onerr = true, | ||
1290 | .cant_stop = true, | ||
1291 | }, | 1367 | }, |
1292 | /* Entry state on starting. Interrupts enabled from here on. Transient | 1368 | /* Entry state on starting. Interrupts enabled from here on. Transient |
1293 | * state for synchronization */ | 1369 |
@@ -1296,24 +1372,24 @@ static struct cpuhp_step cpuhp_ap_states[] = { | |||
1296 | }, | 1372 | }, |
1297 | /* Handle smpboot threads park/unpark */ | 1373 | /* Handle smpboot threads park/unpark */ |
1298 | [CPUHP_AP_SMPBOOT_THREADS] = { | 1374 | [CPUHP_AP_SMPBOOT_THREADS] = { |
1299 | .name = "smpboot:threads", | 1375 | .name = "smpboot/threads:online", |
1300 | .startup = smpboot_unpark_threads, | 1376 | .startup.single = smpboot_unpark_threads, |
1301 | .teardown = NULL, | 1377 | .teardown.single = NULL, |
1302 | }, | 1378 | }, |
1303 | [CPUHP_AP_PERF_ONLINE] = { | 1379 | [CPUHP_AP_PERF_ONLINE] = { |
1304 | .name = "perf online", | 1380 | .name = "perf:online", |
1305 | .startup = perf_event_init_cpu, | 1381 | .startup.single = perf_event_init_cpu, |
1306 | .teardown = perf_event_exit_cpu, | 1382 | .teardown.single = perf_event_exit_cpu, |
1307 | }, | 1383 | }, |
1308 | [CPUHP_AP_WORKQUEUE_ONLINE] = { | 1384 | [CPUHP_AP_WORKQUEUE_ONLINE] = { |
1309 | .name = "workqueue online", | 1385 | .name = "workqueue:online", |
1310 | .startup = workqueue_online_cpu, | 1386 | .startup.single = workqueue_online_cpu, |
1311 | .teardown = workqueue_offline_cpu, | 1387 | .teardown.single = workqueue_offline_cpu, |
1312 | }, | 1388 | }, |
1313 | [CPUHP_AP_RCUTREE_ONLINE] = { | 1389 | [CPUHP_AP_RCUTREE_ONLINE] = { |
1314 | .name = "RCU-tree online", | 1390 | .name = "RCU/tree:online", |
1315 | .startup = rcutree_online_cpu, | 1391 | .startup.single = rcutree_online_cpu, |
1316 | .teardown = rcutree_offline_cpu, | 1392 | .teardown.single = rcutree_offline_cpu, |
1317 | }, | 1393 | }, |
1318 | 1394 | ||
1319 | /* | 1395 | /* |
@@ -1322,8 +1398,8 @@ static struct cpuhp_step cpuhp_ap_states[] = { | |||
1322 | */ | 1398 | */ |
1323 | [CPUHP_AP_NOTIFY_ONLINE] = { | 1399 | [CPUHP_AP_NOTIFY_ONLINE] = { |
1324 | .name = "notify:online", | 1400 | .name = "notify:online", |
1325 | .startup = notify_online, | 1401 | .startup.single = notify_online, |
1326 | .teardown = notify_down_prepare, | 1402 | .teardown.single = notify_down_prepare, |
1327 | .skip_onerr = true, | 1403 | .skip_onerr = true, |
1328 | }, | 1404 | }, |
1329 | #endif | 1405 | #endif |
@@ -1335,16 +1411,16 @@ static struct cpuhp_step cpuhp_ap_states[] = { | |||
1335 | /* Last state is scheduler control setting the cpu active */ | 1411 | /* Last state is scheduler control setting the cpu active */ |
1336 | [CPUHP_AP_ACTIVE] = { | 1412 | [CPUHP_AP_ACTIVE] = { |
1337 | .name = "sched:active", | 1413 | .name = "sched:active", |
1338 | .startup = sched_cpu_activate, | 1414 | .startup.single = sched_cpu_activate, |
1339 | .teardown = sched_cpu_deactivate, | 1415 | .teardown.single = sched_cpu_deactivate, |
1340 | }, | 1416 | }, |
1341 | #endif | 1417 | #endif |
1342 | 1418 | ||
1343 | /* CPU is fully up and running. */ | 1419 | /* CPU is fully up and running. */ |
1344 | [CPUHP_ONLINE] = { | 1420 | [CPUHP_ONLINE] = { |
1345 | .name = "online", | 1421 | .name = "online", |
1346 | .startup = NULL, | 1422 | .startup.single = NULL, |
1347 | .teardown = NULL, | 1423 | .teardown.single = NULL, |
1348 | }, | 1424 | }, |
1349 | }; | 1425 | }; |
1350 | 1426 | ||
@@ -1356,54 +1432,42 @@ static int cpuhp_cb_check(enum cpuhp_state state) | |||
1356 | return 0; | 1432 | return 0; |
1357 | } | 1433 | } |
1358 | 1434 | ||
1359 | static bool cpuhp_is_ap_state(enum cpuhp_state state) | ||
1360 | { | ||
1361 | /* | ||
1362 | * The extra check for CPUHP_TEARDOWN_CPU is only for documentation | ||
1363 | * purposes as that state is handled explicitely in cpu_down. | ||
1364 | */ | ||
1365 | return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU; | ||
1366 | } | ||
1367 | |||
1368 | static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) | ||
1369 | { | ||
1370 | struct cpuhp_step *sp; | ||
1371 | |||
1372 | sp = cpuhp_is_ap_state(state) ? cpuhp_ap_states : cpuhp_bp_states; | ||
1373 | return sp + state; | ||
1374 | } | ||
1375 | |||
1376 | static void cpuhp_store_callbacks(enum cpuhp_state state, | 1435 | static void cpuhp_store_callbacks(enum cpuhp_state state, |
1377 | const char *name, | 1436 | const char *name, |
1378 | int (*startup)(unsigned int cpu), | 1437 | int (*startup)(unsigned int cpu), |
1379 | int (*teardown)(unsigned int cpu)) | 1438 | int (*teardown)(unsigned int cpu), |
1439 | bool multi_instance) | ||
1380 | { | 1440 | { |
1381 | /* (Un)Install the callbacks for further cpu hotplug operations */ | 1441 | /* (Un)Install the callbacks for further cpu hotplug operations */ |
1382 | struct cpuhp_step *sp; | 1442 | struct cpuhp_step *sp; |
1383 | 1443 | ||
1384 | mutex_lock(&cpuhp_state_mutex); | 1444 | mutex_lock(&cpuhp_state_mutex); |
1385 | sp = cpuhp_get_step(state); | 1445 | sp = cpuhp_get_step(state); |
1386 | sp->startup = startup; | 1446 | sp->startup.single = startup; |
1387 | sp->teardown = teardown; | 1447 | sp->teardown.single = teardown; |
1388 | sp->name = name; | 1448 | sp->name = name; |
1449 | sp->multi_instance = multi_instance; | ||
1450 | INIT_HLIST_HEAD(&sp->list); | ||
1389 | mutex_unlock(&cpuhp_state_mutex); | 1451 | mutex_unlock(&cpuhp_state_mutex); |
1390 | } | 1452 | } |
1391 | 1453 | ||
1392 | static void *cpuhp_get_teardown_cb(enum cpuhp_state state) | 1454 | static void *cpuhp_get_teardown_cb(enum cpuhp_state state) |
1393 | { | 1455 | { |
1394 | return cpuhp_get_step(state)->teardown; | 1456 | return cpuhp_get_step(state)->teardown.single; |
1395 | } | 1457 | } |
1396 | 1458 | ||
1397 | /* | 1459 | /* |
1398 | * Call the startup/teardown function for a step either on the AP or | 1460 | * Call the startup/teardown function for a step either on the AP or |
1399 | * on the current CPU. | 1461 | * on the current CPU. |
1400 | */ | 1462 | */ |
1401 | static int cpuhp_issue_call(int cpu, enum cpuhp_state state, | 1463 | static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup, |
1402 | int (*cb)(unsigned int), bool bringup) | 1464 | struct hlist_node *node) |
1403 | { | 1465 | { |
1466 | struct cpuhp_step *sp = cpuhp_get_step(state); | ||
1404 | int ret; | 1467 | int ret; |
1405 | 1468 | ||
1406 | if (!cb) | 1469 | if ((bringup && !sp->startup.single) || |
1470 | (!bringup && !sp->teardown.single)) | ||
1407 | return 0; | 1471 | return 0; |
1408 | /* | 1472 | /* |
1409 | * The non AP bound callbacks can fail on bringup. On teardown | 1473 | * The non AP bound callbacks can fail on bringup. On teardown |
@@ -1411,11 +1475,11 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, | |||
1411 | */ | 1475 | */ |
1412 | #ifdef CONFIG_SMP | 1476 | #ifdef CONFIG_SMP |
1413 | if (cpuhp_is_ap_state(state)) | 1477 | if (cpuhp_is_ap_state(state)) |
1414 | ret = cpuhp_invoke_ap_callback(cpu, state, cb); | 1478 | ret = cpuhp_invoke_ap_callback(cpu, state, bringup, node); |
1415 | else | 1479 | else |
1416 | ret = cpuhp_invoke_callback(cpu, state, cb); | 1480 | ret = cpuhp_invoke_callback(cpu, state, bringup, node); |
1417 | #else | 1481 | #else |
1418 | ret = cpuhp_invoke_callback(cpu, state, cb); | 1482 | ret = cpuhp_invoke_callback(cpu, state, bringup, node); |
1419 | #endif | 1483 | #endif |
1420 | BUG_ON(ret && !bringup); | 1484 | BUG_ON(ret && !bringup); |
1421 | return ret; | 1485 | return ret; |
@@ -1427,13 +1491,10 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, | |||
1427 | * Note: The teardown callbacks for rollback are not allowed to fail! | 1491 | * Note: The teardown callbacks for rollback are not allowed to fail! |
1428 | */ | 1492 | */ |
1429 | static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state, | 1493 | static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state, |
1430 | int (*teardown)(unsigned int cpu)) | 1494 | struct hlist_node *node) |
1431 | { | 1495 | { |
1432 | int cpu; | 1496 | int cpu; |
1433 | 1497 | ||
1434 | if (!teardown) | ||
1435 | return; | ||
1436 | |||
1437 | /* Roll back the already executed steps on the other cpus */ | 1498 | /* Roll back the already executed steps on the other cpus */ |
1438 | for_each_present_cpu(cpu) { | 1499 | for_each_present_cpu(cpu) { |
1439 | struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); | 1500 | struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); |
@@ -1444,7 +1505,7 @@ static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state, | |||
1444 | 1505 | ||
1445 | /* Did we invoke the startup call on that cpu ? */ | 1506 | /* Did we invoke the startup call on that cpu ? */ |
1446 | if (cpustate >= state) | 1507 | if (cpustate >= state) |
1447 | cpuhp_issue_call(cpu, state, teardown, false); | 1508 | cpuhp_issue_call(cpu, state, false, node); |
1448 | } | 1509 | } |
1449 | } | 1510 | } |
1450 | 1511 | ||
@@ -1471,6 +1532,52 @@ static int cpuhp_reserve_state(enum cpuhp_state state) | |||
1471 | return -ENOSPC; | 1532 | return -ENOSPC; |
1472 | } | 1533 | } |
1473 | 1534 | ||
1535 | int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node, | ||
1536 | bool invoke) | ||
1537 | { | ||
1538 | struct cpuhp_step *sp; | ||
1539 | int cpu; | ||
1540 | int ret; | ||
1541 | |||
1542 | sp = cpuhp_get_step(state); | ||
1543 | if (sp->multi_instance == false) | ||
1544 | return -EINVAL; | ||
1545 | |||
1546 | get_online_cpus(); | ||
1547 | |||
1548 | if (!invoke || !sp->startup.multi) | ||
1549 | goto add_node; | ||
1550 | |||
1551 | /* | ||
1552 | * Try to call the startup callback for each present cpu | ||
1553 | * depending on the hotplug state of the cpu. | ||
1554 | */ | ||
1555 | for_each_present_cpu(cpu) { | ||
1556 | struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); | ||
1557 | int cpustate = st->state; | ||
1558 | |||
1559 | if (cpustate < state) | ||
1560 | continue; | ||
1561 | |||
1562 | ret = cpuhp_issue_call(cpu, state, true, node); | ||
1563 | if (ret) { | ||
1564 | if (sp->teardown.multi) | ||
1565 | cpuhp_rollback_install(cpu, state, node); | ||
1566 | goto err; | ||
1567 | } | ||
1568 | } | ||
1569 | add_node: | ||
1570 | ret = 0; | ||
1571 | mutex_lock(&cpuhp_state_mutex); | ||
1572 | hlist_add_head(node, &sp->list); | ||
1573 | mutex_unlock(&cpuhp_state_mutex); | ||
1574 | |||
1575 | err: | ||
1576 | put_online_cpus(); | ||
1577 | return ret; | ||
1578 | } | ||
1579 | EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance); | ||
1580 | |||
1474 | /** | 1581 | /** |
1475 | * __cpuhp_setup_state - Setup the callbacks for a hotplug machine state | 1582 | * __cpuhp_setup_state - Setup the callbacks for a hotplug machine state |
1476 | * @state: The state to setup | 1583 | * @state: The state to setup |
@@ -1484,7 +1591,8 @@ static int cpuhp_reserve_state(enum cpuhp_state state) | |||
1484 | int __cpuhp_setup_state(enum cpuhp_state state, | 1591 | int __cpuhp_setup_state(enum cpuhp_state state, |
1485 | const char *name, bool invoke, | 1592 | const char *name, bool invoke, |
1486 | int (*startup)(unsigned int cpu), | 1593 | int (*startup)(unsigned int cpu), |
1487 | int (*teardown)(unsigned int cpu)) | 1594 | int (*teardown)(unsigned int cpu), |
1595 | bool multi_instance) | ||
1488 | { | 1596 | { |
1489 | int cpu, ret = 0; | 1597 | int cpu, ret = 0; |
1490 | int dyn_state = 0; | 1598 | int dyn_state = 0; |
@@ -1503,7 +1611,7 @@ int __cpuhp_setup_state(enum cpuhp_state state, | |||
1503 | state = ret; | 1611 | state = ret; |
1504 | } | 1612 | } |
1505 | 1613 | ||
1506 | cpuhp_store_callbacks(state, name, startup, teardown); | 1614 | cpuhp_store_callbacks(state, name, startup, teardown, multi_instance); |
1507 | 1615 | ||
1508 | if (!invoke || !startup) | 1616 | if (!invoke || !startup) |
1509 | goto out; | 1617 | goto out; |
@@ -1519,10 +1627,11 @@ int __cpuhp_setup_state(enum cpuhp_state state, | |||
1519 | if (cpustate < state) | 1627 | if (cpustate < state) |
1520 | continue; | 1628 | continue; |
1521 | 1629 | ||
1522 | ret = cpuhp_issue_call(cpu, state, startup, true); | 1630 | ret = cpuhp_issue_call(cpu, state, true, NULL); |
1523 | if (ret) { | 1631 | if (ret) { |
1524 | cpuhp_rollback_install(cpu, state, teardown); | 1632 | if (teardown) |
1525 | cpuhp_store_callbacks(state, NULL, NULL, NULL); | 1633 | cpuhp_rollback_install(cpu, state, NULL); |
1634 | cpuhp_store_callbacks(state, NULL, NULL, NULL, false); | ||
1526 | goto out; | 1635 | goto out; |
1527 | } | 1636 | } |
1528 | } | 1637 | } |
@@ -1534,6 +1643,42 @@ out: | |||
1534 | } | 1643 | } |
1535 | EXPORT_SYMBOL(__cpuhp_setup_state); | 1644 | EXPORT_SYMBOL(__cpuhp_setup_state); |
1536 | 1645 | ||
1646 | int __cpuhp_state_remove_instance(enum cpuhp_state state, | ||
1647 | struct hlist_node *node, bool invoke) | ||
1648 | { | ||
1649 | struct cpuhp_step *sp = cpuhp_get_step(state); | ||
1650 | int cpu; | ||
1651 | |||
1652 | BUG_ON(cpuhp_cb_check(state)); | ||
1653 | |||
1654 | if (!sp->multi_instance) | ||
1655 | return -EINVAL; | ||
1656 | |||
1657 | get_online_cpus(); | ||
1658 | if (!invoke || !cpuhp_get_teardown_cb(state)) | ||
1659 | goto remove; | ||
1660 | /* | ||
1661 | * Call the teardown callback for each present cpu depending | ||
1662 | * on the hotplug state of the cpu. This function is not | ||
1663 | * allowed to fail currently! | ||
1664 | */ | ||
1665 | for_each_present_cpu(cpu) { | ||
1666 | struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); | ||
1667 | int cpustate = st->state; | ||
1668 | |||
1669 | if (cpustate >= state) | ||
1670 | cpuhp_issue_call(cpu, state, false, node); | ||
1671 | } | ||
1672 | |||
1673 | remove: | ||
1674 | mutex_lock(&cpuhp_state_mutex); | ||
1675 | hlist_del(node); | ||
1676 | mutex_unlock(&cpuhp_state_mutex); | ||
1677 | put_online_cpus(); | ||
1678 | |||
1679 | return 0; | ||
1680 | } | ||
1681 | EXPORT_SYMBOL_GPL(__cpuhp_state_remove_instance); | ||
1537 | /** | 1682 | /** |
1538 | * __cpuhp_remove_state - Remove the callbacks for a hotplug machine state | 1683 | * __cpuhp_remove_state - Remove the callbacks for a hotplug machine state |
1539 | * @state: The state to remove | 1684 | * @state: The state to remove |
@@ -1545,14 +1690,21 @@ EXPORT_SYMBOL(__cpuhp_setup_state); | |||
1545 | */ | 1690 | */ |
1546 | void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) | 1691 | void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) |
1547 | { | 1692 | { |
1548 | int (*teardown)(unsigned int cpu) = cpuhp_get_teardown_cb(state); | 1693 | struct cpuhp_step *sp = cpuhp_get_step(state); |
1549 | int cpu; | 1694 | int cpu; |
1550 | 1695 | ||
1551 | BUG_ON(cpuhp_cb_check(state)); | 1696 | BUG_ON(cpuhp_cb_check(state)); |
1552 | 1697 | ||
1553 | get_online_cpus(); | 1698 | get_online_cpus(); |
1554 | 1699 | ||
1555 | if (!invoke || !teardown) | 1700 | if (sp->multi_instance) { |
1701 | WARN(!hlist_empty(&sp->list), | ||
1702 | "Error: Removing state %d which has instances left.\n", | ||
1703 | state); | ||
1704 | goto remove; | ||
1705 | } | ||
1706 | |||
1707 | if (!invoke || !cpuhp_get_teardown_cb(state)) | ||
1556 | goto remove; | 1708 | goto remove; |
1557 | 1709 | ||
1558 | /* | 1710 | /* |
@@ -1565,10 +1717,10 @@ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) | |||
1565 | int cpustate = st->state; | 1717 | int cpustate = st->state; |
1566 | 1718 | ||
1567 | if (cpustate >= state) | 1719 | if (cpustate >= state) |
1568 | cpuhp_issue_call(cpu, state, teardown, false); | 1720 | cpuhp_issue_call(cpu, state, false, NULL); |
1569 | } | 1721 | } |
1570 | remove: | 1722 | remove: |
1571 | cpuhp_store_callbacks(state, NULL, NULL, NULL); | 1723 | cpuhp_store_callbacks(state, NULL, NULL, NULL, false); |
1572 | put_online_cpus(); | 1724 | put_online_cpus(); |
1573 | } | 1725 | } |
1574 | EXPORT_SYMBOL(__cpuhp_remove_state); | 1726 | EXPORT_SYMBOL(__cpuhp_remove_state); |
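
The hunks above add multi-instance support to the CPU hotplug state machine: a state can now carry a list of hlist_node instances, and __cpuhp_state_add_instance()/__cpuhp_state_remove_instance() run the per-instance startup/teardown callbacks on every CPU that has already passed the state. Below is a minimal driver-side sketch of how this is meant to be consumed; it assumes the cpuhp_setup_state_multi() and cpuhp_state_add_instance() convenience wrappers in <linux/cpuhotplug.h> (not shown in this diff), and every name prefixed foo_ is hypothetical.

/*
 * Hypothetical consumer of the multi-instance API: one dynamically
 * allocated state, one hlist_node per device instance.  Assumes the
 * cpuhp_setup_state_multi()/cpuhp_state_add_instance() wrappers around
 * the __cpuhp_* exports added above.
 */
#include <linux/cpuhotplug.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/list.h>

struct foo_dev {
	struct hlist_node node;		/* handed to the cpuhp core */
	/* ... per-device state ... */
};

static enum cpuhp_state foo_online_state;

/* multi-instance callbacks receive the instance's hlist_node */
static int foo_cpu_online(unsigned int cpu, struct hlist_node *node)
{
	struct foo_dev *dev = hlist_entry(node, struct foo_dev, node);

	pr_debug("foo: bringing up instance %p on CPU %u\n", dev, cpu);
	return 0;			/* a failure here triggers rollback */
}

static int foo_cpu_dead(unsigned int cpu, struct hlist_node *node)
{
	struct foo_dev *dev = hlist_entry(node, struct foo_dev, node);

	pr_debug("foo: tearing down instance %p on CPU %u\n", dev, cpu);
	return 0;			/* teardown must not fail */
}

static int __init foo_register_state(void)
{
	int ret;

	/* reserve a dynamic AP state and install the callbacks once */
	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "foo:online",
				      foo_cpu_online, foo_cpu_dead);
	if (ret < 0)
		return ret;
	foo_online_state = ret;
	return 0;
}

static int foo_probe_one(struct foo_dev *dev)
{
	/* runs foo_cpu_online() for this instance on all CPUs already up */
	return cpuhp_state_add_instance(foo_online_state, &dev->node);
}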
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index c27e53326bef..29f815d2ef7e 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -325,8 +325,7 @@ static struct file_system_type cpuset_fs_type = { | |||
325 | /* | 325 | /* |
326 | * Return in pmask the portion of a cpusets's cpus_allowed that | 326 | * Return in pmask the portion of a cpusets's cpus_allowed that |
327 | * are online. If none are online, walk up the cpuset hierarchy | 327 | * are online. If none are online, walk up the cpuset hierarchy |
328 | * until we find one that does have some online cpus. The top | 328 | * until we find one that does have some online cpus. |
329 | * cpuset always has some cpus online. | ||
330 | * | 329 | * |
331 | * One way or another, we guarantee to return some non-empty subset | 330 | * One way or another, we guarantee to return some non-empty subset |
332 | * of cpu_online_mask. | 331 | * of cpu_online_mask. |
@@ -335,8 +334,20 @@ static struct file_system_type cpuset_fs_type = { | |||
335 | */ | 334 | */ |
336 | static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) | 335 | static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) |
337 | { | 336 | { |
338 | while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) | 337 | while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) { |
339 | cs = parent_cs(cs); | 338 | cs = parent_cs(cs); |
339 | if (unlikely(!cs)) { | ||
340 | /* | ||
341 | * The top cpuset doesn't have any online cpu as a | ||
342 | * consequence of a race between cpuset_hotplug_work | ||
343 | * and cpu hotplug notifier. But we know the top | ||
344 | * cpuset's effective_cpus is on its way to be | ||
345 | * identical to cpu_online_mask. | ||
346 | */ | ||
347 | cpumask_copy(pmask, cpu_online_mask); | ||
348 | return; | ||
349 | } | ||
350 | } | ||
340 | cpumask_and(pmask, cs->effective_cpus, cpu_online_mask); | 351 | cpumask_and(pmask, cs->effective_cpus, cpu_online_mask); |
341 | } | 352 | } |
342 | 353 | ||
@@ -2074,7 +2085,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) | |||
2074 | * which could have been changed by cpuset just after it inherits the | 2085 | * which could have been changed by cpuset just after it inherits the |
2075 | * state from the parent and before it sits on the cgroup's task list. | 2086 | * state from the parent and before it sits on the cgroup's task list. |
2076 | */ | 2087 | */ |
2077 | void cpuset_fork(struct task_struct *task) | 2088 | static void cpuset_fork(struct task_struct *task) |
2078 | { | 2089 | { |
2079 | if (task_css_is_root(task, cpuset_cgrp_id)) | 2090 | if (task_css_is_root(task, cpuset_cgrp_id)) |
2080 | return; | 2091 | return; |
@@ -2704,7 +2715,7 @@ void __cpuset_memory_pressure_bump(void) | |||
2704 | int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, | 2715 | int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, |
2705 | struct pid *pid, struct task_struct *tsk) | 2716 | struct pid *pid, struct task_struct *tsk) |
2706 | { | 2717 | { |
2707 | char *buf, *p; | 2718 | char *buf; |
2708 | struct cgroup_subsys_state *css; | 2719 | struct cgroup_subsys_state *css; |
2709 | int retval; | 2720 | int retval; |
2710 | 2721 | ||
@@ -2713,14 +2724,15 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, | |||
2713 | if (!buf) | 2724 | if (!buf) |
2714 | goto out; | 2725 | goto out; |
2715 | 2726 | ||
2716 | retval = -ENAMETOOLONG; | ||
2717 | css = task_get_css(tsk, cpuset_cgrp_id); | 2727 | css = task_get_css(tsk, cpuset_cgrp_id); |
2718 | p = cgroup_path_ns(css->cgroup, buf, PATH_MAX, | 2728 | retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX, |
2719 | current->nsproxy->cgroup_ns); | 2729 | current->nsproxy->cgroup_ns); |
2720 | css_put(css); | 2730 | css_put(css); |
2721 | if (!p) | 2731 | if (retval >= PATH_MAX) |
2732 | retval = -ENAMETOOLONG; | ||
2733 | if (retval < 0) | ||
2722 | goto out_free; | 2734 | goto out_free; |
2723 | seq_puts(m, p); | 2735 | seq_puts(m, buf); |
2724 | seq_putc(m, '\n'); | 2736 | seq_putc(m, '\n'); |
2725 | retval = 0; | 2737 | retval = 0; |
2726 | out_free: | 2738 | out_free: |
diff --git a/kernel/events/core.c b/kernel/events/core.c index a54f2c2cdb20..6ee1febdf6ff 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c | |||
@@ -902,6 +902,17 @@ list_update_cgroup_event(struct perf_event *event, | |||
902 | * this will always be called from the right CPU. | 902 | * this will always be called from the right CPU. |
903 | */ | 903 | */ |
904 | cpuctx = __get_cpu_context(ctx); | 904 | cpuctx = __get_cpu_context(ctx); |
905 | |||
906 | /* Only set/clear cpuctx->cgrp if current task uses event->cgrp. */ | ||
907 | if (perf_cgroup_from_task(current, ctx) != event->cgrp) { | ||
908 | /* | ||
909 | * We are removing the last cpu event in this context. | ||
910 | * If that event is not active in this cpu, cpuctx->cgrp | ||
911 | * should've been cleared by perf_cgroup_switch. | ||
912 | */ | ||
913 | WARN_ON_ONCE(!add && cpuctx->cgrp); | ||
914 | return; | ||
915 | } | ||
905 | cpuctx->cgrp = add ? event->cgrp : NULL; | 916 | cpuctx->cgrp = add ? event->cgrp : NULL; |
906 | } | 917 | } |
907 | 918 | ||
@@ -1475,8 +1486,7 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) | |||
1475 | if (event->group_leader == event) { | 1486 | if (event->group_leader == event) { |
1476 | struct list_head *list; | 1487 | struct list_head *list; |
1477 | 1488 | ||
1478 | if (is_software_event(event)) | 1489 | event->group_caps = event->event_caps; |
1479 | event->group_flags |= PERF_GROUP_SOFTWARE; | ||
1480 | 1490 | ||
1481 | list = ctx_group_list(event, ctx); | 1491 | list = ctx_group_list(event, ctx); |
1482 | list_add_tail(&event->group_entry, list); | 1492 | list_add_tail(&event->group_entry, list); |
@@ -1630,9 +1640,7 @@ static void perf_group_attach(struct perf_event *event) | |||
1630 | 1640 | ||
1631 | WARN_ON_ONCE(group_leader->ctx != event->ctx); | 1641 | WARN_ON_ONCE(group_leader->ctx != event->ctx); |
1632 | 1642 | ||
1633 | if (group_leader->group_flags & PERF_GROUP_SOFTWARE && | 1643 | group_leader->group_caps &= event->event_caps; |
1634 | !is_software_event(event)) | ||
1635 | group_leader->group_flags &= ~PERF_GROUP_SOFTWARE; | ||
1636 | 1644 | ||
1637 | list_add_tail(&event->group_entry, &group_leader->sibling_list); | 1645 | list_add_tail(&event->group_entry, &group_leader->sibling_list); |
1638 | group_leader->nr_siblings++; | 1646 | group_leader->nr_siblings++; |
@@ -1723,7 +1731,7 @@ static void perf_group_detach(struct perf_event *event) | |||
1723 | sibling->group_leader = sibling; | 1731 | sibling->group_leader = sibling; |
1724 | 1732 | ||
1725 | /* Inherit group flags from the previous leader */ | 1733 | /* Inherit group flags from the previous leader */ |
1726 | sibling->group_flags = event->group_flags; | 1734 | sibling->group_caps = event->group_caps; |
1727 | 1735 | ||
1728 | WARN_ON_ONCE(sibling->ctx != event->ctx); | 1736 | WARN_ON_ONCE(sibling->ctx != event->ctx); |
1729 | } | 1737 | } |
@@ -1832,6 +1840,8 @@ group_sched_out(struct perf_event *group_event, | |||
1832 | struct perf_event *event; | 1840 | struct perf_event *event; |
1833 | int state = group_event->state; | 1841 | int state = group_event->state; |
1834 | 1842 | ||
1843 | perf_pmu_disable(ctx->pmu); | ||
1844 | |||
1835 | event_sched_out(group_event, cpuctx, ctx); | 1845 | event_sched_out(group_event, cpuctx, ctx); |
1836 | 1846 | ||
1837 | /* | 1847 | /* |
@@ -1840,6 +1850,8 @@ group_sched_out(struct perf_event *group_event, | |||
1840 | list_for_each_entry(event, &group_event->sibling_list, group_entry) | 1850 | list_for_each_entry(event, &group_event->sibling_list, group_entry) |
1841 | event_sched_out(event, cpuctx, ctx); | 1851 | event_sched_out(event, cpuctx, ctx); |
1842 | 1852 | ||
1853 | perf_pmu_enable(ctx->pmu); | ||
1854 | |||
1843 | if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive) | 1855 | if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive) |
1844 | cpuctx->exclusive = 0; | 1856 | cpuctx->exclusive = 0; |
1845 | } | 1857 | } |
@@ -1959,6 +1971,12 @@ void perf_event_disable(struct perf_event *event) | |||
1959 | } | 1971 | } |
1960 | EXPORT_SYMBOL_GPL(perf_event_disable); | 1972 | EXPORT_SYMBOL_GPL(perf_event_disable); |
1961 | 1973 | ||
1974 | void perf_event_disable_inatomic(struct perf_event *event) | ||
1975 | { | ||
1976 | event->pending_disable = 1; | ||
1977 | irq_work_queue(&event->pending); | ||
1978 | } | ||
1979 | |||
1962 | static void perf_set_shadow_time(struct perf_event *event, | 1980 | static void perf_set_shadow_time(struct perf_event *event, |
1963 | struct perf_event_context *ctx, | 1981 | struct perf_event_context *ctx, |
1964 | u64 tstamp) | 1982 | u64 tstamp) |
@@ -2145,7 +2163,7 @@ static int group_can_go_on(struct perf_event *event, | |||
2145 | /* | 2163 | /* |
2146 | * Groups consisting entirely of software events can always go on. | 2164 | * Groups consisting entirely of software events can always go on. |
2147 | */ | 2165 | */ |
2148 | if (event->group_flags & PERF_GROUP_SOFTWARE) | 2166 | if (event->group_caps & PERF_EV_CAP_SOFTWARE) |
2149 | return 1; | 2167 | return 1; |
2150 | /* | 2168 | /* |
2151 | * If an exclusive group is already on, no other hardware | 2169 | * If an exclusive group is already on, no other hardware |
@@ -2491,7 +2509,7 @@ static int __perf_event_stop(void *info) | |||
2491 | * while restarting. | 2509 | * while restarting. |
2492 | */ | 2510 | */ |
2493 | if (sd->restart) | 2511 | if (sd->restart) |
2494 | event->pmu->start(event, PERF_EF_START); | 2512 | event->pmu->start(event, 0); |
2495 | 2513 | ||
2496 | return 0; | 2514 | return 0; |
2497 | } | 2515 | } |
@@ -2837,19 +2855,36 @@ unlock: | |||
2837 | } | 2855 | } |
2838 | } | 2856 | } |
2839 | 2857 | ||
2858 | static DEFINE_PER_CPU(struct list_head, sched_cb_list); | ||
2859 | |||
2840 | void perf_sched_cb_dec(struct pmu *pmu) | 2860 | void perf_sched_cb_dec(struct pmu *pmu) |
2841 | { | 2861 | { |
2862 | struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); | ||
2863 | |||
2842 | this_cpu_dec(perf_sched_cb_usages); | 2864 | this_cpu_dec(perf_sched_cb_usages); |
2865 | |||
2866 | if (!--cpuctx->sched_cb_usage) | ||
2867 | list_del(&cpuctx->sched_cb_entry); | ||
2843 | } | 2868 | } |
2844 | 2869 | ||
2870 | |||
2845 | void perf_sched_cb_inc(struct pmu *pmu) | 2871 | void perf_sched_cb_inc(struct pmu *pmu) |
2846 | { | 2872 | { |
2873 | struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); | ||
2874 | |||
2875 | if (!cpuctx->sched_cb_usage++) | ||
2876 | list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list)); | ||
2877 | |||
2847 | this_cpu_inc(perf_sched_cb_usages); | 2878 | this_cpu_inc(perf_sched_cb_usages); |
2848 | } | 2879 | } |
2849 | 2880 | ||
2850 | /* | 2881 | /* |
2851 | * This function provides the context switch callback to the lower code | 2882 | * This function provides the context switch callback to the lower code |
2852 | * layer. It is invoked ONLY when the context switch callback is enabled. | 2883 | * layer. It is invoked ONLY when the context switch callback is enabled. |
2884 | * | ||
2885 | * This callback is relevant even to per-cpu events; for example, multi-event | ||
2886 | * PEBS requires this to provide PID/TID information. This requires that we flush | ||
2887 | * all queued PEBS records before we context switch to a new task. | ||
2853 | */ | 2888 | */ |
2854 | static void perf_pmu_sched_task(struct task_struct *prev, | 2889 | static void perf_pmu_sched_task(struct task_struct *prev, |
2855 | struct task_struct *next, | 2890 | struct task_struct *next, |
@@ -2857,34 +2892,24 @@ static void perf_pmu_sched_task(struct task_struct *prev, | |||
2857 | { | 2892 | { |
2858 | struct perf_cpu_context *cpuctx; | 2893 | struct perf_cpu_context *cpuctx; |
2859 | struct pmu *pmu; | 2894 | struct pmu *pmu; |
2860 | unsigned long flags; | ||
2861 | 2895 | ||
2862 | if (prev == next) | 2896 | if (prev == next) |
2863 | return; | 2897 | return; |
2864 | 2898 | ||
2865 | local_irq_save(flags); | 2899 | list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) { |
2866 | 2900 | pmu = cpuctx->unique_pmu; /* software PMUs will not have sched_task */ | |
2867 | rcu_read_lock(); | ||
2868 | 2901 | ||
2869 | list_for_each_entry_rcu(pmu, &pmus, entry) { | 2902 | if (WARN_ON_ONCE(!pmu->sched_task)) |
2870 | if (pmu->sched_task) { | 2903 | continue; |
2871 | cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); | ||
2872 | |||
2873 | perf_ctx_lock(cpuctx, cpuctx->task_ctx); | ||
2874 | |||
2875 | perf_pmu_disable(pmu); | ||
2876 | 2904 | ||
2877 | pmu->sched_task(cpuctx->task_ctx, sched_in); | 2905 | perf_ctx_lock(cpuctx, cpuctx->task_ctx); |
2906 | perf_pmu_disable(pmu); | ||
2878 | 2907 | ||
2879 | perf_pmu_enable(pmu); | 2908 | pmu->sched_task(cpuctx->task_ctx, sched_in); |
2880 | 2909 | ||
2881 | perf_ctx_unlock(cpuctx, cpuctx->task_ctx); | 2910 | perf_pmu_enable(pmu); |
2882 | } | 2911 | perf_ctx_unlock(cpuctx, cpuctx->task_ctx); |
2883 | } | 2912 | } |
2884 | |||
2885 | rcu_read_unlock(); | ||
2886 | |||
2887 | local_irq_restore(flags); | ||
2888 | } | 2913 | } |
2889 | 2914 | ||
2890 | static void perf_event_switch(struct task_struct *task, | 2915 | static void perf_event_switch(struct task_struct *task, |
@@ -3416,6 +3441,22 @@ struct perf_read_data { | |||
3416 | int ret; | 3441 | int ret; |
3417 | }; | 3442 | }; |
3418 | 3443 | ||
3444 | static int find_cpu_to_read(struct perf_event *event, int local_cpu) | ||
3445 | { | ||
3446 | int event_cpu = event->oncpu; | ||
3447 | u16 local_pkg, event_pkg; | ||
3448 | |||
3449 | if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) { | ||
3450 | event_pkg = topology_physical_package_id(event_cpu); | ||
3451 | local_pkg = topology_physical_package_id(local_cpu); | ||
3452 | |||
3453 | if (event_pkg == local_pkg) | ||
3454 | return local_cpu; | ||
3455 | } | ||
3456 | |||
3457 | return event_cpu; | ||
3458 | } | ||
3459 | |||
3419 | /* | 3460 | /* |
3420 | * Cross CPU call to read the hardware event | 3461 | * Cross CPU call to read the hardware event |
3421 | */ | 3462 | */ |
@@ -3537,7 +3578,7 @@ u64 perf_event_read_local(struct perf_event *event) | |||
3537 | 3578 | ||
3538 | static int perf_event_read(struct perf_event *event, bool group) | 3579 | static int perf_event_read(struct perf_event *event, bool group) |
3539 | { | 3580 | { |
3540 | int ret = 0; | 3581 | int ret = 0, cpu_to_read, local_cpu; |
3541 | 3582 | ||
3542 | /* | 3583 | /* |
3543 | * If event is enabled and currently active on a CPU, update the | 3584 | * If event is enabled and currently active on a CPU, update the |
@@ -3549,6 +3590,11 @@ static int perf_event_read(struct perf_event *event, bool group) | |||
3549 | .group = group, | 3590 | .group = group, |
3550 | .ret = 0, | 3591 | .ret = 0, |
3551 | }; | 3592 | }; |
3593 | |||
3594 | local_cpu = get_cpu(); | ||
3595 | cpu_to_read = find_cpu_to_read(event, local_cpu); | ||
3596 | put_cpu(); | ||
3597 | |||
3552 | /* | 3598 | /* |
3553 | * Purposely ignore the smp_call_function_single() return | 3599 | * Purposely ignore the smp_call_function_single() return |
3554 | * value. | 3600 | * value. |
@@ -3559,7 +3605,7 @@ static int perf_event_read(struct perf_event *event, bool group) | |||
3559 | * Therefore, either way, we'll have an up-to-date event count | 3605 | * Therefore, either way, we'll have an up-to-date event count |
3560 | * after this. | 3606 | * after this. |
3561 | */ | 3607 | */ |
3562 | (void)smp_call_function_single(event->oncpu, __perf_event_read, &data, 1); | 3608 | (void)smp_call_function_single(cpu_to_read, __perf_event_read, &data, 1); |
3563 | ret = data.ret; | 3609 | ret = data.ret; |
3564 | } else if (event->state == PERF_EVENT_STATE_INACTIVE) { | 3610 | } else if (event->state == PERF_EVENT_STATE_INACTIVE) { |
3565 | struct perf_event_context *ctx = event->ctx; | 3611 | struct perf_event_context *ctx = event->ctx; |
@@ -3929,7 +3975,7 @@ static void exclusive_event_destroy(struct perf_event *event) | |||
3929 | 3975 | ||
3930 | static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2) | 3976 | static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2) |
3931 | { | 3977 | { |
3932 | if ((e1->pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && | 3978 | if ((e1->pmu == e2->pmu) && |
3933 | (e1->cpu == e2->cpu || | 3979 | (e1->cpu == e2->cpu || |
3934 | e1->cpu == -1 || | 3980 | e1->cpu == -1 || |
3935 | e2->cpu == -1)) | 3981 | e2->cpu == -1)) |
@@ -5350,9 +5396,10 @@ perf_output_sample_regs(struct perf_output_handle *handle, | |||
5350 | struct pt_regs *regs, u64 mask) | 5396 | struct pt_regs *regs, u64 mask) |
5351 | { | 5397 | { |
5352 | int bit; | 5398 | int bit; |
5399 | DECLARE_BITMAP(_mask, 64); | ||
5353 | 5400 | ||
5354 | for_each_set_bit(bit, (const unsigned long *) &mask, | 5401 | bitmap_from_u64(_mask, mask); |
5355 | sizeof(mask) * BITS_PER_BYTE) { | 5402 | for_each_set_bit(bit, _mask, sizeof(mask) * BITS_PER_BYTE) { |
5356 | u64 val; | 5403 | u64 val; |
5357 | 5404 | ||
5358 | val = perf_reg_value(regs, bit); | 5405 | val = perf_reg_value(regs, bit); |
@@ -7045,11 +7092,11 @@ static int __perf_event_overflow(struct perf_event *event, | |||
7045 | if (events && atomic_dec_and_test(&event->event_limit)) { | 7092 | if (events && atomic_dec_and_test(&event->event_limit)) { |
7046 | ret = 1; | 7093 | ret = 1; |
7047 | event->pending_kill = POLL_HUP; | 7094 | event->pending_kill = POLL_HUP; |
7048 | event->pending_disable = 1; | 7095 | |
7049 | irq_work_queue(&event->pending); | 7096 | perf_event_disable_inatomic(event); |
7050 | } | 7097 | } |
7051 | 7098 | ||
7052 | event->overflow_handler(event, data, regs); | 7099 | READ_ONCE(event->overflow_handler)(event, data, regs); |
7053 | 7100 | ||
7054 | if (*perf_event_fasync(event) && event->pending_kill) { | 7101 | if (*perf_event_fasync(event) && event->pending_kill) { |
7055 | event->pending_wakeup = 1; | 7102 | event->pending_wakeup = 1; |
@@ -7664,11 +7711,83 @@ static void perf_event_free_filter(struct perf_event *event) | |||
7664 | ftrace_profile_free_filter(event); | 7711 | ftrace_profile_free_filter(event); |
7665 | } | 7712 | } |
7666 | 7713 | ||
7714 | #ifdef CONFIG_BPF_SYSCALL | ||
7715 | static void bpf_overflow_handler(struct perf_event *event, | ||
7716 | struct perf_sample_data *data, | ||
7717 | struct pt_regs *regs) | ||
7718 | { | ||
7719 | struct bpf_perf_event_data_kern ctx = { | ||
7720 | .data = data, | ||
7721 | .regs = regs, | ||
7722 | }; | ||
7723 | int ret = 0; | ||
7724 | |||
7725 | preempt_disable(); | ||
7726 | if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) | ||
7727 | goto out; | ||
7728 | rcu_read_lock(); | ||
7729 | ret = BPF_PROG_RUN(event->prog, (void *)&ctx); | ||
7730 | rcu_read_unlock(); | ||
7731 | out: | ||
7732 | __this_cpu_dec(bpf_prog_active); | ||
7733 | preempt_enable(); | ||
7734 | if (!ret) | ||
7735 | return; | ||
7736 | |||
7737 | event->orig_overflow_handler(event, data, regs); | ||
7738 | } | ||
7739 | |||
7740 | static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd) | ||
7741 | { | ||
7742 | struct bpf_prog *prog; | ||
7743 | |||
7744 | if (event->overflow_handler_context) | ||
7745 | /* hw breakpoint or kernel counter */ | ||
7746 | return -EINVAL; | ||
7747 | |||
7748 | if (event->prog) | ||
7749 | return -EEXIST; | ||
7750 | |||
7751 | prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT); | ||
7752 | if (IS_ERR(prog)) | ||
7753 | return PTR_ERR(prog); | ||
7754 | |||
7755 | event->prog = prog; | ||
7756 | event->orig_overflow_handler = READ_ONCE(event->overflow_handler); | ||
7757 | WRITE_ONCE(event->overflow_handler, bpf_overflow_handler); | ||
7758 | return 0; | ||
7759 | } | ||
7760 | |||
7761 | static void perf_event_free_bpf_handler(struct perf_event *event) | ||
7762 | { | ||
7763 | struct bpf_prog *prog = event->prog; | ||
7764 | |||
7765 | if (!prog) | ||
7766 | return; | ||
7767 | |||
7768 | WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler); | ||
7769 | event->prog = NULL; | ||
7770 | bpf_prog_put(prog); | ||
7771 | } | ||
7772 | #else | ||
7773 | static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd) | ||
7774 | { | ||
7775 | return -EOPNOTSUPP; | ||
7776 | } | ||
7777 | static void perf_event_free_bpf_handler(struct perf_event *event) | ||
7778 | { | ||
7779 | } | ||
7780 | #endif | ||
7781 | |||
7667 | static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) | 7782 | static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) |
7668 | { | 7783 | { |
7669 | bool is_kprobe, is_tracepoint; | 7784 | bool is_kprobe, is_tracepoint; |
7670 | struct bpf_prog *prog; | 7785 | struct bpf_prog *prog; |
7671 | 7786 | ||
7787 | if (event->attr.type == PERF_TYPE_HARDWARE || | ||
7788 | event->attr.type == PERF_TYPE_SOFTWARE) | ||
7789 | return perf_event_set_bpf_handler(event, prog_fd); | ||
7790 | |||
7672 | if (event->attr.type != PERF_TYPE_TRACEPOINT) | 7791 | if (event->attr.type != PERF_TYPE_TRACEPOINT) |
7673 | return -EINVAL; | 7792 | return -EINVAL; |
7674 | 7793 | ||
@@ -7709,6 +7828,8 @@ static void perf_event_free_bpf_prog(struct perf_event *event) | |||
7709 | { | 7828 | { |
7710 | struct bpf_prog *prog; | 7829 | struct bpf_prog *prog; |
7711 | 7830 | ||
7831 | perf_event_free_bpf_handler(event); | ||
7832 | |||
7712 | if (!event->tp_event) | 7833 | if (!event->tp_event) |
7713 | return; | 7834 | return; |
7714 | 7835 | ||
@@ -7908,6 +8029,7 @@ restart: | |||
7908 | * if <size> is not specified, the range is treated as a single address. | 8029 | * if <size> is not specified, the range is treated as a single address. |
7909 | */ | 8030 | */ |
7910 | enum { | 8031 | enum { |
8032 | IF_ACT_NONE = -1, | ||
7911 | IF_ACT_FILTER, | 8033 | IF_ACT_FILTER, |
7912 | IF_ACT_START, | 8034 | IF_ACT_START, |
7913 | IF_ACT_STOP, | 8035 | IF_ACT_STOP, |
@@ -7931,6 +8053,7 @@ static const match_table_t if_tokens = { | |||
7931 | { IF_SRC_KERNEL, "%u/%u" }, | 8053 | { IF_SRC_KERNEL, "%u/%u" }, |
7932 | { IF_SRC_FILEADDR, "%u@%s" }, | 8054 | { IF_SRC_FILEADDR, "%u@%s" }, |
7933 | { IF_SRC_KERNELADDR, "%u" }, | 8055 | { IF_SRC_KERNELADDR, "%u" }, |
8056 | { IF_ACT_NONE, NULL }, | ||
7934 | }; | 8057 | }; |
7935 | 8058 | ||
7936 | /* | 8059 | /* |
@@ -8751,7 +8874,10 @@ EXPORT_SYMBOL_GPL(perf_pmu_register); | |||
8751 | 8874 | ||
8752 | void perf_pmu_unregister(struct pmu *pmu) | 8875 | void perf_pmu_unregister(struct pmu *pmu) |
8753 | { | 8876 | { |
8877 | int remove_device; | ||
8878 | |||
8754 | mutex_lock(&pmus_lock); | 8879 | mutex_lock(&pmus_lock); |
8880 | remove_device = pmu_bus_running; | ||
8755 | list_del_rcu(&pmu->entry); | 8881 | list_del_rcu(&pmu->entry); |
8756 | mutex_unlock(&pmus_lock); | 8882 | mutex_unlock(&pmus_lock); |
8757 | 8883 | ||
@@ -8765,10 +8891,12 @@ void perf_pmu_unregister(struct pmu *pmu) | |||
8765 | free_percpu(pmu->pmu_disable_count); | 8891 | free_percpu(pmu->pmu_disable_count); |
8766 | if (pmu->type >= PERF_TYPE_MAX) | 8892 | if (pmu->type >= PERF_TYPE_MAX) |
8767 | idr_remove(&pmu_idr, pmu->type); | 8893 | idr_remove(&pmu_idr, pmu->type); |
8768 | if (pmu->nr_addr_filters) | 8894 | if (remove_device) { |
8769 | device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); | 8895 | if (pmu->nr_addr_filters) |
8770 | device_del(pmu->dev); | 8896 | device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); |
8771 | put_device(pmu->dev); | 8897 | device_del(pmu->dev); |
8898 | put_device(pmu->dev); | ||
8899 | } | ||
8772 | free_pmu_context(pmu); | 8900 | free_pmu_context(pmu); |
8773 | } | 8901 | } |
8774 | EXPORT_SYMBOL_GPL(perf_pmu_unregister); | 8902 | EXPORT_SYMBOL_GPL(perf_pmu_unregister); |
@@ -9025,6 +9153,19 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, | |||
9025 | if (!overflow_handler && parent_event) { | 9153 | if (!overflow_handler && parent_event) { |
9026 | overflow_handler = parent_event->overflow_handler; | 9154 | overflow_handler = parent_event->overflow_handler; |
9027 | context = parent_event->overflow_handler_context; | 9155 | context = parent_event->overflow_handler_context; |
9156 | #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING) | ||
9157 | if (overflow_handler == bpf_overflow_handler) { | ||
9158 | struct bpf_prog *prog = bpf_prog_inc(parent_event->prog); | ||
9159 | |||
9160 | if (IS_ERR(prog)) { | ||
9161 | err = PTR_ERR(prog); | ||
9162 | goto err_ns; | ||
9163 | } | ||
9164 | event->prog = prog; | ||
9165 | event->orig_overflow_handler = | ||
9166 | parent_event->orig_overflow_handler; | ||
9167 | } | ||
9168 | #endif | ||
9028 | } | 9169 | } |
9029 | 9170 | ||
9030 | if (overflow_handler) { | 9171 | if (overflow_handler) { |
@@ -9505,6 +9646,9 @@ SYSCALL_DEFINE5(perf_event_open, | |||
9505 | goto err_alloc; | 9646 | goto err_alloc; |
9506 | } | 9647 | } |
9507 | 9648 | ||
9649 | if (pmu->task_ctx_nr == perf_sw_context) | ||
9650 | event->event_caps |= PERF_EV_CAP_SOFTWARE; | ||
9651 | |||
9508 | if (group_leader && | 9652 | if (group_leader && |
9509 | (is_software_event(event) != is_software_event(group_leader))) { | 9653 | (is_software_event(event) != is_software_event(group_leader))) { |
9510 | if (is_software_event(event)) { | 9654 | if (is_software_event(event)) { |
@@ -9518,7 +9662,7 @@ SYSCALL_DEFINE5(perf_event_open, | |||
9518 | */ | 9662 | */ |
9519 | pmu = group_leader->pmu; | 9663 | pmu = group_leader->pmu; |
9520 | } else if (is_software_event(group_leader) && | 9664 | } else if (is_software_event(group_leader) && |
9521 | (group_leader->group_flags & PERF_GROUP_SOFTWARE)) { | 9665 | (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) { |
9522 | /* | 9666 | /* |
9523 | * In case the group is a pure software group, and we | 9667 | * In case the group is a pure software group, and we |
9524 | * try to add a hardware event, move the whole group to | 9668 | * try to add a hardware event, move the whole group to |
@@ -10453,6 +10597,8 @@ static void __init perf_event_init_all_cpus(void) | |||
10453 | 10597 | ||
10454 | INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu)); | 10598 | INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu)); |
10455 | raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu)); | 10599 | raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu)); |
10600 | |||
10601 | INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu)); | ||
10456 | } | 10602 | } |
10457 | } | 10603 | } |
10458 | 10604 | ||
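
Besides the sched_cb list and group_caps rework, the perf changes above let a BPF program be attached as an overflow handler to PERF_TYPE_HARDWARE and PERF_TYPE_SOFTWARE events (previously only tracepoint events were accepted by perf_event_set_bpf_prog()). The user-space sketch below shows the calling convention as I read it from the hunks; it assumes the program fd comes from an earlier bpf(BPF_PROG_LOAD, ...) of type BPF_PROG_TYPE_PERF_EVENT, which this diff does not show.

/*
 * User-space sketch (not part of the patch): attach an already-loaded
 * BPF_PROG_TYPE_PERF_EVENT program to a hardware sampling event.  The
 * kernel side above wraps the event's overflow handler so the program
 * runs first; returning 0 from the program suppresses the original
 * handler.
 */
#include <linux/perf_event.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

static int attach_bpf_to_cycles(int bpf_prog_fd)
{
	struct perf_event_attr attr;
	int event_fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;		/* now reaches the BPF path */
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.sample_period = 100000;

	event_fd = syscall(__NR_perf_event_open, &attr,
			   0 /* this task */, -1 /* any CPU */,
			   -1 /* no group */, 0 /* no flags */);
	if (event_fd < 0)
		return -1;

	if (ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, bpf_prog_fd) < 0) {
		close(event_fd);
		return -1;
	}
	return event_fd;
}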
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 8c50276b60d1..f9ec9add2164 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c | |||
@@ -150,7 +150,7 @@ static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr) | |||
150 | * Returns 0 on success, -EFAULT on failure. | 150 | * Returns 0 on success, -EFAULT on failure. |
151 | */ | 151 | */ |
152 | static int __replace_page(struct vm_area_struct *vma, unsigned long addr, | 152 | static int __replace_page(struct vm_area_struct *vma, unsigned long addr, |
153 | struct page *page, struct page *kpage) | 153 | struct page *old_page, struct page *new_page) |
154 | { | 154 | { |
155 | struct mm_struct *mm = vma->vm_mm; | 155 | struct mm_struct *mm = vma->vm_mm; |
156 | spinlock_t *ptl; | 156 | spinlock_t *ptl; |
@@ -161,49 +161,49 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, | |||
161 | const unsigned long mmun_end = addr + PAGE_SIZE; | 161 | const unsigned long mmun_end = addr + PAGE_SIZE; |
162 | struct mem_cgroup *memcg; | 162 | struct mem_cgroup *memcg; |
163 | 163 | ||
164 | err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg, | 164 | err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, &memcg, |
165 | false); | 165 | false); |
166 | if (err) | 166 | if (err) |
167 | return err; | 167 | return err; |
168 | 168 | ||
169 | /* For try_to_free_swap() and munlock_vma_page() below */ | 169 | /* For try_to_free_swap() and munlock_vma_page() below */ |
170 | lock_page(page); | 170 | lock_page(old_page); |
171 | 171 | ||
172 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | 172 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); |
173 | err = -EAGAIN; | 173 | err = -EAGAIN; |
174 | ptep = page_check_address(page, mm, addr, &ptl, 0); | 174 | ptep = page_check_address(old_page, mm, addr, &ptl, 0); |
175 | if (!ptep) { | 175 | if (!ptep) { |
176 | mem_cgroup_cancel_charge(kpage, memcg, false); | 176 | mem_cgroup_cancel_charge(new_page, memcg, false); |
177 | goto unlock; | 177 | goto unlock; |
178 | } | 178 | } |
179 | 179 | ||
180 | get_page(kpage); | 180 | get_page(new_page); |
181 | page_add_new_anon_rmap(kpage, vma, addr, false); | 181 | page_add_new_anon_rmap(new_page, vma, addr, false); |
182 | mem_cgroup_commit_charge(kpage, memcg, false, false); | 182 | mem_cgroup_commit_charge(new_page, memcg, false, false); |
183 | lru_cache_add_active_or_unevictable(kpage, vma); | 183 | lru_cache_add_active_or_unevictable(new_page, vma); |
184 | 184 | ||
185 | if (!PageAnon(page)) { | 185 | if (!PageAnon(old_page)) { |
186 | dec_mm_counter(mm, mm_counter_file(page)); | 186 | dec_mm_counter(mm, mm_counter_file(old_page)); |
187 | inc_mm_counter(mm, MM_ANONPAGES); | 187 | inc_mm_counter(mm, MM_ANONPAGES); |
188 | } | 188 | } |
189 | 189 | ||
190 | flush_cache_page(vma, addr, pte_pfn(*ptep)); | 190 | flush_cache_page(vma, addr, pte_pfn(*ptep)); |
191 | ptep_clear_flush_notify(vma, addr, ptep); | 191 | ptep_clear_flush_notify(vma, addr, ptep); |
192 | set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); | 192 | set_pte_at_notify(mm, addr, ptep, mk_pte(new_page, vma->vm_page_prot)); |
193 | 193 | ||
194 | page_remove_rmap(page, false); | 194 | page_remove_rmap(old_page, false); |
195 | if (!page_mapped(page)) | 195 | if (!page_mapped(old_page)) |
196 | try_to_free_swap(page); | 196 | try_to_free_swap(old_page); |
197 | pte_unmap_unlock(ptep, ptl); | 197 | pte_unmap_unlock(ptep, ptl); |
198 | 198 | ||
199 | if (vma->vm_flags & VM_LOCKED) | 199 | if (vma->vm_flags & VM_LOCKED) |
200 | munlock_vma_page(page); | 200 | munlock_vma_page(old_page); |
201 | put_page(page); | 201 | put_page(old_page); |
202 | 202 | ||
203 | err = 0; | 203 | err = 0; |
204 | unlock: | 204 | unlock: |
205 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 205 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
206 | unlock_page(page); | 206 | unlock_page(old_page); |
207 | return err; | 207 | return err; |
208 | } | 208 | } |
209 | 209 | ||
@@ -300,7 +300,8 @@ int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, | |||
300 | 300 | ||
301 | retry: | 301 | retry: |
302 | /* Read the page with vaddr into memory */ | 302 | /* Read the page with vaddr into memory */ |
303 | ret = get_user_pages_remote(NULL, mm, vaddr, 1, 0, 1, &old_page, &vma); | 303 | ret = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &old_page, |
304 | &vma); | ||
304 | if (ret <= 0) | 305 | if (ret <= 0) |
305 | return ret; | 306 | return ret; |
306 | 307 | ||
@@ -1710,7 +1711,8 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) | |||
1710 | * but we treat this as a 'remote' access since it is | 1711 | * but we treat this as a 'remote' access since it is |
1711 | * essentially a kernel access to the memory. | 1712 | * essentially a kernel access to the memory. |
1712 | */ | 1713 | */ |
1713 | result = get_user_pages_remote(NULL, mm, vaddr, 1, 0, 1, &page, NULL); | 1714 | result = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &page, |
1715 | NULL); | ||
1714 | if (result < 0) | 1716 | if (result < 0) |
1715 | return result; | 1717 | return result; |
1716 | 1718 | ||
diff --git a/kernel/exit.c b/kernel/exit.c index 091a78be3b09..3076f3089919 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -511,7 +511,7 @@ static void exit_mm(struct task_struct *tsk) | |||
511 | mm_update_next_owner(mm); | 511 | mm_update_next_owner(mm); |
512 | mmput(mm); | 512 | mmput(mm); |
513 | if (test_thread_flag(TIF_MEMDIE)) | 513 | if (test_thread_flag(TIF_MEMDIE)) |
514 | exit_oom_victim(tsk); | 514 | exit_oom_victim(); |
515 | } | 515 | } |
516 | 516 | ||
517 | static struct task_struct *find_alive_thread(struct task_struct *p) | 517 | static struct task_struct *find_alive_thread(struct task_struct *p) |
@@ -725,7 +725,7 @@ static void check_stack_usage(void) | |||
725 | static inline void check_stack_usage(void) {} | 725 | static inline void check_stack_usage(void) {} |
726 | #endif | 726 | #endif |
727 | 727 | ||
728 | void do_exit(long code) | 728 | void __noreturn do_exit(long code) |
729 | { | 729 | { |
730 | struct task_struct *tsk = current; | 730 | struct task_struct *tsk = current; |
731 | int group_dead; | 731 | int group_dead; |
@@ -836,6 +836,7 @@ void do_exit(long code) | |||
836 | */ | 836 | */ |
837 | perf_event_exit_task(tsk); | 837 | perf_event_exit_task(tsk); |
838 | 838 | ||
839 | sched_autogroup_exit_task(tsk); | ||
839 | cgroup_exit(tsk); | 840 | cgroup_exit(tsk); |
840 | 841 | ||
841 | /* | 842 | /* |
@@ -882,29 +883,7 @@ void do_exit(long code) | |||
882 | exit_rcu(); | 883 | exit_rcu(); |
883 | TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i)); | 884 | TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i)); |
884 | 885 | ||
885 | /* | 886 | do_task_dead(); |
886 | * The setting of TASK_RUNNING by try_to_wake_up() may be delayed | ||
887 | * when the following two conditions become true. | ||
888 | * - There is a race condition on mmap_sem (it is acquired by | ||
889 | * exit_mm()), and | ||
890 | * - SMI occurs before setting TASK_RUNNING. | ||
891 | * (or the hypervisor of a virtual machine switches to another guest) | ||
892 | * As a result, we may become TASK_RUNNING after becoming TASK_DEAD | ||
893 | * | ||
894 | * To avoid it, we have to wait for releasing tsk->pi_lock which | ||
895 | * is held by try_to_wake_up() | ||
896 | */ | ||
897 | smp_mb(); | ||
898 | raw_spin_unlock_wait(&tsk->pi_lock); | ||
899 | |||
900 | /* causes final put_task_struct in finish_task_switch(). */ | ||
901 | tsk->state = TASK_DEAD; | ||
902 | tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */ | ||
903 | schedule(); | ||
904 | BUG(); | ||
905 | /* Avoid "noreturn function does return". */ | ||
906 | for (;;) | ||
907 | cpu_relax(); /* For when BUG is null */ | ||
908 | } | 887 | } |
909 | EXPORT_SYMBOL_GPL(do_exit); | 888 | EXPORT_SYMBOL_GPL(do_exit); |
910 | 889 | ||
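
The open-coded TASK_DEAD sequence deleted above is replaced by a single do_task_dead() call, and do_exit() is now marked __noreturn. The helper itself is outside this file and not part of this hunk; the sketch below reconstructs what it would contain purely from the deleted lines, so its exact body and placement on the scheduler side are assumptions.

/*
 * Sketch of the do_task_dead() helper now called at the end of do_exit(),
 * reconstructed from the lines removed above (the real implementation
 * lives on the scheduler side and may differ in detail).
 */
#include <linux/sched.h>

void __noreturn do_task_dead(void)
{
	/*
	 * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
	 * when both of the following hold:
	 *  - there is a race on mmap_sem (it is acquired by exit_mm()), and
	 *  - an SMI occurs (or the hypervisor switches to another guest)
	 *    before TASK_RUNNING is set.
	 * We could then become TASK_RUNNING after becoming TASK_DEAD, so
	 * wait until try_to_wake_up() has released tsk->pi_lock.
	 */
	smp_mb();
	raw_spin_unlock_wait(&current->pi_lock);

	/* causes the final put_task_struct() in finish_task_switch() */
	__set_current_state(TASK_DEAD);
	current->flags |= PF_NOFREEZE;	/* tell the freezer to ignore us */
	schedule();
	BUG();
	/* avoid "noreturn function does return" */
	for (;;)
		cpu_relax();		/* for when BUG() is a no-op */
}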
diff --git a/kernel/fork.c b/kernel/fork.c index beb31725f7e2..997ac1d584f7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -158,19 +158,83 @@ void __weak arch_release_thread_stack(unsigned long *stack) | |||
158 | * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a | 158 | * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a |
159 | * kmemcache based allocator. | 159 | * kmemcache based allocator. |
160 | */ | 160 | */ |
161 | # if THREAD_SIZE >= PAGE_SIZE | 161 | # if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) |
162 | static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, | 162 | |
163 | int node) | 163 | #ifdef CONFIG_VMAP_STACK |
164 | /* | ||
165 | * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB | ||
166 | * flush. Try to minimize the number of calls by caching stacks. | ||
167 | */ | ||
168 | #define NR_CACHED_STACKS 2 | ||
169 | static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]); | ||
170 | #endif | ||
171 | |||
172 | static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) | ||
164 | { | 173 | { |
174 | #ifdef CONFIG_VMAP_STACK | ||
175 | void *stack; | ||
176 | int i; | ||
177 | |||
178 | local_irq_disable(); | ||
179 | for (i = 0; i < NR_CACHED_STACKS; i++) { | ||
180 | struct vm_struct *s = this_cpu_read(cached_stacks[i]); | ||
181 | |||
182 | if (!s) | ||
183 | continue; | ||
184 | this_cpu_write(cached_stacks[i], NULL); | ||
185 | |||
186 | tsk->stack_vm_area = s; | ||
187 | local_irq_enable(); | ||
188 | return s->addr; | ||
189 | } | ||
190 | local_irq_enable(); | ||
191 | |||
192 | stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE, | ||
193 | VMALLOC_START, VMALLOC_END, | ||
194 | THREADINFO_GFP | __GFP_HIGHMEM, | ||
195 | PAGE_KERNEL, | ||
196 | 0, node, __builtin_return_address(0)); | ||
197 | |||
198 | /* | ||
199 | * We can't call find_vm_area() in interrupt context, and | ||
200 | * free_thread_stack() can be called in interrupt context, | ||
201 | * so cache the vm_struct. | ||
202 | */ | ||
203 | if (stack) | ||
204 | tsk->stack_vm_area = find_vm_area(stack); | ||
205 | return stack; | ||
206 | #else | ||
165 | struct page *page = alloc_pages_node(node, THREADINFO_GFP, | 207 | struct page *page = alloc_pages_node(node, THREADINFO_GFP, |
166 | THREAD_SIZE_ORDER); | 208 | THREAD_SIZE_ORDER); |
167 | 209 | ||
168 | return page ? page_address(page) : NULL; | 210 | return page ? page_address(page) : NULL; |
211 | #endif | ||
169 | } | 212 | } |
170 | 213 | ||
171 | static inline void free_thread_stack(unsigned long *stack) | 214 | static inline void free_thread_stack(struct task_struct *tsk) |
172 | { | 215 | { |
173 | __free_pages(virt_to_page(stack), THREAD_SIZE_ORDER); | 216 | #ifdef CONFIG_VMAP_STACK |
217 | if (task_stack_vm_area(tsk)) { | ||
218 | unsigned long flags; | ||
219 | int i; | ||
220 | |||
221 | local_irq_save(flags); | ||
222 | for (i = 0; i < NR_CACHED_STACKS; i++) { | ||
223 | if (this_cpu_read(cached_stacks[i])) | ||
224 | continue; | ||
225 | |||
226 | this_cpu_write(cached_stacks[i], tsk->stack_vm_area); | ||
227 | local_irq_restore(flags); | ||
228 | return; | ||
229 | } | ||
230 | local_irq_restore(flags); | ||
231 | |||
232 | vfree(tsk->stack); | ||
233 | return; | ||
234 | } | ||
235 | #endif | ||
236 | |||
237 | __free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER); | ||
174 | } | 238 | } |
175 | # else | 239 | # else |
176 | static struct kmem_cache *thread_stack_cache; | 240 | static struct kmem_cache *thread_stack_cache; |
@@ -181,9 +245,9 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, | |||
181 | return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node); | 245 | return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node); |
182 | } | 246 | } |
183 | 247 | ||
184 | static void free_thread_stack(unsigned long *stack) | 248 | static void free_thread_stack(struct task_struct *tsk) |
185 | { | 249 | { |
186 | kmem_cache_free(thread_stack_cache, stack); | 250 | kmem_cache_free(thread_stack_cache, tsk->stack); |
187 | } | 251 | } |
188 | 252 | ||
189 | void thread_stack_cache_init(void) | 253 | void thread_stack_cache_init(void) |
@@ -213,24 +277,79 @@ struct kmem_cache *vm_area_cachep; | |||
213 | /* SLAB cache for mm_struct structures (tsk->mm) */ | 277 | /* SLAB cache for mm_struct structures (tsk->mm) */ |
214 | static struct kmem_cache *mm_cachep; | 278 | static struct kmem_cache *mm_cachep; |
215 | 279 | ||
216 | static void account_kernel_stack(unsigned long *stack, int account) | 280 | static void account_kernel_stack(struct task_struct *tsk, int account) |
281 | { | ||
282 | void *stack = task_stack_page(tsk); | ||
283 | struct vm_struct *vm = task_stack_vm_area(tsk); | ||
284 | |||
285 | BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0); | ||
286 | |||
287 | if (vm) { | ||
288 | int i; | ||
289 | |||
290 | BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE); | ||
291 | |||
292 | for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { | ||
293 | mod_zone_page_state(page_zone(vm->pages[i]), | ||
294 | NR_KERNEL_STACK_KB, | ||
295 | PAGE_SIZE / 1024 * account); | ||
296 | } | ||
297 | |||
298 | /* All stack pages belong to the same memcg. */ | ||
299 | memcg_kmem_update_page_stat(vm->pages[0], MEMCG_KERNEL_STACK_KB, | ||
300 | account * (THREAD_SIZE / 1024)); | ||
301 | } else { | ||
302 | /* | ||
303 | * All stack pages are in the same zone and belong to the | ||
304 | * same memcg. | ||
305 | */ | ||
306 | struct page *first_page = virt_to_page(stack); | ||
307 | |||
308 | mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB, | ||
309 | THREAD_SIZE / 1024 * account); | ||
310 | |||
311 | memcg_kmem_update_page_stat(first_page, MEMCG_KERNEL_STACK_KB, | ||
312 | account * (THREAD_SIZE / 1024)); | ||
313 | } | ||
314 | } | ||
315 | |||
316 | static void release_task_stack(struct task_struct *tsk) | ||
217 | { | 317 | { |
218 | /* All stack pages are in the same zone and belong to the same memcg. */ | 318 | if (WARN_ON(tsk->state != TASK_DEAD)) |
219 | struct page *first_page = virt_to_page(stack); | 319 | return; /* Better to leak the stack than to free prematurely */ |
220 | 320 | ||
221 | mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB, | 321 | account_kernel_stack(tsk, -1); |
222 | THREAD_SIZE / 1024 * account); | 322 | arch_release_thread_stack(tsk->stack); |
323 | free_thread_stack(tsk); | ||
324 | tsk->stack = NULL; | ||
325 | #ifdef CONFIG_VMAP_STACK | ||
326 | tsk->stack_vm_area = NULL; | ||
327 | #endif | ||
328 | } | ||
223 | 329 | ||
224 | memcg_kmem_update_page_stat( | 330 | #ifdef CONFIG_THREAD_INFO_IN_TASK |
225 | first_page, MEMCG_KERNEL_STACK_KB, | 331 | void put_task_stack(struct task_struct *tsk) |
226 | account * (THREAD_SIZE / 1024)); | 332 | { |
333 | if (atomic_dec_and_test(&tsk->stack_refcount)) | ||
334 | release_task_stack(tsk); | ||
227 | } | 335 | } |
336 | #endif | ||
228 | 337 | ||
229 | void free_task(struct task_struct *tsk) | 338 | void free_task(struct task_struct *tsk) |
230 | { | 339 | { |
231 | account_kernel_stack(tsk->stack, -1); | 340 | #ifndef CONFIG_THREAD_INFO_IN_TASK |
232 | arch_release_thread_stack(tsk->stack); | 341 | /* |
233 | free_thread_stack(tsk->stack); | 342 | * The task is finally done with both the stack and thread_info, |
343 | * so free both. | ||
344 | */ | ||
345 | release_task_stack(tsk); | ||
346 | #else | ||
347 | /* | ||
348 | * If the task had a separate stack allocation, it should be gone | ||
349 | * by now. | ||
350 | */ | ||
351 | WARN_ON_ONCE(atomic_read(&tsk->stack_refcount) != 0); | ||
352 | #endif | ||
234 | rt_mutex_debug_task_free(tsk); | 353 | rt_mutex_debug_task_free(tsk); |
235 | ftrace_graph_exit_task(tsk); | 354 | ftrace_graph_exit_task(tsk); |
236 | put_seccomp_filter(tsk); | 355 | put_seccomp_filter(tsk); |
@@ -243,6 +362,12 @@ static inline void free_signal_struct(struct signal_struct *sig) | |||
243 | { | 362 | { |
244 | taskstats_tgid_free(sig); | 363 | taskstats_tgid_free(sig); |
245 | sched_autogroup_exit(sig); | 364 | sched_autogroup_exit(sig); |
365 | /* | ||
366 | * __mmdrop is not safe to call from softirq context on x86 due to | ||
367 | * pgd_dtor so postpone it to the async context | ||
368 | */ | ||
369 | if (sig->oom_mm) | ||
370 | mmdrop_async(sig->oom_mm); | ||
246 | kmem_cache_free(signal_cachep, sig); | 371 | kmem_cache_free(signal_cachep, sig); |
247 | } | 372 | } |
248 | 373 | ||
@@ -302,6 +427,7 @@ int arch_task_struct_size __read_mostly; | |||
302 | 427 | ||
303 | void __init fork_init(void) | 428 | void __init fork_init(void) |
304 | { | 429 | { |
430 | int i; | ||
305 | #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR | 431 | #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR |
306 | #ifndef ARCH_MIN_TASKALIGN | 432 | #ifndef ARCH_MIN_TASKALIGN |
307 | #define ARCH_MIN_TASKALIGN L1_CACHE_BYTES | 433 | #define ARCH_MIN_TASKALIGN L1_CACHE_BYTES |
@@ -321,6 +447,10 @@ void __init fork_init(void) | |||
321 | init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2; | 447 | init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2; |
322 | init_task.signal->rlim[RLIMIT_SIGPENDING] = | 448 | init_task.signal->rlim[RLIMIT_SIGPENDING] = |
323 | init_task.signal->rlim[RLIMIT_NPROC]; | 449 | init_task.signal->rlim[RLIMIT_NPROC]; |
450 | |||
451 | for (i = 0; i < UCOUNT_COUNTS; i++) { | ||
452 | init_user_ns.ucount_max[i] = max_threads/2; | ||
453 | } | ||
324 | } | 454 | } |
325 | 455 | ||
326 | int __weak arch_dup_task_struct(struct task_struct *dst, | 456 | int __weak arch_dup_task_struct(struct task_struct *dst, |
@@ -342,6 +472,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) | |||
342 | { | 472 | { |
343 | struct task_struct *tsk; | 473 | struct task_struct *tsk; |
344 | unsigned long *stack; | 474 | unsigned long *stack; |
475 | struct vm_struct *stack_vm_area; | ||
345 | int err; | 476 | int err; |
346 | 477 | ||
347 | if (node == NUMA_NO_NODE) | 478 | if (node == NUMA_NO_NODE) |
@@ -354,11 +485,26 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) | |||
354 | if (!stack) | 485 | if (!stack) |
355 | goto free_tsk; | 486 | goto free_tsk; |
356 | 487 | ||
488 | stack_vm_area = task_stack_vm_area(tsk); | ||
489 | |||
357 | err = arch_dup_task_struct(tsk, orig); | 490 | err = arch_dup_task_struct(tsk, orig); |
491 | |||
492 | /* | ||
493 | * arch_dup_task_struct() clobbers the stack-related fields. Make | ||
494 | * sure they're properly initialized before using any stack-related | ||
495 | * functions again. | ||
496 | */ | ||
497 | tsk->stack = stack; | ||
498 | #ifdef CONFIG_VMAP_STACK | ||
499 | tsk->stack_vm_area = stack_vm_area; | ||
500 | #endif | ||
501 | #ifdef CONFIG_THREAD_INFO_IN_TASK | ||
502 | atomic_set(&tsk->stack_refcount, 1); | ||
503 | #endif | ||
504 | |||
358 | if (err) | 505 | if (err) |
359 | goto free_stack; | 506 | goto free_stack; |
360 | 507 | ||
361 | tsk->stack = stack; | ||
362 | #ifdef CONFIG_SECCOMP | 508 | #ifdef CONFIG_SECCOMP |
363 | /* | 509 | /* |
364 | * We must handle setting up seccomp filters once we're under | 510 | * We must handle setting up seccomp filters once we're under |
@@ -390,21 +536,22 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) | |||
390 | tsk->task_frag.page = NULL; | 536 | tsk->task_frag.page = NULL; |
391 | tsk->wake_q.next = NULL; | 537 | tsk->wake_q.next = NULL; |
392 | 538 | ||
393 | account_kernel_stack(stack, 1); | 539 | account_kernel_stack(tsk, 1); |
394 | 540 | ||
395 | kcov_task_init(tsk); | 541 | kcov_task_init(tsk); |
396 | 542 | ||
397 | return tsk; | 543 | return tsk; |
398 | 544 | ||
399 | free_stack: | 545 | free_stack: |
400 | free_thread_stack(stack); | 546 | free_thread_stack(tsk); |
401 | free_tsk: | 547 | free_tsk: |
402 | free_task_struct(tsk); | 548 | free_task_struct(tsk); |
403 | return NULL; | 549 | return NULL; |
404 | } | 550 | } |
405 | 551 | ||
406 | #ifdef CONFIG_MMU | 552 | #ifdef CONFIG_MMU |
407 | static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | 553 | static __latent_entropy int dup_mmap(struct mm_struct *mm, |
554 | struct mm_struct *oldmm) | ||
408 | { | 555 | { |
409 | struct vm_area_struct *mpnt, *tmp, *prev, **pprev; | 556 | struct vm_area_struct *mpnt, *tmp, *prev, **pprev; |
410 | struct rb_node **rb_link, *rb_parent; | 557 | struct rb_node **rb_link, *rb_parent; |
@@ -711,6 +858,7 @@ static inline void __mmput(struct mm_struct *mm) | |||
711 | ksm_exit(mm); | 858 | ksm_exit(mm); |
712 | khugepaged_exit(mm); /* must run before exit_mmap */ | 859 | khugepaged_exit(mm); /* must run before exit_mmap */ |
713 | exit_mmap(mm); | 860 | exit_mmap(mm); |
861 | mm_put_huge_zero_page(mm); | ||
714 | set_mm_exe_file(mm, NULL); | 862 | set_mm_exe_file(mm, NULL); |
715 | if (!list_empty(&mm->mmlist)) { | 863 | if (!list_empty(&mm->mmlist)) { |
716 | spin_lock(&mmlist_lock); | 864 | spin_lock(&mmlist_lock); |
@@ -719,6 +867,7 @@ static inline void __mmput(struct mm_struct *mm) | |||
719 | } | 867 | } |
720 | if (mm->binfmt) | 868 | if (mm->binfmt) |
721 | module_put(mm->binfmt->module); | 869 | module_put(mm->binfmt->module); |
870 | set_bit(MMF_OOM_SKIP, &mm->flags); | ||
722 | mmdrop(mm); | 871 | mmdrop(mm); |
723 | } | 872 | } |
724 | 873 | ||
@@ -1296,7 +1445,8 @@ init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid) | |||
1296 | * parts of the process environment (as per the clone | 1445 | * parts of the process environment (as per the clone |
1297 | * flags). The actual kick-off is left to the caller. | 1446 | * flags). The actual kick-off is left to the caller. |
1298 | */ | 1447 | */ |
1299 | static struct task_struct *copy_process(unsigned long clone_flags, | 1448 | static __latent_entropy struct task_struct *copy_process( |
1449 | unsigned long clone_flags, | ||
1300 | unsigned long stack_start, | 1450 | unsigned long stack_start, |
1301 | unsigned long stack_size, | 1451 | unsigned long stack_size, |
1302 | int __user *child_tidptr, | 1452 | int __user *child_tidptr, |
@@ -1715,6 +1865,8 @@ bad_fork_cleanup_count: | |||
1715 | atomic_dec(&p->cred->user->processes); | 1865 | atomic_dec(&p->cred->user->processes); |
1716 | exit_creds(p); | 1866 | exit_creds(p); |
1717 | bad_fork_free: | 1867 | bad_fork_free: |
1868 | p->state = TASK_DEAD; | ||
1869 | put_task_stack(p); | ||
1718 | free_task(p); | 1870 | free_task(p); |
1719 | fork_out: | 1871 | fork_out: |
1720 | return ERR_PTR(retval); | 1872 | return ERR_PTR(retval); |
@@ -1780,6 +1932,7 @@ long _do_fork(unsigned long clone_flags, | |||
1780 | 1932 | ||
1781 | p = copy_process(clone_flags, stack_start, stack_size, | 1933 | p = copy_process(clone_flags, stack_start, stack_size, |
1782 | child_tidptr, NULL, trace, tls, NUMA_NO_NODE); | 1934 | child_tidptr, NULL, trace, tls, NUMA_NO_NODE); |
1935 | add_latent_entropy(); | ||
1783 | /* | 1936 | /* |
1784 | * Do this prior waking up the new thread - the thread pointer | 1937 | * Do this prior waking up the new thread - the thread pointer |
1785 | * might get invalid after that point, if the thread exits quickly. | 1938 | * might get invalid after that point, if the thread exits quickly. |
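
In the dup_task_struct() change above, the stack and stack_vm_area values are cached before arch_dup_task_struct() runs because, in its generic form, that helper is a wholesale structure copy and overwrites any stack-related fields already set on the new task; the added comment makes the required ordering explicit. A minimal sketch of that save/restore ordering with hypothetical types (not the kernel code):

struct vm_struct;                         /* opaque here */

struct task {
        void *stack;                      /* base of the thread stack */
        struct vm_struct *stack_vm_area;  /* set only for vmap'ed stacks */
        /* ... many other fields ... */
};

/* Stand-in for arch_dup_task_struct(): copies everything, including the
 * stack fields, which then describe the template task, not the new one. */
static int arch_dup_task(struct task *dst, const struct task *src)
{
        *dst = *src;
        return 0;
}

static int dup_task(struct task *dst, const struct task *src,
                    void *new_stack, struct vm_struct *new_vm_area)
{
        int err = arch_dup_task(dst, src);

        /* Restore the fields the wholesale copy just clobbered. */
        dst->stack = new_stack;
        dst->stack_vm_area = new_vm_area;
        return err;
}
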
diff --git a/kernel/futex.c b/kernel/futex.c index 46cb3a301bc1..2c4be467fecd 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
@@ -381,8 +381,12 @@ static inline int hb_waiters_pending(struct futex_hash_bucket *hb) | |||
381 | #endif | 381 | #endif |
382 | } | 382 | } |
383 | 383 | ||
384 | /* | 384 | /** |
385 | * We hash on the keys returned from get_futex_key (see below). | 385 | * hash_futex - Return the hash bucket in the global hash |
386 | * @key: Pointer to the futex key for which the hash is calculated | ||
387 | * | ||
388 | * We hash on the keys returned from get_futex_key (see below) and return the | ||
389 | * corresponding hash bucket in the global hash. | ||
386 | */ | 390 | */ |
387 | static struct futex_hash_bucket *hash_futex(union futex_key *key) | 391 | static struct futex_hash_bucket *hash_futex(union futex_key *key) |
388 | { | 392 | { |
@@ -392,7 +396,12 @@ static struct futex_hash_bucket *hash_futex(union futex_key *key) | |||
392 | return &futex_queues[hash & (futex_hashsize - 1)]; | 396 | return &futex_queues[hash & (futex_hashsize - 1)]; |
393 | } | 397 | } |
394 | 398 | ||
395 | /* | 399 | |
400 | /** | ||
401 | * match_futex - Check whether two futex keys are equal | ||
402 | * @key1: Pointer to key1 | ||
403 | * @key2: Pointer to key2 | ||
404 | * | ||
396 | * Return 1 if two futex_keys are equal, 0 otherwise. | 405 | * Return 1 if two futex_keys are equal, 0 otherwise. |
397 | */ | 406 | */ |
398 | static inline int match_futex(union futex_key *key1, union futex_key *key2) | 407 | static inline int match_futex(union futex_key *key1, union futex_key *key2) |
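
The bucket lookup documented in the futex hunk relies on the hash table size being a power of two (futex_hashsize is rounded up to one at init time), so masking with (futex_hashsize - 1) is equivalent to a modulo and always yields a valid index. The same indexing pattern in isolation, with illustrative names only:

#include <stdint.h>

#define NBUCKETS 256u                     /* must be a power of two */

struct bucket { int dummy; };
static struct bucket table[NBUCKETS];

static struct bucket *pick_bucket(uint32_t hash)
{
        /* hash & (size - 1) equals hash % size only for power-of-two sizes */
        return &table[hash & (NBUCKETS - 1)];
}
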
diff --git a/kernel/groups.c b/kernel/groups.c index 74d431d25251..2fcadd66a8fd 100644 --- a/kernel/groups.c +++ b/kernel/groups.c | |||
@@ -7,55 +7,31 @@ | |||
7 | #include <linux/security.h> | 7 | #include <linux/security.h> |
8 | #include <linux/syscalls.h> | 8 | #include <linux/syscalls.h> |
9 | #include <linux/user_namespace.h> | 9 | #include <linux/user_namespace.h> |
10 | #include <linux/vmalloc.h> | ||
10 | #include <asm/uaccess.h> | 11 | #include <asm/uaccess.h> |
11 | 12 | ||
12 | struct group_info *groups_alloc(int gidsetsize) | 13 | struct group_info *groups_alloc(int gidsetsize) |
13 | { | 14 | { |
14 | struct group_info *group_info; | 15 | struct group_info *gi; |
15 | int nblocks; | 16 | unsigned int len; |
16 | int i; | 17 | |
17 | 18 | len = sizeof(struct group_info) + sizeof(kgid_t) * gidsetsize; | |
18 | nblocks = (gidsetsize + NGROUPS_PER_BLOCK - 1) / NGROUPS_PER_BLOCK; | 19 | gi = kmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_NOWARN|__GFP_NORETRY); |
19 | /* Make sure we always allocate at least one indirect block pointer */ | 20 | if (!gi) |
20 | nblocks = nblocks ? : 1; | 21 | gi = __vmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_HIGHMEM, PAGE_KERNEL); |
21 | group_info = kmalloc(sizeof(*group_info) + nblocks*sizeof(gid_t *), GFP_USER); | 22 | if (!gi) |
22 | if (!group_info) | ||
23 | return NULL; | 23 | return NULL; |
24 | group_info->ngroups = gidsetsize; | ||
25 | group_info->nblocks = nblocks; | ||
26 | atomic_set(&group_info->usage, 1); | ||
27 | |||
28 | if (gidsetsize <= NGROUPS_SMALL) | ||
29 | group_info->blocks[0] = group_info->small_block; | ||
30 | else { | ||
31 | for (i = 0; i < nblocks; i++) { | ||
32 | kgid_t *b; | ||
33 | b = (void *)__get_free_page(GFP_USER); | ||
34 | if (!b) | ||
35 | goto out_undo_partial_alloc; | ||
36 | group_info->blocks[i] = b; | ||
37 | } | ||
38 | } | ||
39 | return group_info; | ||
40 | 24 | ||
41 | out_undo_partial_alloc: | 25 | atomic_set(&gi->usage, 1); |
42 | while (--i >= 0) { | 26 | gi->ngroups = gidsetsize; |
43 | free_page((unsigned long)group_info->blocks[i]); | 27 | return gi; |
44 | } | ||
45 | kfree(group_info); | ||
46 | return NULL; | ||
47 | } | 28 | } |
48 | 29 | ||
49 | EXPORT_SYMBOL(groups_alloc); | 30 | EXPORT_SYMBOL(groups_alloc); |
50 | 31 | ||
51 | void groups_free(struct group_info *group_info) | 32 | void groups_free(struct group_info *group_info) |
52 | { | 33 | { |
53 | if (group_info->blocks[0] != group_info->small_block) { | 34 | kvfree(group_info); |
54 | int i; | ||
55 | for (i = 0; i < group_info->nblocks; i++) | ||
56 | free_page((unsigned long)group_info->blocks[i]); | ||
57 | } | ||
58 | kfree(group_info); | ||
59 | } | 35 | } |
60 | 36 | ||
61 | EXPORT_SYMBOL(groups_free); | 37 | EXPORT_SYMBOL(groups_free); |
@@ -70,7 +46,7 @@ static int groups_to_user(gid_t __user *grouplist, | |||
70 | 46 | ||
71 | for (i = 0; i < count; i++) { | 47 | for (i = 0; i < count; i++) { |
72 | gid_t gid; | 48 | gid_t gid; |
73 | gid = from_kgid_munged(user_ns, GROUP_AT(group_info, i)); | 49 | gid = from_kgid_munged(user_ns, group_info->gid[i]); |
74 | if (put_user(gid, grouplist+i)) | 50 | if (put_user(gid, grouplist+i)) |
75 | return -EFAULT; | 51 | return -EFAULT; |
76 | } | 52 | } |
@@ -95,7 +71,7 @@ static int groups_from_user(struct group_info *group_info, | |||
95 | if (!gid_valid(kgid)) | 71 | if (!gid_valid(kgid)) |
96 | return -EINVAL; | 72 | return -EINVAL; |
97 | 73 | ||
98 | GROUP_AT(group_info, i) = kgid; | 74 | group_info->gid[i] = kgid; |
99 | } | 75 | } |
100 | return 0; | 76 | return 0; |
101 | } | 77 | } |
@@ -115,15 +91,14 @@ static void groups_sort(struct group_info *group_info) | |||
115 | for (base = 0; base < max; base++) { | 91 | for (base = 0; base < max; base++) { |
116 | int left = base; | 92 | int left = base; |
117 | int right = left + stride; | 93 | int right = left + stride; |
118 | kgid_t tmp = GROUP_AT(group_info, right); | 94 | kgid_t tmp = group_info->gid[right]; |
119 | 95 | ||
120 | while (left >= 0 && gid_gt(GROUP_AT(group_info, left), tmp)) { | 96 | while (left >= 0 && gid_gt(group_info->gid[left], tmp)) { |
121 | GROUP_AT(group_info, right) = | 97 | group_info->gid[right] = group_info->gid[left]; |
122 | GROUP_AT(group_info, left); | ||
123 | right = left; | 98 | right = left; |
124 | left -= stride; | 99 | left -= stride; |
125 | } | 100 | } |
126 | GROUP_AT(group_info, right) = tmp; | 101 | group_info->gid[right] = tmp; |
127 | } | 102 | } |
128 | stride /= 3; | 103 | stride /= 3; |
129 | } | 104 | } |
@@ -141,9 +116,9 @@ int groups_search(const struct group_info *group_info, kgid_t grp) | |||
141 | right = group_info->ngroups; | 116 | right = group_info->ngroups; |
142 | while (left < right) { | 117 | while (left < right) { |
143 | unsigned int mid = (left+right)/2; | 118 | unsigned int mid = (left+right)/2; |
144 | if (gid_gt(grp, GROUP_AT(group_info, mid))) | 119 | if (gid_gt(grp, group_info->gid[mid])) |
145 | left = mid + 1; | 120 | left = mid + 1; |
146 | else if (gid_lt(grp, GROUP_AT(group_info, mid))) | 121 | else if (gid_lt(grp, group_info->gid[mid])) |
147 | right = mid; | 122 | right = mid; |
148 | else | 123 | else |
149 | return 1; | 124 | return 1; |
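
The rewritten groups_alloc() stores the gids in one flat array and tries a physically contiguous kmalloc() first; __GFP_NOWARN and __GFP_NORETRY make an oversized request fail fast and quietly so the code can fall back to vmalloc-backed memory, and kvfree() in groups_free() releases either kind. A hedged sketch of the same allocation pattern for a generic buffer (GFP_KERNEL rather than GFP_KERNEL_ACCOUNT, and plain vmalloc() rather than __vmalloc()):

#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/mm.h>

static void *alloc_big_buffer(size_t len)
{
        void *p;

        /* Try contiguous memory, but give up quickly and silently. */
        p = kmalloc(len, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);
        if (!p)
                p = vmalloc(len);         /* virtually contiguous fallback */
        return p;
}

static void free_big_buffer(void *p)
{
        kvfree(p);      /* works for both kmalloc'ed and vmalloc'ed memory */
}
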
diff --git a/kernel/hung_task.c b/kernel/hung_task.c index d234022805dc..2b59c82cc3e1 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c | |||
@@ -98,26 +98,26 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) | |||
98 | 98 | ||
99 | trace_sched_process_hang(t); | 99 | trace_sched_process_hang(t); |
100 | 100 | ||
101 | if (!sysctl_hung_task_warnings) | 101 | if (!sysctl_hung_task_warnings && !sysctl_hung_task_panic) |
102 | return; | 102 | return; |
103 | 103 | ||
104 | if (sysctl_hung_task_warnings > 0) | ||
105 | sysctl_hung_task_warnings--; | ||
106 | |||
107 | /* | 104 | /* |
108 | * Ok, the task did not get scheduled for more than 2 minutes, | 105 | * Ok, the task did not get scheduled for more than 2 minutes, |
109 | * complain: | 106 | * complain: |
110 | */ | 107 | */ |
111 | pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n", | 108 | if (sysctl_hung_task_warnings) { |
112 | t->comm, t->pid, timeout); | 109 | sysctl_hung_task_warnings--; |
113 | pr_err(" %s %s %.*s\n", | 110 | pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n", |
114 | print_tainted(), init_utsname()->release, | 111 | t->comm, t->pid, timeout); |
115 | (int)strcspn(init_utsname()->version, " "), | 112 | pr_err(" %s %s %.*s\n", |
116 | init_utsname()->version); | 113 | print_tainted(), init_utsname()->release, |
117 | pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" | 114 | (int)strcspn(init_utsname()->version, " "), |
118 | " disables this message.\n"); | 115 | init_utsname()->version); |
119 | sched_show_task(t); | 116 | pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" |
120 | debug_show_held_locks(t); | 117 | " disables this message.\n"); |
118 | sched_show_task(t); | ||
119 | debug_show_all_locks(); | ||
120 | } | ||
121 | 121 | ||
122 | touch_nmi_watchdog(); | 122 | touch_nmi_watchdog(); |
123 | 123 | ||
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 32f6cfcff212..17f51d63da56 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c | |||
@@ -4,60 +4,151 @@ | |||
4 | #include <linux/slab.h> | 4 | #include <linux/slab.h> |
5 | #include <linux/cpu.h> | 5 | #include <linux/cpu.h> |
6 | 6 | ||
7 | static int get_first_sibling(unsigned int cpu) | 7 | static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk, |
8 | int cpus_per_vec) | ||
8 | { | 9 | { |
9 | unsigned int ret; | 10 | const struct cpumask *siblmsk; |
11 | int cpu, sibl; | ||
10 | 12 | ||
11 | ret = cpumask_first(topology_sibling_cpumask(cpu)); | 13 | for ( ; cpus_per_vec > 0; ) { |
12 | if (ret < nr_cpu_ids) | 14 | cpu = cpumask_first(nmsk); |
13 | return ret; | 15 | |
14 | return cpu; | 16 | /* Should not happen, but I'm too lazy to think about it */ |
17 | if (cpu >= nr_cpu_ids) | ||
18 | return; | ||
19 | |||
20 | cpumask_clear_cpu(cpu, nmsk); | ||
21 | cpumask_set_cpu(cpu, irqmsk); | ||
22 | cpus_per_vec--; | ||
23 | |||
24 | /* If the cpu has siblings, use them first */ | ||
25 | siblmsk = topology_sibling_cpumask(cpu); | ||
26 | for (sibl = -1; cpus_per_vec > 0; ) { | ||
27 | sibl = cpumask_next(sibl, siblmsk); | ||
28 | if (sibl >= nr_cpu_ids) | ||
29 | break; | ||
30 | if (!cpumask_test_and_clear_cpu(sibl, nmsk)) | ||
31 | continue; | ||
32 | cpumask_set_cpu(sibl, irqmsk); | ||
33 | cpus_per_vec--; | ||
34 | } | ||
35 | } | ||
36 | } | ||
37 | |||
38 | static int get_nodes_in_cpumask(const struct cpumask *mask, nodemask_t *nodemsk) | ||
39 | { | ||
40 | int n, nodes; | ||
41 | |||
42 | /* Calculate the number of nodes in the supplied affinity mask */ | ||
43 | for (n = 0, nodes = 0; n < num_online_nodes(); n++) { | ||
44 | if (cpumask_intersects(mask, cpumask_of_node(n))) { | ||
45 | node_set(n, *nodemsk); | ||
46 | nodes++; | ||
47 | } | ||
48 | } | ||
49 | return nodes; | ||
15 | } | 50 | } |
16 | 51 | ||
17 | /* | 52 | /** |
18 | * Take a map of online CPUs and the number of available interrupt vectors | 53 | * irq_create_affinity_masks - Create affinity masks for multiqueue spreading |
19 | * and generate an output cpumask suitable for spreading MSI/MSI-X vectors | 54 | * @affinity: The affinity mask to spread. If NULL cpu_online_mask |
20 | * so that they are distributed as good as possible around the CPUs. If | 55 | * is used |
21 | * more vectors than CPUs are available we'll map one to each CPU, | 56 | * @nvecs: The number of vectors |
22 | * otherwise we map one to the first sibling of each socket. | ||
23 | * | 57 | * |
24 | * If there are more vectors than CPUs we will still only have one bit | 58 | * Returns the masks pointer or NULL if allocation failed. |
25 | * set per CPU, but interrupt code will keep on assigning the vectors from | ||
26 | * the start of the bitmap until we run out of vectors. | ||
27 | */ | 59 | */ |
28 | struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs) | 60 | struct cpumask *irq_create_affinity_masks(const struct cpumask *affinity, |
61 | int nvec) | ||
29 | { | 62 | { |
30 | struct cpumask *affinity_mask; | 63 | int n, nodes, vecs_per_node, cpus_per_vec, extra_vecs, curvec = 0; |
31 | unsigned int max_vecs = *nr_vecs; | 64 | nodemask_t nodemsk = NODE_MASK_NONE; |
65 | struct cpumask *masks; | ||
66 | cpumask_var_t nmsk; | ||
32 | 67 | ||
33 | if (max_vecs == 1) | 68 | if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) |
34 | return NULL; | 69 | return NULL; |
35 | 70 | ||
36 | affinity_mask = kzalloc(cpumask_size(), GFP_KERNEL); | 71 | masks = kzalloc(nvec * sizeof(*masks), GFP_KERNEL); |
37 | if (!affinity_mask) { | 72 | if (!masks) |
38 | *nr_vecs = 1; | 73 | goto out; |
39 | return NULL; | ||
40 | } | ||
41 | 74 | ||
75 | /* Stabilize the cpumasks */ | ||
42 | get_online_cpus(); | 76 | get_online_cpus(); |
43 | if (max_vecs >= num_online_cpus()) { | 77 | /* If the supplied affinity mask is NULL, use cpu online mask */ |
44 | cpumask_copy(affinity_mask, cpu_online_mask); | 78 | if (!affinity) |
45 | *nr_vecs = num_online_cpus(); | 79 | affinity = cpu_online_mask; |
46 | } else { | 80 | |
47 | unsigned int vecs = 0, cpu; | 81 | nodes = get_nodes_in_cpumask(affinity, &nodemsk); |
48 | |||
49 | for_each_online_cpu(cpu) { | ||
50 | if (cpu == get_first_sibling(cpu)) { | ||
51 | cpumask_set_cpu(cpu, affinity_mask); | ||
52 | vecs++; | ||
53 | } | ||
54 | 82 | ||
55 | if (--max_vecs == 0) | 83 | /* |
84 | * If the number of nodes in the mask is less than or equal the | ||
85 | * number of vectors we just spread the vectors across the nodes. | ||
86 | */ | ||
87 | if (nvec <= nodes) { | ||
88 | for_each_node_mask(n, nodemsk) { | ||
89 | cpumask_copy(masks + curvec, cpumask_of_node(n)); | ||
90 | if (++curvec == nvec) | ||
56 | break; | 91 | break; |
57 | } | 92 | } |
58 | *nr_vecs = vecs; | 93 | goto outonl; |
59 | } | 94 | } |
95 | |||
96 | /* Spread the vectors per node */ | ||
97 | vecs_per_node = nvec / nodes; | ||
98 | /* Account for rounding errors */ | ||
99 | extra_vecs = nvec - (nodes * vecs_per_node); | ||
100 | |||
101 | for_each_node_mask(n, nodemsk) { | ||
102 | int ncpus, v, vecs_to_assign = vecs_per_node; | ||
103 | |||
104 | /* Get the cpus on this node which are in the mask */ | ||
105 | cpumask_and(nmsk, affinity, cpumask_of_node(n)); | ||
106 | |||
107 | /* Calculate the number of cpus per vector */ | ||
108 | ncpus = cpumask_weight(nmsk); | ||
109 | |||
110 | for (v = 0; curvec < nvec && v < vecs_to_assign; curvec++, v++) { | ||
111 | cpus_per_vec = ncpus / vecs_to_assign; | ||
112 | |||
113 | /* Account for extra vectors to compensate rounding errors */ | ||
114 | if (extra_vecs) { | ||
115 | cpus_per_vec++; | ||
116 | if (!--extra_vecs) | ||
117 | vecs_per_node++; | ||
118 | } | ||
119 | irq_spread_init_one(masks + curvec, nmsk, cpus_per_vec); | ||
120 | } | ||
121 | |||
122 | if (curvec >= nvec) | ||
123 | break; | ||
124 | } | ||
125 | |||
126 | outonl: | ||
60 | put_online_cpus(); | 127 | put_online_cpus(); |
128 | out: | ||
129 | free_cpumask_var(nmsk); | ||
130 | return masks; | ||
131 | } | ||
132 | |||
133 | /** | ||
134 | * irq_calc_affinity_vectors - Calculate the optimal number of vectors for a given affinity mask | ||

135 | * @affinity: The affinity mask to spread. If NULL cpu_online_mask | ||
136 | * is used | ||
137 | * @maxvec: The maximum number of vectors available | ||
138 | */ | ||
139 | int irq_calc_affinity_vectors(const struct cpumask *affinity, int maxvec) | ||
140 | { | ||
141 | int cpus, ret; | ||
61 | 142 | ||
62 | return affinity_mask; | 143 | /* Stabilize the cpumasks */ |
144 | get_online_cpus(); | ||
145 | /* If the supplied affinity mask is NULL, use cpu online mask */ | ||
146 | if (!affinity) | ||
147 | affinity = cpu_online_mask; | ||
148 | |||
149 | cpus = cpumask_weight(affinity); | ||
150 | ret = (cpus < maxvec) ? cpus : maxvec; | ||
151 | |||
152 | put_online_cpus(); | ||
153 | return ret; | ||
63 | } | 154 | } |
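
irq_create_affinity_masks() above spreads vectors in two steps: when there are at least as many nodes as vectors, each vector simply gets a whole node's cpumask; otherwise nvec is divided by the node count and the remainder means some nodes end up with one vector more than others. The patch folds that compensation into its per-vector cpu accounting; stripped of the cpumask handling, the arithmetic it compensates for looks like this (standalone example, not the kernel loop):

#include <stdio.h>

int main(void)
{
        int nvec = 8, nodes = 3;
        int vecs_per_node = nvec / nodes;                 /* 2 */
        int extra_vecs = nvec - nodes * vecs_per_node;    /* 2 */

        for (int n = 0; n < nodes; n++) {
                int v = vecs_per_node + (n < extra_vecs ? 1 : 0);
                printf("node %d gets %d vectors\n", n, v);
        }
        /* Prints 3, 3 and 2 vectors for nodes 0, 1 and 2: all 8 vectors are
         * used and no node differs from another by more than one. */
        return 0;
}
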
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 637389088b3f..be3c34e4f2ac 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c | |||
@@ -76,7 +76,6 @@ int irq_set_irq_type(unsigned int irq, unsigned int type) | |||
76 | if (!desc) | 76 | if (!desc) |
77 | return -EINVAL; | 77 | return -EINVAL; |
78 | 78 | ||
79 | type &= IRQ_TYPE_SENSE_MASK; | ||
80 | ret = __irq_set_trigger(desc, type); | 79 | ret = __irq_set_trigger(desc, type); |
81 | irq_put_desc_busunlock(desc, flags); | 80 | irq_put_desc_busunlock(desc, flags); |
82 | return ret; | 81 | return ret; |
@@ -756,7 +755,6 @@ void handle_percpu_devid_irq(struct irq_desc *desc) | |||
756 | { | 755 | { |
757 | struct irq_chip *chip = irq_desc_get_chip(desc); | 756 | struct irq_chip *chip = irq_desc_get_chip(desc); |
758 | struct irqaction *action = desc->action; | 757 | struct irqaction *action = desc->action; |
759 | void *dev_id = raw_cpu_ptr(action->percpu_dev_id); | ||
760 | unsigned int irq = irq_desc_get_irq(desc); | 758 | unsigned int irq = irq_desc_get_irq(desc); |
761 | irqreturn_t res; | 759 | irqreturn_t res; |
762 | 760 | ||
@@ -765,15 +763,26 @@ void handle_percpu_devid_irq(struct irq_desc *desc) | |||
765 | if (chip->irq_ack) | 763 | if (chip->irq_ack) |
766 | chip->irq_ack(&desc->irq_data); | 764 | chip->irq_ack(&desc->irq_data); |
767 | 765 | ||
768 | trace_irq_handler_entry(irq, action); | 766 | if (likely(action)) { |
769 | res = action->handler(irq, dev_id); | 767 | trace_irq_handler_entry(irq, action); |
770 | trace_irq_handler_exit(irq, action, res); | 768 | res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id)); |
769 | trace_irq_handler_exit(irq, action, res); | ||
770 | } else { | ||
771 | unsigned int cpu = smp_processor_id(); | ||
772 | bool enabled = cpumask_test_cpu(cpu, desc->percpu_enabled); | ||
773 | |||
774 | if (enabled) | ||
775 | irq_percpu_disable(desc, cpu); | ||
776 | |||
777 | pr_err_once("Spurious%s percpu IRQ%u on CPU%u\n", | ||
778 | enabled ? " and unmasked" : "", irq, cpu); | ||
779 | } | ||
771 | 780 | ||
772 | if (chip->irq_eoi) | 781 | if (chip->irq_eoi) |
773 | chip->irq_eoi(&desc->irq_data); | 782 | chip->irq_eoi(&desc->irq_data); |
774 | } | 783 | } |
775 | 784 | ||
776 | void | 785 | static void |
777 | __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, | 786 | __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, |
778 | int is_chained, const char *name) | 787 | int is_chained, const char *name) |
779 | { | 788 | { |
@@ -820,6 +829,8 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, | |||
820 | desc->name = name; | 829 | desc->name = name; |
821 | 830 | ||
822 | if (handle != handle_bad_irq && is_chained) { | 831 | if (handle != handle_bad_irq && is_chained) { |
832 | unsigned int type = irqd_get_trigger_type(&desc->irq_data); | ||
833 | |||
823 | /* | 834 | /* |
824 | * We're about to start this interrupt immediately, | 835 | * We're about to start this interrupt immediately, |
825 | * hence the need to set the trigger configuration. | 836 | * hence the need to set the trigger configuration. |
@@ -828,8 +839,10 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, | |||
828 | * chained interrupt. Reset it immediately because we | 839 | * chained interrupt. Reset it immediately because we |
829 | * do know better. | 840 | * do know better. |
830 | */ | 841 | */ |
831 | __irq_set_trigger(desc, irqd_get_trigger_type(&desc->irq_data)); | 842 | if (type != IRQ_TYPE_NONE) { |
832 | desc->handle_irq = handle; | 843 | __irq_set_trigger(desc, type); |
844 | desc->handle_irq = handle; | ||
845 | } | ||
833 | 846 | ||
834 | irq_settings_set_noprobe(desc); | 847 | irq_settings_set_noprobe(desc); |
835 | irq_settings_set_norequest(desc); | 848 | irq_settings_set_norequest(desc); |
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index abd286afbd27..ee32870079c9 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c | |||
@@ -260,9 +260,9 @@ irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags) | |||
260 | } | 260 | } |
261 | 261 | ||
262 | /** | 262 | /** |
263 | * irq_alloc_domain_generic_chip - Allocate generic chips for an irq domain | 263 | * __irq_alloc_domain_generic_chip - Allocate generic chips for an irq domain |
264 | * @d: irq domain for which to allocate chips | 264 | * @d: irq domain for which to allocate chips |
265 | * @irqs_per_chip: Number of interrupts each chip handles | 265 | * @irqs_per_chip: Number of interrupts each chip handles (max 32) |
266 | * @num_ct: Number of irq_chip_type instances associated with this | 266 | * @num_ct: Number of irq_chip_type instances associated with this |
267 | * @name: Name of the irq chip | 267 | * @name: Name of the irq chip |
268 | * @handler: Default flow handler associated with these chips | 268 | * @handler: Default flow handler associated with these chips |
@@ -270,11 +270,11 @@ irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags) | |||
270 | * @set: IRQ_* bits to set in the mapping function | 270 | * @set: IRQ_* bits to set in the mapping function |
271 | * @gcflags: Generic chip specific setup flags | 271 | * @gcflags: Generic chip specific setup flags |
272 | */ | 272 | */ |
273 | int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, | 273 | int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, |
274 | int num_ct, const char *name, | 274 | int num_ct, const char *name, |
275 | irq_flow_handler_t handler, | 275 | irq_flow_handler_t handler, |
276 | unsigned int clr, unsigned int set, | 276 | unsigned int clr, unsigned int set, |
277 | enum irq_gc_flags gcflags) | 277 | enum irq_gc_flags gcflags) |
278 | { | 278 | { |
279 | struct irq_domain_chip_generic *dgc; | 279 | struct irq_domain_chip_generic *dgc; |
280 | struct irq_chip_generic *gc; | 280 | struct irq_chip_generic *gc; |
@@ -326,7 +326,21 @@ int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, | |||
326 | d->name = name; | 326 | d->name = name; |
327 | return 0; | 327 | return 0; |
328 | } | 328 | } |
329 | EXPORT_SYMBOL_GPL(irq_alloc_domain_generic_chips); | 329 | EXPORT_SYMBOL_GPL(__irq_alloc_domain_generic_chips); |
330 | |||
331 | static struct irq_chip_generic * | ||
332 | __irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq) | ||
333 | { | ||
334 | struct irq_domain_chip_generic *dgc = d->gc; | ||
335 | int idx; | ||
336 | |||
337 | if (!dgc) | ||
338 | return ERR_PTR(-ENODEV); | ||
339 | idx = hw_irq / dgc->irqs_per_chip; | ||
340 | if (idx >= dgc->num_chips) | ||
341 | return ERR_PTR(-EINVAL); | ||
342 | return dgc->gc[idx]; | ||
343 | } | ||
330 | 344 | ||
331 | /** | 345 | /** |
332 | * irq_get_domain_generic_chip - Get a pointer to the generic chip of a hw_irq | 346 | * irq_get_domain_generic_chip - Get a pointer to the generic chip of a hw_irq |
@@ -336,15 +350,9 @@ EXPORT_SYMBOL_GPL(irq_alloc_domain_generic_chips); | |||
336 | struct irq_chip_generic * | 350 | struct irq_chip_generic * |
337 | irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq) | 351 | irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq) |
338 | { | 352 | { |
339 | struct irq_domain_chip_generic *dgc = d->gc; | 353 | struct irq_chip_generic *gc = __irq_get_domain_generic_chip(d, hw_irq); |
340 | int idx; | ||
341 | 354 | ||
342 | if (!dgc) | 355 | return !IS_ERR(gc) ? gc : NULL; |
343 | return NULL; | ||
344 | idx = hw_irq / dgc->irqs_per_chip; | ||
345 | if (idx >= dgc->num_chips) | ||
346 | return NULL; | ||
347 | return dgc->gc[idx]; | ||
348 | } | 356 | } |
349 | EXPORT_SYMBOL_GPL(irq_get_domain_generic_chip); | 357 | EXPORT_SYMBOL_GPL(irq_get_domain_generic_chip); |
350 | 358 | ||
@@ -368,13 +376,9 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, | |||
368 | unsigned long flags; | 376 | unsigned long flags; |
369 | int idx; | 377 | int idx; |
370 | 378 | ||
371 | if (!d->gc) | 379 | gc = __irq_get_domain_generic_chip(d, hw_irq); |
372 | return -ENODEV; | 380 | if (IS_ERR(gc)) |
373 | 381 | return PTR_ERR(gc); | |
374 | idx = hw_irq / dgc->irqs_per_chip; | ||
375 | if (idx >= dgc->num_chips) | ||
376 | return -EINVAL; | ||
377 | gc = dgc->gc[idx]; | ||
378 | 382 | ||
379 | idx = hw_irq % dgc->irqs_per_chip; | 383 | idx = hw_irq % dgc->irqs_per_chip; |
380 | 384 | ||
@@ -409,10 +413,30 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, | |||
409 | irq_modify_status(virq, dgc->irq_flags_to_clear, dgc->irq_flags_to_set); | 413 | irq_modify_status(virq, dgc->irq_flags_to_clear, dgc->irq_flags_to_set); |
410 | return 0; | 414 | return 0; |
411 | } | 415 | } |
412 | EXPORT_SYMBOL_GPL(irq_map_generic_chip); | 416 | |
417 | static void irq_unmap_generic_chip(struct irq_domain *d, unsigned int virq) | ||
418 | { | ||
419 | struct irq_data *data = irq_domain_get_irq_data(d, virq); | ||
420 | struct irq_domain_chip_generic *dgc = d->gc; | ||
421 | unsigned int hw_irq = data->hwirq; | ||
422 | struct irq_chip_generic *gc; | ||
423 | int irq_idx; | ||
424 | |||
425 | gc = irq_get_domain_generic_chip(d, hw_irq); | ||
426 | if (!gc) | ||
427 | return; | ||
428 | |||
429 | irq_idx = hw_irq % dgc->irqs_per_chip; | ||
430 | |||
431 | clear_bit(irq_idx, &gc->installed); | ||
432 | irq_domain_set_info(d, virq, hw_irq, &no_irq_chip, NULL, NULL, NULL, | ||
433 | NULL); | ||
434 | |||
435 | } | ||
413 | 436 | ||
414 | struct irq_domain_ops irq_generic_chip_ops = { | 437 | struct irq_domain_ops irq_generic_chip_ops = { |
415 | .map = irq_map_generic_chip, | 438 | .map = irq_map_generic_chip, |
439 | .unmap = irq_unmap_generic_chip, | ||
416 | .xlate = irq_domain_xlate_onetwocell, | 440 | .xlate = irq_domain_xlate_onetwocell, |
417 | }; | 441 | }; |
418 | EXPORT_SYMBOL_GPL(irq_generic_chip_ops); | 442 | EXPORT_SYMBOL_GPL(irq_generic_chip_ops); |
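
The new __irq_get_domain_generic_chip() reports why a lookup failed via ERR_PTR(), which lets irq_map_generic_chip() forward the precise errno with PTR_ERR(), while the exported irq_get_domain_generic_chip() keeps its historical NULL-on-failure contract by filtering with IS_ERR(). A condensed sketch of that internal/public split for a hypothetical lookup:

#include <linux/err.h>
#include <linux/errno.h>

struct thing;

/* Internal lookup: encodes the failure reason in the returned pointer. */
static struct thing *__lookup_thing(struct thing **table, int nentries, int idx)
{
        if (!table)
                return ERR_PTR(-ENODEV);
        if (idx >= nentries)
                return ERR_PTR(-EINVAL);
        return table[idx];
}

/* Public wrapper keeps the old "NULL means failure" interface. */
static struct thing *lookup_thing(struct thing **table, int nentries, int idx)
{
        struct thing *t = __lookup_thing(table, nentries, idx);

        return IS_ERR(t) ? NULL : t;
}

/* Caller that wants the exact error code propagated. */
static int map_thing(struct thing **table, int nentries, int idx)
{
        struct thing *t = __lookup_thing(table, nentries, idx);

        return IS_ERR(t) ? PTR_ERR(t) : 0;
}
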
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index a623b44f2d4b..00bb0aeea1d0 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c | |||
@@ -15,6 +15,7 @@ | |||
15 | #include <linux/radix-tree.h> | 15 | #include <linux/radix-tree.h> |
16 | #include <linux/bitmap.h> | 16 | #include <linux/bitmap.h> |
17 | #include <linux/irqdomain.h> | 17 | #include <linux/irqdomain.h> |
18 | #include <linux/sysfs.h> | ||
18 | 19 | ||
19 | #include "internals.h" | 20 | #include "internals.h" |
20 | 21 | ||
@@ -123,6 +124,181 @@ static DECLARE_BITMAP(allocated_irqs, IRQ_BITMAP_BITS); | |||
123 | 124 | ||
124 | #ifdef CONFIG_SPARSE_IRQ | 125 | #ifdef CONFIG_SPARSE_IRQ |
125 | 126 | ||
127 | static void irq_kobj_release(struct kobject *kobj); | ||
128 | |||
129 | #ifdef CONFIG_SYSFS | ||
130 | static struct kobject *irq_kobj_base; | ||
131 | |||
132 | #define IRQ_ATTR_RO(_name) \ | ||
133 | static struct kobj_attribute _name##_attr = __ATTR_RO(_name) | ||
134 | |||
135 | static ssize_t per_cpu_count_show(struct kobject *kobj, | ||
136 | struct kobj_attribute *attr, char *buf) | ||
137 | { | ||
138 | struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); | ||
139 | int cpu, irq = desc->irq_data.irq; | ||
140 | ssize_t ret = 0; | ||
141 | char *p = ""; | ||
142 | |||
143 | for_each_possible_cpu(cpu) { | ||
144 | unsigned int c = kstat_irqs_cpu(irq, cpu); | ||
145 | |||
146 | ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%u", p, c); | ||
147 | p = ","; | ||
148 | } | ||
149 | |||
150 | ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n"); | ||
151 | return ret; | ||
152 | } | ||
153 | IRQ_ATTR_RO(per_cpu_count); | ||
154 | |||
155 | static ssize_t chip_name_show(struct kobject *kobj, | ||
156 | struct kobj_attribute *attr, char *buf) | ||
157 | { | ||
158 | struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); | ||
159 | ssize_t ret = 0; | ||
160 | |||
161 | raw_spin_lock_irq(&desc->lock); | ||
162 | if (desc->irq_data.chip && desc->irq_data.chip->name) { | ||
163 | ret = scnprintf(buf, PAGE_SIZE, "%s\n", | ||
164 | desc->irq_data.chip->name); | ||
165 | } | ||
166 | raw_spin_unlock_irq(&desc->lock); | ||
167 | |||
168 | return ret; | ||
169 | } | ||
170 | IRQ_ATTR_RO(chip_name); | ||
171 | |||
172 | static ssize_t hwirq_show(struct kobject *kobj, | ||
173 | struct kobj_attribute *attr, char *buf) | ||
174 | { | ||
175 | struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); | ||
176 | ssize_t ret = 0; | ||
177 | |||
178 | raw_spin_lock_irq(&desc->lock); | ||
179 | if (desc->irq_data.domain) | ||
180 | ret = sprintf(buf, "%d\n", (int)desc->irq_data.hwirq); | ||
181 | raw_spin_unlock_irq(&desc->lock); | ||
182 | |||
183 | return ret; | ||
184 | } | ||
185 | IRQ_ATTR_RO(hwirq); | ||
186 | |||
187 | static ssize_t type_show(struct kobject *kobj, | ||
188 | struct kobj_attribute *attr, char *buf) | ||
189 | { | ||
190 | struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); | ||
191 | ssize_t ret = 0; | ||
192 | |||
193 | raw_spin_lock_irq(&desc->lock); | ||
194 | ret = sprintf(buf, "%s\n", | ||
195 | irqd_is_level_type(&desc->irq_data) ? "level" : "edge"); | ||
196 | raw_spin_unlock_irq(&desc->lock); | ||
197 | |||
198 | return ret; | ||
199 | |||
200 | } | ||
201 | IRQ_ATTR_RO(type); | ||
202 | |||
203 | static ssize_t name_show(struct kobject *kobj, | ||
204 | struct kobj_attribute *attr, char *buf) | ||
205 | { | ||
206 | struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); | ||
207 | ssize_t ret = 0; | ||
208 | |||
209 | raw_spin_lock_irq(&desc->lock); | ||
210 | if (desc->name) | ||
211 | ret = scnprintf(buf, PAGE_SIZE, "%s\n", desc->name); | ||
212 | raw_spin_unlock_irq(&desc->lock); | ||
213 | |||
214 | return ret; | ||
215 | } | ||
216 | IRQ_ATTR_RO(name); | ||
217 | |||
218 | static ssize_t actions_show(struct kobject *kobj, | ||
219 | struct kobj_attribute *attr, char *buf) | ||
220 | { | ||
221 | struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); | ||
222 | struct irqaction *action; | ||
223 | ssize_t ret = 0; | ||
224 | char *p = ""; | ||
225 | |||
226 | raw_spin_lock_irq(&desc->lock); | ||
227 | for (action = desc->action; action != NULL; action = action->next) { | ||
228 | ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s", | ||
229 | p, action->name); | ||
230 | p = ","; | ||
231 | } | ||
232 | raw_spin_unlock_irq(&desc->lock); | ||
233 | |||
234 | if (ret) | ||
235 | ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n"); | ||
236 | |||
237 | return ret; | ||
238 | } | ||
239 | IRQ_ATTR_RO(actions); | ||
240 | |||
241 | static struct attribute *irq_attrs[] = { | ||
242 | &per_cpu_count_attr.attr, | ||
243 | &chip_name_attr.attr, | ||
244 | &hwirq_attr.attr, | ||
245 | &type_attr.attr, | ||
246 | &name_attr.attr, | ||
247 | &actions_attr.attr, | ||
248 | NULL | ||
249 | }; | ||
250 | |||
251 | static struct kobj_type irq_kobj_type = { | ||
252 | .release = irq_kobj_release, | ||
253 | .sysfs_ops = &kobj_sysfs_ops, | ||
254 | .default_attrs = irq_attrs, | ||
255 | }; | ||
256 | |||
257 | static void irq_sysfs_add(int irq, struct irq_desc *desc) | ||
258 | { | ||
259 | if (irq_kobj_base) { | ||
260 | /* | ||
261 | * Continue even in case of failure as this is nothing | ||
262 | * crucial. | ||
263 | */ | ||
264 | if (kobject_add(&desc->kobj, irq_kobj_base, "%d", irq)) | ||
265 | pr_warn("Failed to add kobject for irq %d\n", irq); | ||
266 | } | ||
267 | } | ||
268 | |||
269 | static int __init irq_sysfs_init(void) | ||
270 | { | ||
271 | struct irq_desc *desc; | ||
272 | int irq; | ||
273 | |||
274 | /* Prevent concurrent irq alloc/free */ | ||
275 | irq_lock_sparse(); | ||
276 | |||
277 | irq_kobj_base = kobject_create_and_add("irq", kernel_kobj); | ||
278 | if (!irq_kobj_base) { | ||
279 | irq_unlock_sparse(); | ||
280 | return -ENOMEM; | ||
281 | } | ||
282 | |||
283 | /* Add the already allocated interrupts */ | ||
284 | for_each_irq_desc(irq, desc) | ||
285 | irq_sysfs_add(irq, desc); | ||
286 | irq_unlock_sparse(); | ||
287 | |||
288 | return 0; | ||
289 | } | ||
290 | postcore_initcall(irq_sysfs_init); | ||
291 | |||
292 | #else /* !CONFIG_SYSFS */ | ||
293 | |||
294 | static struct kobj_type irq_kobj_type = { | ||
295 | .release = irq_kobj_release, | ||
296 | }; | ||
297 | |||
298 | static void irq_sysfs_add(int irq, struct irq_desc *desc) {} | ||
299 | |||
300 | #endif /* CONFIG_SYSFS */ | ||
301 | |||
126 | static RADIX_TREE(irq_desc_tree, GFP_KERNEL); | 302 | static RADIX_TREE(irq_desc_tree, GFP_KERNEL); |
127 | 303 | ||
128 | static void irq_insert_desc(unsigned int irq, struct irq_desc *desc) | 304 | static void irq_insert_desc(unsigned int irq, struct irq_desc *desc) |
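
The show() helpers added above build their output by accumulating scnprintf() results: scnprintf() returns the number of bytes actually written and never more than the remaining space, so the running total serves both as the write offset and as the length handed back to sysfs. The pattern in isolation (hypothetical helper, buffer size passed in explicitly):

#include <linux/kernel.h>

/* Format "v0,v1,...,vn\n" into buf without ever overrunning size bytes. */
static ssize_t format_list(char *buf, size_t size,
                           const unsigned int *vals, int n)
{
        const char *sep = "";
        ssize_t ret = 0;
        int i;

        for (i = 0; i < n; i++) {
                ret += scnprintf(buf + ret, size - ret, "%s%u", sep, vals[i]);
                sep = ",";
        }
        ret += scnprintf(buf + ret, size - ret, "\n");
        return ret;
}
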
@@ -187,6 +363,7 @@ static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags, | |||
187 | 363 | ||
188 | desc_set_defaults(irq, desc, node, affinity, owner); | 364 | desc_set_defaults(irq, desc, node, affinity, owner); |
189 | irqd_set(&desc->irq_data, flags); | 365 | irqd_set(&desc->irq_data, flags); |
366 | kobject_init(&desc->kobj, &irq_kobj_type); | ||
190 | 367 | ||
191 | return desc; | 368 | return desc; |
192 | 369 | ||
@@ -197,15 +374,22 @@ err_desc: | |||
197 | return NULL; | 374 | return NULL; |
198 | } | 375 | } |
199 | 376 | ||
200 | static void delayed_free_desc(struct rcu_head *rhp) | 377 | static void irq_kobj_release(struct kobject *kobj) |
201 | { | 378 | { |
202 | struct irq_desc *desc = container_of(rhp, struct irq_desc, rcu); | 379 | struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); |
203 | 380 | ||
204 | free_masks(desc); | 381 | free_masks(desc); |
205 | free_percpu(desc->kstat_irqs); | 382 | free_percpu(desc->kstat_irqs); |
206 | kfree(desc); | 383 | kfree(desc); |
207 | } | 384 | } |
208 | 385 | ||
386 | static void delayed_free_desc(struct rcu_head *rhp) | ||
387 | { | ||
388 | struct irq_desc *desc = container_of(rhp, struct irq_desc, rcu); | ||
389 | |||
390 | kobject_put(&desc->kobj); | ||
391 | } | ||
392 | |||
209 | static void free_desc(unsigned int irq) | 393 | static void free_desc(unsigned int irq) |
210 | { | 394 | { |
211 | struct irq_desc *desc = irq_to_desc(irq); | 395 | struct irq_desc *desc = irq_to_desc(irq); |
@@ -217,8 +401,12 @@ static void free_desc(unsigned int irq) | |||
217 | * kstat_irq_usr(). Once we deleted the descriptor from the | 401 | * kstat_irq_usr(). Once we deleted the descriptor from the |
218 | * sparse tree we can free it. Access in proc will fail to | 402 | * sparse tree we can free it. Access in proc will fail to |
219 | * lookup the descriptor. | 403 | * lookup the descriptor. |
404 | * | ||
405 | * The sysfs entry must be serialized against a concurrent | ||
406 | * irq_sysfs_init() as well. | ||
220 | */ | 407 | */ |
221 | mutex_lock(&sparse_irq_lock); | 408 | mutex_lock(&sparse_irq_lock); |
409 | kobject_del(&desc->kobj); | ||
222 | delete_irq_desc(irq); | 410 | delete_irq_desc(irq); |
223 | mutex_unlock(&sparse_irq_lock); | 411 | mutex_unlock(&sparse_irq_lock); |
224 | 412 | ||
@@ -236,31 +424,31 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node, | |||
236 | const struct cpumask *mask = NULL; | 424 | const struct cpumask *mask = NULL; |
237 | struct irq_desc *desc; | 425 | struct irq_desc *desc; |
238 | unsigned int flags; | 426 | unsigned int flags; |
239 | int i, cpu = -1; | 427 | int i; |
240 | 428 | ||
241 | if (affinity && cpumask_empty(affinity)) | 429 | /* Validate affinity mask(s) */ |
242 | return -EINVAL; | 430 | if (affinity) { |
431 | for (i = 0, mask = affinity; i < cnt; i++, mask++) { | ||
432 | if (cpumask_empty(mask)) | ||
433 | return -EINVAL; | ||
434 | } | ||
435 | } | ||
243 | 436 | ||
244 | flags = affinity ? IRQD_AFFINITY_MANAGED : 0; | 437 | flags = affinity ? IRQD_AFFINITY_MANAGED : 0; |
438 | mask = NULL; | ||
245 | 439 | ||
246 | for (i = 0; i < cnt; i++) { | 440 | for (i = 0; i < cnt; i++) { |
247 | if (affinity) { | 441 | if (affinity) { |
248 | cpu = cpumask_next(cpu, affinity); | 442 | node = cpu_to_node(cpumask_first(affinity)); |
249 | if (cpu >= nr_cpu_ids) | 443 | mask = affinity; |
250 | cpu = cpumask_first(affinity); | 444 | affinity++; |
251 | node = cpu_to_node(cpu); | ||
252 | |||
253 | /* | ||
254 | * For single allocations we use the caller provided | ||
255 | * mask otherwise we use the mask of the target cpu | ||
256 | */ | ||
257 | mask = cnt == 1 ? affinity : cpumask_of(cpu); | ||
258 | } | 445 | } |
259 | desc = alloc_desc(start + i, node, flags, mask, owner); | 446 | desc = alloc_desc(start + i, node, flags, mask, owner); |
260 | if (!desc) | 447 | if (!desc) |
261 | goto err; | 448 | goto err; |
262 | mutex_lock(&sparse_irq_lock); | 449 | mutex_lock(&sparse_irq_lock); |
263 | irq_insert_desc(start + i, desc); | 450 | irq_insert_desc(start + i, desc); |
451 | irq_sysfs_add(start + i, desc); | ||
264 | mutex_unlock(&sparse_irq_lock); | 452 | mutex_unlock(&sparse_irq_lock); |
265 | } | 453 | } |
266 | return start; | 454 | return start; |
@@ -481,9 +669,9 @@ EXPORT_SYMBOL_GPL(irq_free_descs); | |||
481 | * @cnt: Number of consecutive irqs to allocate. | 669 | * @cnt: Number of consecutive irqs to allocate. |
482 | * @node: Preferred node on which the irq descriptor should be allocated | 670 | * @node: Preferred node on which the irq descriptor should be allocated |
483 | * @owner: Owning module (can be NULL) | 671 | * @owner: Owning module (can be NULL) |
484 | * @affinity: Optional pointer to an affinity mask which hints where the | 672 | * @affinity: Optional pointer to an affinity mask array of size @cnt which |
485 | * irq descriptors should be allocated and which default | 673 | * hints where the irq descriptors should be allocated and which |
486 | * affinities to use | 674 | * default affinities to use |
487 | * | 675 | * |
488 | * Returns the first irq number or error code | 676 | * Returns the first irq number or error code |
489 | */ | 677 | */ |
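
With the sysfs support the irq descriptor is no longer kfree()d directly: kobject_init() at allocation takes the initial reference, kobject_add() publishes the /sys/kernel/irq/<N> directory, kobject_del() removes it under sparse_irq_lock, and the deferred kobject_put() in the RCU callback drops the last reference so irq_kobj_release() does the actual freeing. The general lifecycle, reduced to a hypothetical object:

#include <linux/kernel.h>
#include <linux/kobject.h>
#include <linux/slab.h>

struct widget {
        struct kobject kobj;
        /* payload ... */
};

static void widget_release(struct kobject *kobj)
{
        /* Runs only once the last reference is gone. */
        kfree(container_of(kobj, struct widget, kobj));
}

static struct kobj_type widget_ktype = {
        .release   = widget_release,
        .sysfs_ops = &kobj_sysfs_ops,
};

static struct widget *widget_create(struct kobject *parent, int id)
{
        struct widget *w = kzalloc(sizeof(*w), GFP_KERNEL);

        if (!w)
                return NULL;
        kobject_init(&w->kobj, &widget_ktype);        /* refcount is now 1 */
        if (kobject_add(&w->kobj, parent, "%d", id))  /* exposes the sysfs dir */
                pr_warn("widget %d not visible in sysfs\n", id);
        return w;
}

static void widget_destroy(struct widget *w)
{
        kobject_del(&w->kobj);  /* remove from sysfs, object stays alive */
        kobject_put(&w->kobj);  /* drop the last ref -> widget_release() */
}
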
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 4752b43662e0..8c0a0ae43521 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c | |||
@@ -80,7 +80,7 @@ EXPORT_SYMBOL_GPL(irq_domain_free_fwnode); | |||
80 | 80 | ||
81 | /** | 81 | /** |
82 | * __irq_domain_add() - Allocate a new irq_domain data structure | 82 | * __irq_domain_add() - Allocate a new irq_domain data structure |
83 | * @of_node: optional device-tree node of the interrupt controller | 83 | * @fwnode: firmware node for the interrupt controller |
84 | * @size: Size of linear map; 0 for radix mapping only | 84 | * @size: Size of linear map; 0 for radix mapping only |
85 | * @hwirq_max: Maximum number of interrupts supported by controller | 85 | * @hwirq_max: Maximum number of interrupts supported by controller |
86 | * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no | 86 | * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no |
@@ -96,10 +96,8 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, | |||
96 | const struct irq_domain_ops *ops, | 96 | const struct irq_domain_ops *ops, |
97 | void *host_data) | 97 | void *host_data) |
98 | { | 98 | { |
99 | struct device_node *of_node = to_of_node(fwnode); | ||
99 | struct irq_domain *domain; | 100 | struct irq_domain *domain; |
100 | struct device_node *of_node; | ||
101 | |||
102 | of_node = to_of_node(fwnode); | ||
103 | 101 | ||
104 | domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size), | 102 | domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size), |
105 | GFP_KERNEL, of_node_to_nid(of_node)); | 103 | GFP_KERNEL, of_node_to_nid(of_node)); |
@@ -868,7 +866,10 @@ int irq_domain_xlate_onetwocell(struct irq_domain *d, | |||
868 | if (WARN_ON(intsize < 1)) | 866 | if (WARN_ON(intsize < 1)) |
869 | return -EINVAL; | 867 | return -EINVAL; |
870 | *out_hwirq = intspec[0]; | 868 | *out_hwirq = intspec[0]; |
871 | *out_type = (intsize > 1) ? intspec[1] : IRQ_TYPE_NONE; | 869 | if (intsize > 1) |
870 | *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK; | ||
871 | else | ||
872 | *out_type = IRQ_TYPE_NONE; | ||
872 | return 0; | 873 | return 0; |
873 | } | 874 | } |
874 | EXPORT_SYMBOL_GPL(irq_domain_xlate_onetwocell); | 875 | EXPORT_SYMBOL_GPL(irq_domain_xlate_onetwocell); |
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 9530fcd27704..6b669593e7eb 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
@@ -669,8 +669,6 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned long flags) | |||
669 | return 0; | 669 | return 0; |
670 | } | 670 | } |
671 | 671 | ||
672 | flags &= IRQ_TYPE_SENSE_MASK; | ||
673 | |||
674 | if (chip->flags & IRQCHIP_SET_TYPE_MASKED) { | 672 | if (chip->flags & IRQCHIP_SET_TYPE_MASKED) { |
675 | if (!irqd_irq_masked(&desc->irq_data)) | 673 | if (!irqd_irq_masked(&desc->irq_data)) |
676 | mask_irq(desc); | 674 | mask_irq(desc); |
@@ -678,7 +676,8 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned long flags) | |||
678 | unmask = 1; | 676 | unmask = 1; |
679 | } | 677 | } |
680 | 678 | ||
681 | /* caller masked out all except trigger mode flags */ | 679 | /* Mask all flags except trigger mode */ |
680 | flags &= IRQ_TYPE_SENSE_MASK; | ||
682 | ret = chip->irq_set_type(&desc->irq_data, flags); | 681 | ret = chip->irq_set_type(&desc->irq_data, flags); |
683 | 682 | ||
684 | switch (ret) { | 683 | switch (ret) { |
@@ -722,6 +721,7 @@ int irq_set_parent(int irq, int parent_irq) | |||
722 | irq_put_desc_unlock(desc, flags); | 721 | irq_put_desc_unlock(desc, flags); |
723 | return 0; | 722 | return 0; |
724 | } | 723 | } |
724 | EXPORT_SYMBOL_GPL(irq_set_parent); | ||
725 | #endif | 725 | #endif |
726 | 726 | ||
727 | /* | 727 | /* |
@@ -1341,12 +1341,12 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
1341 | 1341 | ||
1342 | } else if (new->flags & IRQF_TRIGGER_MASK) { | 1342 | } else if (new->flags & IRQF_TRIGGER_MASK) { |
1343 | unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK; | 1343 | unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK; |
1344 | unsigned int omsk = irq_settings_get_trigger_mask(desc); | 1344 | unsigned int omsk = irqd_get_trigger_type(&desc->irq_data); |
1345 | 1345 | ||
1346 | if (nmsk != omsk) | 1346 | if (nmsk != omsk) |
1347 | /* hope the handler works with current trigger mode */ | 1347 | /* hope the handler works with current trigger mode */ |
1348 | pr_warn("irq %d uses trigger mode %u; requested %u\n", | 1348 | pr_warn("irq %d uses trigger mode %u; requested %u\n", |
1349 | irq, nmsk, omsk); | 1349 | irq, omsk, nmsk); |
1350 | } | 1350 | } |
1351 | 1351 | ||
1352 | *old_ptr = new; | 1352 | *old_ptr = new; |
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 19e9dfbe97fa..8a3e872798f3 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c | |||
@@ -18,20 +18,42 @@ | |||
18 | /* Temparory solution for building, will be removed later */ | 18 | /* Temparory solution for building, will be removed later */ |
19 | #include <linux/pci.h> | 19 | #include <linux/pci.h> |
20 | 20 | ||
21 | struct msi_desc *alloc_msi_entry(struct device *dev) | 21 | /** |
22 | * alloc_msi_entry - Allocate an initialized msi_entry | ||
23 | * @dev: Pointer to the device for which this is allocated | ||
24 | * @nvec: The number of vectors used in this entry | ||
25 | * @affinity: Optional pointer to an affinity mask array of size @nvec | ||
26 | * | ||
27 | * If @affinity is not NULL then an affinity array[@nvec] is allocated | ||
28 | * and the affinity masks from @affinity are copied. | ||
29 | */ | ||
30 | struct msi_desc * | ||
31 | alloc_msi_entry(struct device *dev, int nvec, const struct cpumask *affinity) | ||
22 | { | 32 | { |
23 | struct msi_desc *desc = kzalloc(sizeof(*desc), GFP_KERNEL); | 33 | struct msi_desc *desc; |
34 | |||
35 | desc = kzalloc(sizeof(*desc), GFP_KERNEL); | ||
24 | if (!desc) | 36 | if (!desc) |
25 | return NULL; | 37 | return NULL; |
26 | 38 | ||
27 | INIT_LIST_HEAD(&desc->list); | 39 | INIT_LIST_HEAD(&desc->list); |
28 | desc->dev = dev; | 40 | desc->dev = dev; |
41 | desc->nvec_used = nvec; | ||
42 | if (affinity) { | ||
43 | desc->affinity = kmemdup(affinity, | ||
44 | nvec * sizeof(*desc->affinity), GFP_KERNEL); | ||
45 | if (!desc->affinity) { | ||
46 | kfree(desc); | ||
47 | return NULL; | ||
48 | } | ||
49 | } | ||
29 | 50 | ||
30 | return desc; | 51 | return desc; |
31 | } | 52 | } |
32 | 53 | ||
33 | void free_msi_entry(struct msi_desc *entry) | 54 | void free_msi_entry(struct msi_desc *entry) |
34 | { | 55 | { |
56 | kfree(entry->affinity); | ||
35 | kfree(entry); | 57 | kfree(entry); |
36 | } | 58 | } |
37 | 59 | ||
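
alloc_msi_entry() now copies the caller's affinity array with kmemdup() and unwinds the partially built descriptor if that copy fails; free_msi_entry() releases the copy together with the entry itself. The same allocate-copy-or-unwind shape for a generic descriptor (illustrative only):

#include <linux/slab.h>
#include <linux/string.h>

struct record {
        int nvals;
        int *vals;      /* private copy of the caller's array */
};

static struct record *record_create(const int *vals, int nvals)
{
        struct record *r = kzalloc(sizeof(*r), GFP_KERNEL);

        if (!r)
                return NULL;
        r->nvals = nvals;
        if (vals) {
                r->vals = kmemdup(vals, nvals * sizeof(*vals), GFP_KERNEL);
                if (!r->vals) {
                        kfree(r);       /* unwind the partial allocation */
                        return NULL;
                }
        }
        return r;
}

static void record_destroy(struct record *r)
{
        kfree(r->vals);                 /* kfree(NULL) is a no-op */
        kfree(r);
}
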
diff --git a/kernel/kcov.c b/kernel/kcov.c index 8d44b3fea9d0..30e6d05aa5a9 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c | |||
@@ -53,8 +53,15 @@ void notrace __sanitizer_cov_trace_pc(void) | |||
53 | /* | 53 | /* |
54 | * We are interested in code coverage as a function of a syscall inputs, | 54 | * We are interested in code coverage as a function of a syscall inputs, |
55 | * so we ignore code executed in interrupts. | 55 | * so we ignore code executed in interrupts. |
56 | * The checks for whether we are in an interrupt are open-coded, because | ||
57 | * 1. We can't use in_interrupt() here, since it also returns true | ||
58 | * when we are inside local_bh_disable() section. | ||
59 | * 2. We don't want to use (in_irq() | in_serving_softirq() | in_nmi()), | ||
60 | * since that leads to slower generated code (three separate tests, | ||
61 | * one for each of the flags). | ||
56 | */ | 62 | */ |
57 | if (!t || in_interrupt()) | 63 | if (!t || (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET |
64 | | NMI_MASK))) | ||
58 | return; | 65 | return; |
59 | mode = READ_ONCE(t->kcov_mode); | 66 | mode = READ_ONCE(t->kcov_mode); |
60 | if (mode == KCOV_MODE_TRACE) { | 67 | if (mode == KCOV_MODE_TRACE) { |
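
The replacement check reads preempt_count() once and tests HARDIRQ_MASK, SOFTIRQ_OFFSET and NMI_MASK together. Using SOFTIRQ_OFFSET (the "currently serving a softirq" bit) instead of the wider SOFTIRQ_MASK is the point of the new comment: in_interrupt() tests SOFTIRQ_MASK and therefore also returns true inside a plain local_bh_disable() section, which is not interrupt context for coverage purposes. A sketch of the check as a helper, assuming the usual <linux/preempt.h> definitions (the helper name is illustrative):

#include <linux/preempt.h>
#include <linux/types.h>

/* True only in process context: not in hard irq, not while serving a
 * softirq, not in NMI. Unlike !in_interrupt(), this stays true when
 * softirqs are merely disabled via local_bh_disable(). */
static inline bool kcov_in_task_context(void)
{
        return !(preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET | NMI_MASK));
}
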
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index d10ab6b9b5e0..d63095472ea9 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
@@ -49,7 +49,7 @@ | |||
49 | #include <linux/cpu.h> | 49 | #include <linux/cpu.h> |
50 | #include <linux/jump_label.h> | 50 | #include <linux/jump_label.h> |
51 | 51 | ||
52 | #include <asm-generic/sections.h> | 52 | #include <asm/sections.h> |
53 | #include <asm/cacheflush.h> | 53 | #include <asm/cacheflush.h> |
54 | #include <asm/errno.h> | 54 | #include <asm/errno.h> |
55 | #include <asm/uaccess.h> | 55 | #include <asm/uaccess.h> |
diff --git a/kernel/kthread.c b/kernel/kthread.c index 9ff173dca1ae..be2cc1f9dd57 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
@@ -64,7 +64,7 @@ static inline struct kthread *to_kthread(struct task_struct *k) | |||
64 | static struct kthread *to_live_kthread(struct task_struct *k) | 64 | static struct kthread *to_live_kthread(struct task_struct *k) |
65 | { | 65 | { |
66 | struct completion *vfork = ACCESS_ONCE(k->vfork_done); | 66 | struct completion *vfork = ACCESS_ONCE(k->vfork_done); |
67 | if (likely(vfork)) | 67 | if (likely(vfork) && try_get_task_stack(k)) |
68 | return __to_kthread(vfork); | 68 | return __to_kthread(vfork); |
69 | return NULL; | 69 | return NULL; |
70 | } | 70 | } |
@@ -138,7 +138,7 @@ void *kthread_data(struct task_struct *task) | |||
138 | } | 138 | } |
139 | 139 | ||
140 | /** | 140 | /** |
141 | * probe_kthread_data - speculative version of kthread_data() | 141 | * kthread_probe_data - speculative version of kthread_data() |
142 | * @task: possible kthread task in question | 142 | * @task: possible kthread task in question |
143 | * | 143 | * |
144 | * @task could be a kthread task. Return the data value specified when it | 144 | * @task could be a kthread task. Return the data value specified when it |
@@ -146,7 +146,7 @@ void *kthread_data(struct task_struct *task) | |||
146 | * inaccessible for any reason, %NULL is returned. This function requires | 146 | * inaccessible for any reason, %NULL is returned. This function requires |
147 | * that @task itself is safe to dereference. | 147 | * that @task itself is safe to dereference. |
148 | */ | 148 | */ |
149 | void *probe_kthread_data(struct task_struct *task) | 149 | void *kthread_probe_data(struct task_struct *task) |
150 | { | 150 | { |
151 | struct kthread *kthread = to_kthread(task); | 151 | struct kthread *kthread = to_kthread(task); |
152 | void *data = NULL; | 152 | void *data = NULL; |
@@ -244,33 +244,10 @@ static void create_kthread(struct kthread_create_info *create) | |||
244 | } | 244 | } |
245 | } | 245 | } |
246 | 246 | ||
247 | /** | 247 | static struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), |
248 | * kthread_create_on_node - create a kthread. | 248 | void *data, int node, |
249 | * @threadfn: the function to run until signal_pending(current). | 249 | const char namefmt[], |
250 | * @data: data ptr for @threadfn. | 250 | va_list args) |
251 | * @node: task and thread structures for the thread are allocated on this node | ||
252 | * @namefmt: printf-style name for the thread. | ||
253 | * | ||
254 | * Description: This helper function creates and names a kernel | ||
255 | * thread. The thread will be stopped: use wake_up_process() to start | ||
256 | * it. See also kthread_run(). The new thread has SCHED_NORMAL policy and | ||
257 | * is affine to all CPUs. | ||
258 | * | ||
259 | * If thread is going to be bound on a particular cpu, give its node | ||
260 | * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE. | ||
261 | * When woken, the thread will run @threadfn() with @data as its | ||
262 | * argument. @threadfn() can either call do_exit() directly if it is a | ||
263 | * standalone thread for which no one will call kthread_stop(), or | ||
264 | * return when 'kthread_should_stop()' is true (which means | ||
265 | * kthread_stop() has been called). The return value should be zero | ||
266 | * or a negative error number; it will be passed to kthread_stop(). | ||
267 | * | ||
268 | * Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR). | ||
269 | */ | ||
270 | struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), | ||
271 | void *data, int node, | ||
272 | const char namefmt[], | ||
273 | ...) | ||
274 | { | 251 | { |
275 | DECLARE_COMPLETION_ONSTACK(done); | 252 | DECLARE_COMPLETION_ONSTACK(done); |
276 | struct task_struct *task; | 253 | struct task_struct *task; |
@@ -311,11 +288,8 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), | |||
311 | task = create->result; | 288 | task = create->result; |
312 | if (!IS_ERR(task)) { | 289 | if (!IS_ERR(task)) { |
313 | static const struct sched_param param = { .sched_priority = 0 }; | 290 | static const struct sched_param param = { .sched_priority = 0 }; |
314 | va_list args; | ||
315 | 291 | ||
316 | va_start(args, namefmt); | ||
317 | vsnprintf(task->comm, sizeof(task->comm), namefmt, args); | 292 | vsnprintf(task->comm, sizeof(task->comm), namefmt, args); |
318 | va_end(args); | ||
319 | /* | 293 | /* |
320 | * root may have changed our (kthreadd's) priority or CPU mask. | 294 | * root may have changed our (kthreadd's) priority or CPU mask. |
321 | * The kernel thread should not inherit these properties. | 295 | * The kernel thread should not inherit these properties. |
@@ -326,6 +300,44 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), | |||
326 | kfree(create); | 300 | kfree(create); |
327 | return task; | 301 | return task; |
328 | } | 302 | } |
303 | |||
304 | /** | ||
305 | * kthread_create_on_node - create a kthread. | ||
306 | * @threadfn: the function to run until signal_pending(current). | ||
307 | * @data: data ptr for @threadfn. | ||
308 | * @node: task and thread structures for the thread are allocated on this node | ||
309 | * @namefmt: printf-style name for the thread. | ||
310 | * | ||
311 | * Description: This helper function creates and names a kernel | ||
312 | * thread. The thread will be stopped: use wake_up_process() to start | ||
313 | * it. See also kthread_run(). The new thread has SCHED_NORMAL policy and | ||
314 | * is affine to all CPUs. | ||
315 | * | ||
316 | * If thread is going to be bound on a particular cpu, give its node | ||
317 | * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE. | ||
318 | * When woken, the thread will run @threadfn() with @data as its | ||
319 | * argument. @threadfn() can either call do_exit() directly if it is a | ||
320 | * standalone thread for which no one will call kthread_stop(), or | ||
321 | * return when 'kthread_should_stop()' is true (which means | ||
322 | * kthread_stop() has been called). The return value should be zero | ||
323 | * or a negative error number; it will be passed to kthread_stop(). | ||
324 | * | ||
325 | * Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR). | ||
326 | */ | ||
327 | struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), | ||
328 | void *data, int node, | ||
329 | const char namefmt[], | ||
330 | ...) | ||
331 | { | ||
332 | struct task_struct *task; | ||
333 | va_list args; | ||
334 | |||
335 | va_start(args, namefmt); | ||
336 | task = __kthread_create_on_node(threadfn, data, node, namefmt, args); | ||
337 | va_end(args); | ||
338 | |||
339 | return task; | ||
340 | } | ||
329 | EXPORT_SYMBOL(kthread_create_on_node); | 341 | EXPORT_SYMBOL(kthread_create_on_node); |
330 | 342 | ||
331 | static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, long state) | 343 | static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, long state) |
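
Splitting kthread_create_on_node() into a va_list core plus a thin varargs wrapper is the standard way to let other entry points (such as the kthread_create_worker*() interfaces referenced further down in this file) share the creation path while still offering a printf-style API. The shape of that refactor in miniature, as a userspace example:

#include <stdarg.h>
#include <stdio.h>

/* Core takes a va_list so several front ends can reuse it. */
static int __log_event(int level, const char *fmt, va_list args)
{
        char buf[128];

        vsnprintf(buf, sizeof(buf), fmt, args);
        return printf("[%d] %s\n", level, buf);
}

/* Public varargs front end: start/end the va_list and delegate. */
static int log_event(int level, const char *fmt, ...)
{
        va_list args;
        int ret;

        va_start(args, fmt);
        ret = __log_event(level, fmt, args);
        va_end(args);
        return ret;
}

int main(void)
{
        return log_event(1, "worker %d started", 42) < 0;
}
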
@@ -390,10 +402,10 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), | |||
390 | cpu); | 402 | cpu); |
391 | if (IS_ERR(p)) | 403 | if (IS_ERR(p)) |
392 | return p; | 404 | return p; |
405 | kthread_bind(p, cpu); | ||
406 | /* CPU hotplug need to bind once again when unparking the thread. */ | ||
393 | set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags); | 407 | set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags); |
394 | to_kthread(p)->cpu = cpu; | 408 | to_kthread(p)->cpu = cpu; |
395 | /* Park the thread to get it out of TASK_UNINTERRUPTIBLE state */ | ||
396 | kthread_park(p); | ||
397 | return p; | 409 | return p; |
398 | } | 410 | } |
399 | 411 | ||
@@ -407,6 +419,10 @@ static void __kthread_unpark(struct task_struct *k, struct kthread *kthread) | |||
407 | * which might be about to be cleared. | 419 | * which might be about to be cleared. |
408 | */ | 420 | */ |
409 | if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) { | 421 | if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) { |
422 | /* | ||
423 | * Newly created kthread was parked when the CPU was offline. | ||
424 | * The binding was lost and we need to set it again. | ||
425 | */ | ||
410 | if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) | 426 | if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) |
411 | __kthread_bind(k, kthread->cpu, TASK_PARKED); | 427 | __kthread_bind(k, kthread->cpu, TASK_PARKED); |
412 | wake_up_state(k, TASK_PARKED); | 428 | wake_up_state(k, TASK_PARKED); |
@@ -425,8 +441,10 @@ void kthread_unpark(struct task_struct *k) | |||
425 | { | 441 | { |
426 | struct kthread *kthread = to_live_kthread(k); | 442 | struct kthread *kthread = to_live_kthread(k); |
427 | 443 | ||
428 | if (kthread) | 444 | if (kthread) { |
429 | __kthread_unpark(k, kthread); | 445 | __kthread_unpark(k, kthread); |
446 | put_task_stack(k); | ||
447 | } | ||
430 | } | 448 | } |
431 | EXPORT_SYMBOL_GPL(kthread_unpark); | 449 | EXPORT_SYMBOL_GPL(kthread_unpark); |
432 | 450 | ||
@@ -455,6 +473,7 @@ int kthread_park(struct task_struct *k) | |||
455 | wait_for_completion(&kthread->parked); | 473 | wait_for_completion(&kthread->parked); |
456 | } | 474 | } |
457 | } | 475 | } |
476 | put_task_stack(k); | ||
458 | ret = 0; | 477 | ret = 0; |
459 | } | 478 | } |
460 | return ret; | 479 | return ret; |
@@ -490,6 +509,7 @@ int kthread_stop(struct task_struct *k) | |||
490 | __kthread_unpark(k, kthread); | 509 | __kthread_unpark(k, kthread); |
491 | wake_up_process(k); | 510 | wake_up_process(k); |
492 | wait_for_completion(&kthread->exited); | 511 | wait_for_completion(&kthread->exited); |
512 | put_task_stack(k); | ||
493 | } | 513 | } |
494 | ret = k->exit_code; | 514 | ret = k->exit_code; |
495 | put_task_struct(k); | 515 | put_task_struct(k); |
@@ -536,39 +556,48 @@ int kthreadd(void *unused) | |||
536 | return 0; | 556 | return 0; |
537 | } | 557 | } |
538 | 558 | ||
539 | void __init_kthread_worker(struct kthread_worker *worker, | 559 | void __kthread_init_worker(struct kthread_worker *worker, |
540 | const char *name, | 560 | const char *name, |
541 | struct lock_class_key *key) | 561 | struct lock_class_key *key) |
542 | { | 562 | { |
563 | memset(worker, 0, sizeof(struct kthread_worker)); | ||
543 | spin_lock_init(&worker->lock); | 564 | spin_lock_init(&worker->lock); |
544 | lockdep_set_class_and_name(&worker->lock, key, name); | 565 | lockdep_set_class_and_name(&worker->lock, key, name); |
545 | INIT_LIST_HEAD(&worker->work_list); | 566 | INIT_LIST_HEAD(&worker->work_list); |
546 | worker->task = NULL; | 567 | INIT_LIST_HEAD(&worker->delayed_work_list); |
547 | } | 568 | } |
548 | EXPORT_SYMBOL_GPL(__init_kthread_worker); | 569 | EXPORT_SYMBOL_GPL(__kthread_init_worker); |
549 | 570 | ||
550 | /** | 571 | /** |
551 | * kthread_worker_fn - kthread function to process kthread_worker | 572 | * kthread_worker_fn - kthread function to process kthread_worker |
552 | * @worker_ptr: pointer to initialized kthread_worker | 573 | * @worker_ptr: pointer to initialized kthread_worker |
553 | * | 574 | * |
554 | * This function can be used as @threadfn to kthread_create() or | 575 | * This function implements the main cycle of kthread worker. It processes |
555 | * kthread_run() with @worker_ptr argument pointing to an initialized | 576 | * work_list until it is stopped with kthread_stop(). It sleeps when the queue |
556 | * kthread_worker. The started kthread will process work_list until | 577 | * is empty. |
557 | * the it is stopped with kthread_stop(). A kthread can also call | ||
558 | * this function directly after extra initialization. | ||
559 | * | 578 | * |
560 | * Different kthreads can be used for the same kthread_worker as long | 579 | * The works must not hold any locks or keep preemption or interrupts disabled |
561 | * as there's only one kthread attached to it at any given time. A | 580 | * when they return. A safe point for freezing is defined after one work |
562 | * kthread_worker without an attached kthread simply collects queued | 581 | * finishes and before the next one is started. |
563 | * kthread_works. | 582 | * |
583 | * Also the works must not be handled by more than one worker at the same time, | ||
584 | * see also kthread_queue_work(). | ||
564 | */ | 585 | */ |
565 | int kthread_worker_fn(void *worker_ptr) | 586 | int kthread_worker_fn(void *worker_ptr) |
566 | { | 587 | { |
567 | struct kthread_worker *worker = worker_ptr; | 588 | struct kthread_worker *worker = worker_ptr; |
568 | struct kthread_work *work; | 589 | struct kthread_work *work; |
569 | 590 | ||
570 | WARN_ON(worker->task); | 591 | /* |
592 | * FIXME: Update the check and remove the assignment when all kthread | ||
593 | * worker users are created using kthread_create_worker*() functions. | ||
594 | */ | ||
595 | WARN_ON(worker->task && worker->task != current); | ||
571 | worker->task = current; | 596 | worker->task = current; |
597 | |||
598 | if (worker->flags & KTW_FREEZABLE) | ||
599 | set_freezable(); | ||
600 | |||
572 | repeat: | 601 | repeat: |
573 | set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */ | 602 | set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */ |
574 | 603 | ||
@@ -601,13 +630,132 @@ repeat: | |||
601 | } | 630 | } |
602 | EXPORT_SYMBOL_GPL(kthread_worker_fn); | 631 | EXPORT_SYMBOL_GPL(kthread_worker_fn); |
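The FIXME above concerns the legacy pattern in which callers spawn the kthread themselves and hand kthread_worker_fn() in as the thread function; roughly (worker and task names are illustrative):

	#include <linux/kthread.h>

	static DEFINE_KTHREAD_WORKER(my_worker);
	static struct task_struct *my_worker_task;

	static int my_worker_start(void)
	{
		/* the worker loop runs in this dedicated kthread */
		my_worker_task = kthread_run(kthread_worker_fn, &my_worker,
					     "my_worker_thread");
		return PTR_ERR_OR_ZERO(my_worker_task);
	}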
603 | 632 | ||
604 | /* insert @work before @pos in @worker */ | 633 | static struct kthread_worker * |
605 | static void insert_kthread_work(struct kthread_worker *worker, | 634 | __kthread_create_worker(int cpu, unsigned int flags, |
606 | struct kthread_work *work, | 635 | const char namefmt[], va_list args) |
607 | struct list_head *pos) | 636 | { |
637 | struct kthread_worker *worker; | ||
638 | struct task_struct *task; | ||
639 | |||
640 | worker = kzalloc(sizeof(*worker), GFP_KERNEL); | ||
641 | if (!worker) | ||
642 | return ERR_PTR(-ENOMEM); | ||
643 | |||
644 | kthread_init_worker(worker); | ||
645 | |||
646 | if (cpu >= 0) { | ||
647 | char name[TASK_COMM_LEN]; | ||
648 | |||
649 | /* | ||
650 | * kthread_create_worker_on_cpu() allows passing a generic | ||
651 | * namefmt, in contrast with kthread_create_on_cpu(). We need | ||
652 | * to format it here. | ||
653 | */ | ||
654 | vsnprintf(name, sizeof(name), namefmt, args); | ||
655 | task = kthread_create_on_cpu(kthread_worker_fn, worker, | ||
656 | cpu, name); | ||
657 | } else { | ||
658 | task = __kthread_create_on_node(kthread_worker_fn, worker, | ||
659 | -1, namefmt, args); | ||
660 | } | ||
661 | |||
662 | if (IS_ERR(task)) | ||
663 | goto fail_task; | ||
664 | |||
665 | worker->flags = flags; | ||
666 | worker->task = task; | ||
667 | wake_up_process(task); | ||
668 | return worker; | ||
669 | |||
670 | fail_task: | ||
671 | kfree(worker); | ||
672 | return ERR_CAST(task); | ||
673 | } | ||
674 | |||
675 | /** | ||
676 | * kthread_create_worker - create a kthread worker | ||
677 | * @flags: flags modifying the default behavior of the worker | ||
678 | * @namefmt: printf-style name for the kthread worker (task). | ||
679 | * | ||
680 | * Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM) | ||
681 | * when the needed structures could not get allocated, and ERR_PTR(-EINTR) | ||
682 | * when the worker was SIGKILLed. | ||
683 | */ | ||
684 | struct kthread_worker * | ||
685 | kthread_create_worker(unsigned int flags, const char namefmt[], ...) | ||
686 | { | ||
687 | struct kthread_worker *worker; | ||
688 | va_list args; | ||
689 | |||
690 | va_start(args, namefmt); | ||
691 | worker = __kthread_create_worker(-1, flags, namefmt, args); | ||
692 | va_end(args); | ||
693 | |||
694 | return worker; | ||
695 | } | ||
696 | EXPORT_SYMBOL(kthread_create_worker); | ||
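A hedged sketch of the new creation API together with work queueing; the work function and names are assumptions for illustration, and kthread_init_work() is the initializer from <linux/kthread.h>:

	#include <linux/kthread.h>

	static struct kthread_worker *my_worker;
	static struct kthread_work my_work;

	static void my_work_fn(struct kthread_work *work)
	{
		/* executes in my_worker's kthread context */
	}

	static int my_init(void)
	{
		my_worker = kthread_create_worker(0, "my_worker");
		if (IS_ERR(my_worker))
			return PTR_ERR(my_worker);

		kthread_init_work(&my_work, my_work_fn);
		kthread_queue_work(my_worker, &my_work);
		return 0;
	}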
697 | |||
698 | /** | ||
699 | * kthread_create_worker_on_cpu - create a kthread worker and bind it | ||
700 | * to a given CPU and the associated NUMA node. | ||
701 | * @cpu: CPU number | ||
702 | * @flags: flags modifying the default behavior of the worker | ||
703 | * @namefmt: printf-style name for the kthread worker (task). | ||
704 | * | ||
705 | * Use a valid CPU number if you want to bind the kthread worker | ||
706 | * to the given CPU and the associated NUMA node. | ||
707 | * | ||
708 | * A good practice is to also include the CPU number in the worker name. | ||
709 | * For example, use kthread_create_worker_on_cpu(cpu, 0, "helper/%d", cpu). | ||
710 | * | ||
711 | * Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM) | ||
712 | * when the needed structures could not get allocated, and ERR_PTR(-EINTR) | ||
713 | * when the worker was SIGKILLed. | ||
714 | */ | ||
715 | struct kthread_worker * | ||
716 | kthread_create_worker_on_cpu(int cpu, unsigned int flags, | ||
717 | const char namefmt[], ...) | ||
718 | { | ||
719 | struct kthread_worker *worker; | ||
720 | va_list args; | ||
721 | |||
722 | va_start(args, namefmt); | ||
723 | worker = __kthread_create_worker(cpu, flags, namefmt, args); | ||
724 | va_end(args); | ||
725 | |||
726 | return worker; | ||
727 | } | ||
728 | EXPORT_SYMBOL(kthread_create_worker_on_cpu); | ||
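A per-CPU variant might look roughly like the following; the per-CPU pointer and the lack of hotplug handling are simplifications made for the sketch:

	#include <linux/cpumask.h>
	#include <linux/kthread.h>
	#include <linux/percpu.h>

	static DEFINE_PER_CPU(struct kthread_worker *, pcpu_worker);

	static int start_percpu_workers(void)
	{
		int cpu;

		for_each_online_cpu(cpu) {
			struct kthread_worker *w;

			w = kthread_create_worker_on_cpu(cpu, 0, "helper/%d", cpu);
			if (IS_ERR(w))
				return PTR_ERR(w);	/* unwinding of earlier CPUs omitted */
			per_cpu(pcpu_worker, cpu) = w;
		}
		return 0;
	}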
729 | |||
730 | /* | ||
731 | * Returns true when the work could not be queued at the moment. | ||
732 | * This happens when the work is already pending in a worker list | ||
733 | * or when it is being cancelled. | ||
734 | */ | ||
735 | static inline bool queuing_blocked(struct kthread_worker *worker, | ||
736 | struct kthread_work *work) | ||
608 | { | 737 | { |
609 | lockdep_assert_held(&worker->lock); | 738 | lockdep_assert_held(&worker->lock); |
610 | 739 | ||
740 | return !list_empty(&work->node) || work->canceling; | ||
741 | } | ||
742 | |||
743 | static void kthread_insert_work_sanity_check(struct kthread_worker *worker, | ||
744 | struct kthread_work *work) | ||
745 | { | ||
746 | lockdep_assert_held(&worker->lock); | ||
747 | WARN_ON_ONCE(!list_empty(&work->node)); | ||
748 | /* Do not use a work with >1 worker, see kthread_queue_work() */ | ||
749 | WARN_ON_ONCE(work->worker && work->worker != worker); | ||
750 | } | ||
751 | |||
752 | /* insert @work before @pos in @worker */ | ||
753 | static void kthread_insert_work(struct kthread_worker *worker, | ||
754 | struct kthread_work *work, | ||
755 | struct list_head *pos) | ||
756 | { | ||
757 | kthread_insert_work_sanity_check(worker, work); | ||
758 | |||
611 | list_add_tail(&work->node, pos); | 759 | list_add_tail(&work->node, pos); |
612 | work->worker = worker; | 760 | work->worker = worker; |
613 | if (!worker->current_work && likely(worker->task)) | 761 | if (!worker->current_work && likely(worker->task)) |
@@ -615,29 +763,133 @@ static void insert_kthread_work(struct kthread_worker *worker, | |||
615 | } | 763 | } |
616 | 764 | ||
617 | /** | 765 | /** |
618 | * queue_kthread_work - queue a kthread_work | 766 | * kthread_queue_work - queue a kthread_work |
619 | * @worker: target kthread_worker | 767 | * @worker: target kthread_worker |
620 | * @work: kthread_work to queue | 768 | * @work: kthread_work to queue |
621 | * | 769 | * |
622 | * Queue @work to work processor @task for async execution. @task | 770 | * Queue @work to work processor @task for async execution. @task |
623 | * must have been created with kthread_worker_create(). Returns %true | 771 | * must have been created with kthread_worker_create(). Returns %true |
624 | * if @work was successfully queued, %false if it was already pending. | 772 | * if @work was successfully queued, %false if it was already pending. |
773 | * | ||
774 | * Reinitialize the work if it needs to be used by another worker, | ||
775 | * for example when the worker was stopped and started again. | ||
625 | */ | 776 | */ |
626 | bool queue_kthread_work(struct kthread_worker *worker, | 777 | bool kthread_queue_work(struct kthread_worker *worker, |
627 | struct kthread_work *work) | 778 | struct kthread_work *work) |
628 | { | 779 | { |
629 | bool ret = false; | 780 | bool ret = false; |
630 | unsigned long flags; | 781 | unsigned long flags; |
631 | 782 | ||
632 | spin_lock_irqsave(&worker->lock, flags); | 783 | spin_lock_irqsave(&worker->lock, flags); |
633 | if (list_empty(&work->node)) { | 784 | if (!queuing_blocked(worker, work)) { |
634 | insert_kthread_work(worker, work, &worker->work_list); | 785 | kthread_insert_work(worker, work, &worker->work_list); |
786 | ret = true; | ||
787 | } | ||
788 | spin_unlock_irqrestore(&worker->lock, flags); | ||
789 | return ret; | ||
790 | } | ||
791 | EXPORT_SYMBOL_GPL(kthread_queue_work); | ||
792 | |||
793 | /** | ||
794 | * kthread_delayed_work_timer_fn - callback that queues the associated kthread | ||
795 | * delayed work when the timer expires. | ||
796 | * @__data: pointer to the data associated with the timer | ||
797 | * | ||
798 | * The format of the function is defined by struct timer_list. | ||
799 | * It is called from an irqsafe timer with interrupts already disabled. | ||
800 | */ | ||
801 | void kthread_delayed_work_timer_fn(unsigned long __data) | ||
802 | { | ||
803 | struct kthread_delayed_work *dwork = | ||
804 | (struct kthread_delayed_work *)__data; | ||
805 | struct kthread_work *work = &dwork->work; | ||
806 | struct kthread_worker *worker = work->worker; | ||
807 | |||
808 | /* | ||
809 | * This might happen when a pending work is reinitialized. | ||
810 | * It means that the work is being used in a wrong way. | ||
811 | */ | ||
812 | if (WARN_ON_ONCE(!worker)) | ||
813 | return; | ||
814 | |||
815 | spin_lock(&worker->lock); | ||
816 | /* Work must not be used with >1 worker, see kthread_queue_work(). */ | ||
817 | WARN_ON_ONCE(work->worker != worker); | ||
818 | |||
819 | /* Move the work from worker->delayed_work_list. */ | ||
820 | WARN_ON_ONCE(list_empty(&work->node)); | ||
821 | list_del_init(&work->node); | ||
822 | kthread_insert_work(worker, work, &worker->work_list); | ||
823 | |||
824 | spin_unlock(&worker->lock); | ||
825 | } | ||
826 | EXPORT_SYMBOL(kthread_delayed_work_timer_fn); | ||
827 | |||
828 | void __kthread_queue_delayed_work(struct kthread_worker *worker, | ||
829 | struct kthread_delayed_work *dwork, | ||
830 | unsigned long delay) | ||
831 | { | ||
832 | struct timer_list *timer = &dwork->timer; | ||
833 | struct kthread_work *work = &dwork->work; | ||
834 | |||
835 | WARN_ON_ONCE(timer->function != kthread_delayed_work_timer_fn || | ||
836 | timer->data != (unsigned long)dwork); | ||
837 | |||
838 | /* | ||
839 | * If @delay is 0, queue @dwork->work immediately. This is for | ||
840 | * both optimization and correctness. The earliest @timer can | ||
841 | * expire is on the next tick, and delayed_work users depend on | ||
842 | * there being no such delay when @delay is 0. | ||
843 | */ | ||
844 | if (!delay) { | ||
845 | kthread_insert_work(worker, work, &worker->work_list); | ||
846 | return; | ||
847 | } | ||
848 | |||
849 | /* Be paranoid and try to detect possible races already now. */ | ||
850 | kthread_insert_work_sanity_check(worker, work); | ||
851 | |||
852 | list_add(&work->node, &worker->delayed_work_list); | ||
853 | work->worker = worker; | ||
854 | timer_stats_timer_set_start_info(&dwork->timer); | ||
855 | timer->expires = jiffies + delay; | ||
856 | add_timer(timer); | ||
857 | } | ||
858 | |||
859 | /** | ||
860 | * kthread_queue_delayed_work - queue the associated kthread work | ||
861 | * after a delay. | ||
862 | * @worker: target kthread_worker | ||
863 | * @dwork: kthread_delayed_work to queue | ||
864 | * @delay: number of jiffies to wait before queuing | ||
865 | * | ||
866 | * If the work has not been pending it starts a timer that will queue | ||
867 | * the work after the given @delay. If @delay is zero, it queues the | ||
868 | * work immediately. | ||
869 | * | ||
870 | * Return: %false if the @work was already pending, which means that | ||
871 | * either the timer was running or the work was queued. It returns %true | ||
872 | * otherwise. | ||
873 | */ | ||
874 | bool kthread_queue_delayed_work(struct kthread_worker *worker, | ||
875 | struct kthread_delayed_work *dwork, | ||
876 | unsigned long delay) | ||
877 | { | ||
878 | struct kthread_work *work = &dwork->work; | ||
879 | unsigned long flags; | ||
880 | bool ret = false; | ||
881 | |||
882 | spin_lock_irqsave(&worker->lock, flags); | ||
883 | |||
884 | if (!queuing_blocked(worker, work)) { | ||
885 | __kthread_queue_delayed_work(worker, dwork, delay); | ||
635 | ret = true; | 886 | ret = true; |
636 | } | 887 | } |
888 | |||
637 | spin_unlock_irqrestore(&worker->lock, flags); | 889 | spin_unlock_irqrestore(&worker->lock, flags); |
638 | return ret; | 890 | return ret; |
639 | } | 891 | } |
640 | EXPORT_SYMBOL_GPL(queue_kthread_work); | 892 | EXPORT_SYMBOL_GPL(kthread_queue_delayed_work); |
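A sketch of arming a delayed work on an existing worker; kthread_init_delayed_work() is the matching initializer and the two-second delay is just an example value:

	#include <linux/jiffies.h>
	#include <linux/kthread.h>

	static struct kthread_delayed_work my_dwork;

	static void my_dwork_fn(struct kthread_work *work)
	{
		/* runs in the worker roughly two seconds after queueing */
	}

	static void arm_delayed_work(struct kthread_worker *worker)
	{
		kthread_init_delayed_work(&my_dwork, my_dwork_fn);	/* once, before first use */
		kthread_queue_delayed_work(worker, &my_dwork, 2 * HZ);
	}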
641 | 893 | ||
642 | struct kthread_flush_work { | 894 | struct kthread_flush_work { |
643 | struct kthread_work work; | 895 | struct kthread_work work; |
@@ -652,12 +904,12 @@ static void kthread_flush_work_fn(struct kthread_work *work) | |||
652 | } | 904 | } |
653 | 905 | ||
654 | /** | 906 | /** |
655 | * flush_kthread_work - flush a kthread_work | 907 | * kthread_flush_work - flush a kthread_work |
656 | * @work: work to flush | 908 | * @work: work to flush |
657 | * | 909 | * |
658 | * If @work is queued or executing, wait for it to finish execution. | 910 | * If @work is queued or executing, wait for it to finish execution. |
659 | */ | 911 | */ |
660 | void flush_kthread_work(struct kthread_work *work) | 912 | void kthread_flush_work(struct kthread_work *work) |
661 | { | 913 | { |
662 | struct kthread_flush_work fwork = { | 914 | struct kthread_flush_work fwork = { |
663 | KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), | 915 | KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), |
@@ -666,21 +918,19 @@ void flush_kthread_work(struct kthread_work *work) | |||
666 | struct kthread_worker *worker; | 918 | struct kthread_worker *worker; |
667 | bool noop = false; | 919 | bool noop = false; |
668 | 920 | ||
669 | retry: | ||
670 | worker = work->worker; | 921 | worker = work->worker; |
671 | if (!worker) | 922 | if (!worker) |
672 | return; | 923 | return; |
673 | 924 | ||
674 | spin_lock_irq(&worker->lock); | 925 | spin_lock_irq(&worker->lock); |
675 | if (work->worker != worker) { | 926 | /* Work must not be used with >1 worker, see kthread_queue_work(). */ |
676 | spin_unlock_irq(&worker->lock); | 927 | WARN_ON_ONCE(work->worker != worker); |
677 | goto retry; | ||
678 | } | ||
679 | 928 | ||
680 | if (!list_empty(&work->node)) | 929 | if (!list_empty(&work->node)) |
681 | insert_kthread_work(worker, &fwork.work, work->node.next); | 930 | kthread_insert_work(worker, &fwork.work, work->node.next); |
682 | else if (worker->current_work == work) | 931 | else if (worker->current_work == work) |
683 | insert_kthread_work(worker, &fwork.work, worker->work_list.next); | 932 | kthread_insert_work(worker, &fwork.work, |
933 | worker->work_list.next); | ||
684 | else | 934 | else |
685 | noop = true; | 935 | noop = true; |
686 | 936 | ||
@@ -689,23 +939,214 @@ retry: | |||
689 | if (!noop) | 939 | if (!noop) |
690 | wait_for_completion(&fwork.done); | 940 | wait_for_completion(&fwork.done); |
691 | } | 941 | } |
692 | EXPORT_SYMBOL_GPL(flush_kthread_work); | 942 | EXPORT_SYMBOL_GPL(kthread_flush_work); |
943 | |||
944 | /* | ||
945 | * This function removes the work from the worker queue. It also makes sure | ||
946 | * that it won't get queued later via the delayed work's timer. | ||
947 | * | ||
948 | * The work might still be in use when this function finishes. See the | ||
949 | * current_work processed by the worker. | ||
950 | * | ||
951 | * Return: %true if @work was pending and successfully canceled, | ||
952 | * %false if @work was not pending | ||
953 | */ | ||
954 | static bool __kthread_cancel_work(struct kthread_work *work, bool is_dwork, | ||
955 | unsigned long *flags) | ||
956 | { | ||
957 | /* Try to cancel the timer if exists. */ | ||
958 | if (is_dwork) { | ||
959 | struct kthread_delayed_work *dwork = | ||
960 | container_of(work, struct kthread_delayed_work, work); | ||
961 | struct kthread_worker *worker = work->worker; | ||
962 | |||
963 | /* | ||
964 | * del_timer_sync() must be called to make sure that the timer | ||
965 | * callback is not running. The lock must be temporarily released | ||
966 | * to avoid a deadlock with the callback. In the meantime, | ||
967 | * any queuing is blocked by setting the canceling counter. | ||
968 | */ | ||
969 | work->canceling++; | ||
970 | spin_unlock_irqrestore(&worker->lock, *flags); | ||
971 | del_timer_sync(&dwork->timer); | ||
972 | spin_lock_irqsave(&worker->lock, *flags); | ||
973 | work->canceling--; | ||
974 | } | ||
975 | |||
976 | /* | ||
977 | * Try to remove the work from a worker list. It might either | ||
978 | * be from worker->work_list or from worker->delayed_work_list. | ||
979 | */ | ||
980 | if (!list_empty(&work->node)) { | ||
981 | list_del_init(&work->node); | ||
982 | return true; | ||
983 | } | ||
984 | |||
985 | return false; | ||
986 | } | ||
693 | 987 | ||
694 | /** | 988 | /** |
695 | * flush_kthread_worker - flush all current works on a kthread_worker | 989 | * kthread_mod_delayed_work - modify delay of or queue a kthread delayed work |
990 | * @worker: kthread worker to use | ||
991 | * @dwork: kthread delayed work to queue | ||
992 | * @delay: number of jiffies to wait before queuing | ||
993 | * | ||
994 | * If @dwork is idle, equivalent to kthread_queue_delayed_work(). Otherwise, | ||
995 | * modify @dwork's timer so that it expires after @delay. If @delay is zero, | ||
996 | * @work is guaranteed to be queued immediately. | ||
997 | * | ||
998 | * Return: %true if @dwork was pending and its timer was modified, | ||
999 | * %false otherwise. | ||
1000 | * | ||
1001 | * A special case is when the work is being canceled in parallel. | ||
1002 | * It might be caused either by the real kthread_cancel_delayed_work_sync() | ||
1003 | * or yet another kthread_mod_delayed_work() call. We let the other command | ||
1004 | * win and return %false here. The caller is supposed to synchronize these | ||
1005 | * operations in a reasonable way. | ||
1006 | * | ||
1007 | * This function is safe to call from any context, including IRQ handlers. | ||
1008 | * See __kthread_cancel_work() and kthread_delayed_work_timer_fn() | ||
1009 | * for details. | ||
1010 | */ | ||
1011 | bool kthread_mod_delayed_work(struct kthread_worker *worker, | ||
1012 | struct kthread_delayed_work *dwork, | ||
1013 | unsigned long delay) | ||
1014 | { | ||
1015 | struct kthread_work *work = &dwork->work; | ||
1016 | unsigned long flags; | ||
1017 | int ret = false; | ||
1018 | |||
1019 | spin_lock_irqsave(&worker->lock, flags); | ||
1020 | |||
1021 | /* Do not bother with canceling when never queued. */ | ||
1022 | if (!work->worker) | ||
1023 | goto fast_queue; | ||
1024 | |||
1025 | /* Work must not be used with >1 worker, see kthread_queue_work() */ | ||
1026 | WARN_ON_ONCE(work->worker != worker); | ||
1027 | |||
1028 | /* Do not fight with another command that is canceling this work. */ | ||
1029 | if (work->canceling) | ||
1030 | goto out; | ||
1031 | |||
1032 | ret = __kthread_cancel_work(work, true, &flags); | ||
1033 | fast_queue: | ||
1034 | __kthread_queue_delayed_work(worker, dwork, delay); | ||
1035 | out: | ||
1036 | spin_unlock_irqrestore(&worker->lock, flags); | ||
1037 | return ret; | ||
1038 | } | ||
1039 | EXPORT_SYMBOL_GPL(kthread_mod_delayed_work); | ||
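A typical caller of this helper is a watchdog-style rearm, sketched below with an assumed ten-second timeout:

	#include <linux/jiffies.h>
	#include <linux/kthread.h>

	/* Push the deadline out, or arm the timer if the work is not pending yet. */
	static void my_watchdog_kick(struct kthread_worker *worker,
				     struct kthread_delayed_work *wd_work)
	{
		kthread_mod_delayed_work(worker, wd_work, 10 * HZ);
	}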
1040 | |||
1041 | static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork) | ||
1042 | { | ||
1043 | struct kthread_worker *worker = work->worker; | ||
1044 | unsigned long flags; | ||
1045 | int ret = false; | ||
1046 | |||
1047 | if (!worker) | ||
1048 | goto out; | ||
1049 | |||
1050 | spin_lock_irqsave(&worker->lock, flags); | ||
1051 | /* Work must not be used with >1 worker, see kthread_queue_work(). */ | ||
1052 | WARN_ON_ONCE(work->worker != worker); | ||
1053 | |||
1054 | ret = __kthread_cancel_work(work, is_dwork, &flags); | ||
1055 | |||
1056 | if (worker->current_work != work) | ||
1057 | goto out_fast; | ||
1058 | |||
1059 | /* | ||
1060 | * The work is in progress and we need to wait with the lock released. | ||
1061 | * In the meantime, block any queuing by setting the canceling counter. | ||
1062 | */ | ||
1063 | work->canceling++; | ||
1064 | spin_unlock_irqrestore(&worker->lock, flags); | ||
1065 | kthread_flush_work(work); | ||
1066 | spin_lock_irqsave(&worker->lock, flags); | ||
1067 | work->canceling--; | ||
1068 | |||
1069 | out_fast: | ||
1070 | spin_unlock_irqrestore(&worker->lock, flags); | ||
1071 | out: | ||
1072 | return ret; | ||
1073 | } | ||
1074 | |||
1075 | /** | ||
1076 | * kthread_cancel_work_sync - cancel a kthread work and wait for it to finish | ||
1077 | * @work: the kthread work to cancel | ||
1078 | * | ||
1079 | * Cancel @work and wait for its execution to finish. This function | ||
1080 | * can be used even if the work re-queues itself. On return from this | ||
1081 | * function, @work is guaranteed to be not pending or executing on any CPU. | ||
1082 | * | ||
1083 | * kthread_cancel_work_sync(&delayed_work->work) must not be used for | ||
1084 | * delayed_work's. Use kthread_cancel_delayed_work_sync() instead. | ||
1085 | * | ||
1086 | * The caller must ensure that the worker on which @work was last | ||
1087 | * queued can't be destroyed before this function returns. | ||
1088 | * | ||
1089 | * Return: %true if @work was pending, %false otherwise. | ||
1090 | */ | ||
1091 | bool kthread_cancel_work_sync(struct kthread_work *work) | ||
1092 | { | ||
1093 | return __kthread_cancel_work_sync(work, false); | ||
1094 | } | ||
1095 | EXPORT_SYMBOL_GPL(kthread_cancel_work_sync); | ||
1096 | |||
1097 | /** | ||
1098 | * kthread_cancel_delayed_work_sync - cancel a kthread delayed work and | ||
1099 | * wait for it to finish. | ||
1100 | * @dwork: the kthread delayed work to cancel | ||
1101 | * | ||
1102 | * This is kthread_cancel_work_sync() for delayed works. | ||
1103 | * | ||
1104 | * Return: %true if @dwork was pending, %false otherwise. | ||
1105 | */ | ||
1106 | bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *dwork) | ||
1107 | { | ||
1108 | return __kthread_cancel_work_sync(&dwork->work, true); | ||
1109 | } | ||
1110 | EXPORT_SYMBOL_GPL(kthread_cancel_delayed_work_sync); | ||
1111 | |||
1112 | /** | ||
1113 | * kthread_flush_worker - flush all current works on a kthread_worker | ||
696 | * @worker: worker to flush | 1114 | * @worker: worker to flush |
697 | * | 1115 | * |
698 | * Wait until all currently executing or pending works on @worker are | 1116 | * Wait until all currently executing or pending works on @worker are |
699 | * finished. | 1117 | * finished. |
700 | */ | 1118 | */ |
701 | void flush_kthread_worker(struct kthread_worker *worker) | 1119 | void kthread_flush_worker(struct kthread_worker *worker) |
702 | { | 1120 | { |
703 | struct kthread_flush_work fwork = { | 1121 | struct kthread_flush_work fwork = { |
704 | KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), | 1122 | KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), |
705 | COMPLETION_INITIALIZER_ONSTACK(fwork.done), | 1123 | COMPLETION_INITIALIZER_ONSTACK(fwork.done), |
706 | }; | 1124 | }; |
707 | 1125 | ||
708 | queue_kthread_work(worker, &fwork.work); | 1126 | kthread_queue_work(worker, &fwork.work); |
709 | wait_for_completion(&fwork.done); | 1127 | wait_for_completion(&fwork.done); |
710 | } | 1128 | } |
711 | EXPORT_SYMBOL_GPL(flush_kthread_worker); | 1129 | EXPORT_SYMBOL_GPL(kthread_flush_worker); |
1130 | |||
1131 | /** | ||
1132 | * kthread_destroy_worker - destroy a kthread worker | ||
1133 | * @worker: worker to be destroyed | ||
1134 | * | ||
1135 | * Flush and destroy @worker. The simple flush is enough because the kthread | ||
1136 | * worker API is used only in trivial scenarios. There are no multi-step state | ||
1137 | * machines needed. | ||
1138 | */ | ||
1139 | void kthread_destroy_worker(struct kthread_worker *worker) | ||
1140 | { | ||
1141 | struct task_struct *task; | ||
1142 | |||
1143 | task = worker->task; | ||
1144 | if (WARN_ON(!task)) | ||
1145 | return; | ||
1146 | |||
1147 | kthread_flush_worker(worker); | ||
1148 | kthread_stop(task); | ||
1149 | WARN_ON(!list_empty(&worker->work_list)); | ||
1150 | kfree(worker); | ||
1151 | } | ||
1152 | EXPORT_SYMBOL(kthread_destroy_worker); | ||
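Teardown of the setup sketched in the earlier examples would presumably cancel or flush outstanding items before destroying the worker, for example:

	#include <linux/kthread.h>

	static void my_exit(struct kthread_worker *worker,
			    struct kthread_delayed_work *dwork,
			    struct kthread_work *work)
	{
		kthread_cancel_delayed_work_sync(dwork);	/* stop the timer, wait if running */
		kthread_flush_work(work);			/* wait for a queued/running work */
		kthread_destroy_worker(worker);			/* flush the rest, stop and free */
	}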
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 8bbe50704621..af4643873e71 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c | |||
@@ -274,7 +274,6 @@ static int klp_write_object_relocations(struct module *pmod, | |||
274 | 274 | ||
275 | objname = klp_is_module(obj) ? obj->name : "vmlinux"; | 275 | objname = klp_is_module(obj) ? obj->name : "vmlinux"; |
276 | 276 | ||
277 | module_disable_ro(pmod); | ||
278 | /* For each klp relocation section */ | 277 | /* For each klp relocation section */ |
279 | for (i = 1; i < pmod->klp_info->hdr.e_shnum; i++) { | 278 | for (i = 1; i < pmod->klp_info->hdr.e_shnum; i++) { |
280 | sec = pmod->klp_info->sechdrs + i; | 279 | sec = pmod->klp_info->sechdrs + i; |
@@ -309,7 +308,6 @@ static int klp_write_object_relocations(struct module *pmod, | |||
309 | break; | 308 | break; |
310 | } | 309 | } |
311 | 310 | ||
312 | module_enable_ro(pmod, true); | ||
313 | return ret; | 311 | return ret; |
314 | } | 312 | } |
315 | 313 | ||
@@ -547,9 +545,6 @@ static int __klp_enable_patch(struct klp_patch *patch) | |||
547 | list_prev_entry(patch, list)->state == KLP_DISABLED) | 545 | list_prev_entry(patch, list)->state == KLP_DISABLED) |
548 | return -EBUSY; | 546 | return -EBUSY; |
549 | 547 | ||
550 | pr_notice_once("tainting kernel with TAINT_LIVEPATCH\n"); | ||
551 | add_taint(TAINT_LIVEPATCH, LOCKDEP_STILL_OK); | ||
552 | |||
553 | pr_notice("enabling patch '%s'\n", patch->mod->name); | 548 | pr_notice("enabling patch '%s'\n", patch->mod->name); |
554 | 549 | ||
555 | klp_for_each_object(patch, obj) { | 550 | klp_for_each_object(patch, obj) { |
@@ -763,6 +758,12 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func) | |||
763 | func->old_sympos ? func->old_sympos : 1); | 758 | func->old_sympos ? func->old_sympos : 1); |
764 | } | 759 | } |
765 | 760 | ||
761 | /* Arches may override this to finish any remaining arch-specific tasks */ | ||
762 | void __weak arch_klp_init_object_loaded(struct klp_patch *patch, | ||
763 | struct klp_object *obj) | ||
764 | { | ||
765 | } | ||
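An architecture wanting extra fixups would override this weak hook with the same signature; the body below is a purely hypothetical placeholder, not code from any arch tree:

	#include <linux/livepatch.h>

	/* hypothetically, in arch/<arch>/kernel/livepatch.c */
	void arch_klp_init_object_loaded(struct klp_patch *patch,
					 struct klp_object *obj)
	{
		/* apply arch-specific relocations or fixups for @obj here */
	}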
766 | |||
766 | /* parts of the initialization that is done only when the object is loaded */ | 767 | /* parts of the initialization that is done only when the object is loaded */ |
767 | static int klp_init_object_loaded(struct klp_patch *patch, | 768 | static int klp_init_object_loaded(struct klp_patch *patch, |
768 | struct klp_object *obj) | 769 | struct klp_object *obj) |
@@ -770,9 +771,15 @@ static int klp_init_object_loaded(struct klp_patch *patch, | |||
770 | struct klp_func *func; | 771 | struct klp_func *func; |
771 | int ret; | 772 | int ret; |
772 | 773 | ||
774 | module_disable_ro(patch->mod); | ||
773 | ret = klp_write_object_relocations(patch->mod, obj); | 775 | ret = klp_write_object_relocations(patch->mod, obj); |
774 | if (ret) | 776 | if (ret) { |
777 | module_enable_ro(patch->mod, true); | ||
775 | return ret; | 778 | return ret; |
779 | } | ||
780 | |||
781 | arch_klp_init_object_loaded(patch, obj); | ||
782 | module_enable_ro(patch->mod, true); | ||
776 | 783 | ||
777 | klp_for_each_func(obj, func) { | 784 | klp_for_each_func(obj, func) { |
778 | ret = klp_find_object_symbol(obj->name, func->old_name, | 785 | ret = klp_find_object_symbol(obj->name, func->old_name, |
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 31322a4275cd..6f88e352cd4f 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile | |||
@@ -18,7 +18,6 @@ obj-$(CONFIG_LOCKDEP) += lockdep_proc.o | |||
18 | endif | 18 | endif |
19 | obj-$(CONFIG_SMP) += spinlock.o | 19 | obj-$(CONFIG_SMP) += spinlock.o |
20 | obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o | 20 | obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o |
21 | obj-$(CONFIG_SMP) += lglock.o | ||
22 | obj-$(CONFIG_PROVE_LOCKING) += spinlock.o | 21 | obj-$(CONFIG_PROVE_LOCKING) += spinlock.o |
23 | obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o | 22 | obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o |
24 | obj-$(CONFIG_RT_MUTEXES) += rtmutex.o | 23 | obj-$(CONFIG_RT_MUTEXES) += rtmutex.o |
diff --git a/kernel/locking/lglock.c b/kernel/locking/lglock.c deleted file mode 100644 index 951cfcd10b4a..000000000000 --- a/kernel/locking/lglock.c +++ /dev/null | |||
@@ -1,111 +0,0 @@ | |||
1 | /* See include/linux/lglock.h for description */ | ||
2 | #include <linux/module.h> | ||
3 | #include <linux/lglock.h> | ||
4 | #include <linux/cpu.h> | ||
5 | #include <linux/string.h> | ||
6 | |||
7 | /* | ||
8 | * Note there is no uninit, so lglocks cannot be defined in | ||
9 | * modules (but it's fine to use them from there) | ||
10 | * Could be added though, just undo lg_lock_init | ||
11 | */ | ||
12 | |||
13 | void lg_lock_init(struct lglock *lg, char *name) | ||
14 | { | ||
15 | LOCKDEP_INIT_MAP(&lg->lock_dep_map, name, &lg->lock_key, 0); | ||
16 | } | ||
17 | EXPORT_SYMBOL(lg_lock_init); | ||
18 | |||
19 | void lg_local_lock(struct lglock *lg) | ||
20 | { | ||
21 | arch_spinlock_t *lock; | ||
22 | |||
23 | preempt_disable(); | ||
24 | lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); | ||
25 | lock = this_cpu_ptr(lg->lock); | ||
26 | arch_spin_lock(lock); | ||
27 | } | ||
28 | EXPORT_SYMBOL(lg_local_lock); | ||
29 | |||
30 | void lg_local_unlock(struct lglock *lg) | ||
31 | { | ||
32 | arch_spinlock_t *lock; | ||
33 | |||
34 | lock_release(&lg->lock_dep_map, 1, _RET_IP_); | ||
35 | lock = this_cpu_ptr(lg->lock); | ||
36 | arch_spin_unlock(lock); | ||
37 | preempt_enable(); | ||
38 | } | ||
39 | EXPORT_SYMBOL(lg_local_unlock); | ||
40 | |||
41 | void lg_local_lock_cpu(struct lglock *lg, int cpu) | ||
42 | { | ||
43 | arch_spinlock_t *lock; | ||
44 | |||
45 | preempt_disable(); | ||
46 | lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); | ||
47 | lock = per_cpu_ptr(lg->lock, cpu); | ||
48 | arch_spin_lock(lock); | ||
49 | } | ||
50 | EXPORT_SYMBOL(lg_local_lock_cpu); | ||
51 | |||
52 | void lg_local_unlock_cpu(struct lglock *lg, int cpu) | ||
53 | { | ||
54 | arch_spinlock_t *lock; | ||
55 | |||
56 | lock_release(&lg->lock_dep_map, 1, _RET_IP_); | ||
57 | lock = per_cpu_ptr(lg->lock, cpu); | ||
58 | arch_spin_unlock(lock); | ||
59 | preempt_enable(); | ||
60 | } | ||
61 | EXPORT_SYMBOL(lg_local_unlock_cpu); | ||
62 | |||
63 | void lg_double_lock(struct lglock *lg, int cpu1, int cpu2) | ||
64 | { | ||
65 | BUG_ON(cpu1 == cpu2); | ||
66 | |||
67 | /* lock in cpu order, just like lg_global_lock */ | ||
68 | if (cpu2 < cpu1) | ||
69 | swap(cpu1, cpu2); | ||
70 | |||
71 | preempt_disable(); | ||
72 | lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); | ||
73 | arch_spin_lock(per_cpu_ptr(lg->lock, cpu1)); | ||
74 | arch_spin_lock(per_cpu_ptr(lg->lock, cpu2)); | ||
75 | } | ||
76 | |||
77 | void lg_double_unlock(struct lglock *lg, int cpu1, int cpu2) | ||
78 | { | ||
79 | lock_release(&lg->lock_dep_map, 1, _RET_IP_); | ||
80 | arch_spin_unlock(per_cpu_ptr(lg->lock, cpu1)); | ||
81 | arch_spin_unlock(per_cpu_ptr(lg->lock, cpu2)); | ||
82 | preempt_enable(); | ||
83 | } | ||
84 | |||
85 | void lg_global_lock(struct lglock *lg) | ||
86 | { | ||
87 | int i; | ||
88 | |||
89 | preempt_disable(); | ||
90 | lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); | ||
91 | for_each_possible_cpu(i) { | ||
92 | arch_spinlock_t *lock; | ||
93 | lock = per_cpu_ptr(lg->lock, i); | ||
94 | arch_spin_lock(lock); | ||
95 | } | ||
96 | } | ||
97 | EXPORT_SYMBOL(lg_global_lock); | ||
98 | |||
99 | void lg_global_unlock(struct lglock *lg) | ||
100 | { | ||
101 | int i; | ||
102 | |||
103 | lock_release(&lg->lock_dep_map, 1, _RET_IP_); | ||
104 | for_each_possible_cpu(i) { | ||
105 | arch_spinlock_t *lock; | ||
106 | lock = per_cpu_ptr(lg->lock, i); | ||
107 | arch_spin_unlock(lock); | ||
108 | } | ||
109 | preempt_enable(); | ||
110 | } | ||
111 | EXPORT_SYMBOL(lg_global_unlock); | ||
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index 51c4b24b6328..c2b88490d857 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h | |||
@@ -46,6 +46,14 @@ enum { | |||
46 | (LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ) | 46 | (LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ) |
47 | 47 | ||
48 | /* | 48 | /* |
49 | * CONFIG_PROVE_LOCKING_SMALL is defined for sparc. Sparc requires .text, | ||
50 | * .data and .bss to fit within the required 32MB limit for the kernel. With | ||
51 | * PROVE_LOCKING we could go over this limit and cause system boot-up problems. | ||
52 | * So, reduce the static allocations for lockdep-related structures so that | ||
53 | * everything fits within the current size limit. | ||
54 | */ | ||
55 | #ifdef CONFIG_PROVE_LOCKING_SMALL | ||
56 | /* | ||
49 | * MAX_LOCKDEP_ENTRIES is the maximum number of lock dependencies | 57 | * MAX_LOCKDEP_ENTRIES is the maximum number of lock dependencies |
50 | * we track. | 58 | * we track. |
51 | * | 59 | * |
@@ -54,18 +62,24 @@ enum { | |||
54 | * table (if it's not there yet), and we check it for lock order | 62 | * table (if it's not there yet), and we check it for lock order |
55 | * conflicts and deadlocks. | 63 | * conflicts and deadlocks. |
56 | */ | 64 | */ |
65 | #define MAX_LOCKDEP_ENTRIES 16384UL | ||
66 | #define MAX_LOCKDEP_CHAINS_BITS 15 | ||
67 | #define MAX_STACK_TRACE_ENTRIES 262144UL | ||
68 | #else | ||
57 | #define MAX_LOCKDEP_ENTRIES 32768UL | 69 | #define MAX_LOCKDEP_ENTRIES 32768UL |
58 | 70 | ||
59 | #define MAX_LOCKDEP_CHAINS_BITS 16 | 71 | #define MAX_LOCKDEP_CHAINS_BITS 16 |
60 | #define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) | ||
61 | |||
62 | #define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) | ||
63 | 72 | ||
64 | /* | 73 | /* |
65 | * Stack-trace: tightly packed array of stack backtrace | 74 | * Stack-trace: tightly packed array of stack backtrace |
66 | * addresses. Protected by the hash_lock. | 75 | * addresses. Protected by the hash_lock. |
67 | */ | 76 | */ |
68 | #define MAX_STACK_TRACE_ENTRIES 524288UL | 77 | #define MAX_STACK_TRACE_ENTRIES 524288UL |
78 | #endif | ||
79 | |||
80 | #define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) | ||
81 | |||
82 | #define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) | ||
69 | 83 | ||
70 | extern struct list_head all_lock_classes; | 84 | extern struct list_head all_lock_classes; |
71 | extern struct lock_chain lock_chains[]; | 85 | extern struct lock_chain lock_chains[]; |
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index bec0b647f9cc..ce182599cf2e 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c | |||
@@ -8,152 +8,186 @@ | |||
8 | #include <linux/sched.h> | 8 | #include <linux/sched.h> |
9 | #include <linux/errno.h> | 9 | #include <linux/errno.h> |
10 | 10 | ||
11 | int __percpu_init_rwsem(struct percpu_rw_semaphore *brw, | 11 | int __percpu_init_rwsem(struct percpu_rw_semaphore *sem, |
12 | const char *name, struct lock_class_key *rwsem_key) | 12 | const char *name, struct lock_class_key *rwsem_key) |
13 | { | 13 | { |
14 | brw->fast_read_ctr = alloc_percpu(int); | 14 | sem->read_count = alloc_percpu(int); |
15 | if (unlikely(!brw->fast_read_ctr)) | 15 | if (unlikely(!sem->read_count)) |
16 | return -ENOMEM; | 16 | return -ENOMEM; |
17 | 17 | ||
18 | /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ | 18 | /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ |
19 | __init_rwsem(&brw->rw_sem, name, rwsem_key); | 19 | rcu_sync_init(&sem->rss, RCU_SCHED_SYNC); |
20 | rcu_sync_init(&brw->rss, RCU_SCHED_SYNC); | 20 | __init_rwsem(&sem->rw_sem, name, rwsem_key); |
21 | atomic_set(&brw->slow_read_ctr, 0); | 21 | init_waitqueue_head(&sem->writer); |
22 | init_waitqueue_head(&brw->write_waitq); | 22 | sem->readers_block = 0; |
23 | return 0; | 23 | return 0; |
24 | } | 24 | } |
25 | EXPORT_SYMBOL_GPL(__percpu_init_rwsem); | 25 | EXPORT_SYMBOL_GPL(__percpu_init_rwsem); |
26 | 26 | ||
27 | void percpu_free_rwsem(struct percpu_rw_semaphore *brw) | 27 | void percpu_free_rwsem(struct percpu_rw_semaphore *sem) |
28 | { | 28 | { |
29 | /* | 29 | /* |
30 | * XXX: temporary kludge. The error path in alloc_super() | 30 | * XXX: temporary kludge. The error path in alloc_super() |
31 | * assumes that percpu_free_rwsem() is safe after kzalloc(). | 31 | * assumes that percpu_free_rwsem() is safe after kzalloc(). |
32 | */ | 32 | */ |
33 | if (!brw->fast_read_ctr) | 33 | if (!sem->read_count) |
34 | return; | 34 | return; |
35 | 35 | ||
36 | rcu_sync_dtor(&brw->rss); | 36 | rcu_sync_dtor(&sem->rss); |
37 | free_percpu(brw->fast_read_ctr); | 37 | free_percpu(sem->read_count); |
38 | brw->fast_read_ctr = NULL; /* catch use after free bugs */ | 38 | sem->read_count = NULL; /* catch use after free bugs */ |
39 | } | 39 | } |
40 | EXPORT_SYMBOL_GPL(percpu_free_rwsem); | 40 | EXPORT_SYMBOL_GPL(percpu_free_rwsem); |
41 | 41 | ||
42 | /* | 42 | int __percpu_down_read(struct percpu_rw_semaphore *sem, int try) |
43 | * This is the fast-path for down_read/up_read. If it succeeds we rely | ||
44 | * on the barriers provided by rcu_sync_enter/exit; see the comments in | ||
45 | * percpu_down_write() and percpu_up_write(). | ||
46 | * | ||
47 | * If this helper fails the callers rely on the normal rw_semaphore and | ||
48 | * atomic_dec_and_test(), so in this case we have the necessary barriers. | ||
49 | */ | ||
50 | static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val) | ||
51 | { | 43 | { |
52 | bool success; | 44 | /* |
45 | * Due to having preemption disabled the decrement happens on | ||
46 | * the same CPU as the increment, avoiding the | ||
47 | * increment-on-one-CPU-and-decrement-on-another problem. | ||
48 | * | ||
49 | * If the reader misses the writer's assignment of readers_block, then | ||
50 | * the writer is guaranteed to see the reader's increment. | ||
51 | * | ||
52 | * Conversely, any readers that increment their sem->read_count after | ||
53 | * the writer looks are guaranteed to see the readers_block value, | ||
54 | * which in turn means that they are guaranteed to immediately | ||
55 | * decrement their sem->read_count, so that it doesn't matter that the | ||
56 | * writer missed them. | ||
57 | */ | ||
53 | 58 | ||
54 | preempt_disable(); | 59 | smp_mb(); /* A matches D */ |
55 | success = rcu_sync_is_idle(&brw->rss); | ||
56 | if (likely(success)) | ||
57 | __this_cpu_add(*brw->fast_read_ctr, val); | ||
58 | preempt_enable(); | ||
59 | 60 | ||
60 | return success; | 61 | /* |
61 | } | 62 | * If !readers_block the critical section starts here, matched by the |
63 | * release in percpu_up_write(). | ||
64 | */ | ||
65 | if (likely(!smp_load_acquire(&sem->readers_block))) | ||
66 | return 1; | ||
62 | 67 | ||
63 | /* | 68 | /* |
64 | * Like the normal down_read() this is not recursive, the writer can | 69 | * Per the above comment; we still have preemption disabled and |
65 | * come after the first percpu_down_read() and create the deadlock. | 70 | * will thus decrement on the same CPU as we incremented. |
66 | * | 71 | */ |
67 | * Note: returns with lock_is_held(brw->rw_sem) == T for lockdep, | 72 | __percpu_up_read(sem); |
68 | * percpu_up_read() does rwsem_release(). This pairs with the usage | ||
69 | * of ->rw_sem in percpu_down/up_write(). | ||
70 | */ | ||
71 | void percpu_down_read(struct percpu_rw_semaphore *brw) | ||
72 | { | ||
73 | might_sleep(); | ||
74 | rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_); | ||
75 | 73 | ||
76 | if (likely(update_fast_ctr(brw, +1))) | 74 | if (try) |
77 | return; | 75 | return 0; |
78 | 76 | ||
79 | /* Avoid rwsem_acquire_read() and rwsem_release() */ | 77 | /* |
80 | __down_read(&brw->rw_sem); | 78 | * We either call schedule() in the wait, or we'll fall through |
81 | atomic_inc(&brw->slow_read_ctr); | 79 | * and reschedule on the preempt_enable() in percpu_down_read(). |
82 | __up_read(&brw->rw_sem); | 80 | */ |
83 | } | 81 | preempt_enable_no_resched(); |
84 | EXPORT_SYMBOL_GPL(percpu_down_read); | ||
85 | 82 | ||
86 | int percpu_down_read_trylock(struct percpu_rw_semaphore *brw) | 83 | /* |
87 | { | 84 | * Avoid lockdep for the down/up_read() we already have them. |
88 | if (unlikely(!update_fast_ctr(brw, +1))) { | 85 | */ |
89 | if (!__down_read_trylock(&brw->rw_sem)) | 86 | __down_read(&sem->rw_sem); |
90 | return 0; | 87 | this_cpu_inc(*sem->read_count); |
91 | atomic_inc(&brw->slow_read_ctr); | 88 | __up_read(&sem->rw_sem); |
92 | __up_read(&brw->rw_sem); | 89 | |
93 | } | 90 | preempt_disable(); |
94 | |||
95 | rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 1, _RET_IP_); | ||
96 | return 1; | 91 | return 1; |
97 | } | 92 | } |
93 | EXPORT_SYMBOL_GPL(__percpu_down_read); | ||
98 | 94 | ||
99 | void percpu_up_read(struct percpu_rw_semaphore *brw) | 95 | void __percpu_up_read(struct percpu_rw_semaphore *sem) |
100 | { | 96 | { |
101 | rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_); | 97 | smp_mb(); /* B matches C */ |
102 | 98 | /* | |
103 | if (likely(update_fast_ctr(brw, -1))) | 99 | * In other words, if they see our decrement (presumably to aggregate |
104 | return; | 100 | * zero, as that is the only time it matters) they will also see our |
101 | * critical section. | ||
102 | */ | ||
103 | __this_cpu_dec(*sem->read_count); | ||
105 | 104 | ||
106 | /* false-positive is possible but harmless */ | 105 | /* Prod writer to recheck readers_active */ |
107 | if (atomic_dec_and_test(&brw->slow_read_ctr)) | 106 | wake_up(&sem->writer); |
108 | wake_up_all(&brw->write_waitq); | ||
109 | } | 107 | } |
110 | EXPORT_SYMBOL_GPL(percpu_up_read); | 108 | EXPORT_SYMBOL_GPL(__percpu_up_read); |
109 | |||
110 | #define per_cpu_sum(var) \ | ||
111 | ({ \ | ||
112 | typeof(var) __sum = 0; \ | ||
113 | int cpu; \ | ||
114 | compiletime_assert_atomic_type(__sum); \ | ||
115 | for_each_possible_cpu(cpu) \ | ||
116 | __sum += per_cpu(var, cpu); \ | ||
117 | __sum; \ | ||
118 | }) | ||
111 | 119 | ||
112 | static int clear_fast_ctr(struct percpu_rw_semaphore *brw) | 120 | /* |
121 | * Return true if the modular sum of the sem->read_count per-CPU variable is | ||
122 | * zero. If this sum is zero, then it is stable due to the fact that if any | ||
123 | * newly arriving readers increment a given counter, they will immediately | ||
124 | * decrement that same counter. | ||
125 | */ | ||
126 | static bool readers_active_check(struct percpu_rw_semaphore *sem) | ||
113 | { | 127 | { |
114 | unsigned int sum = 0; | 128 | if (per_cpu_sum(*sem->read_count) != 0) |
115 | int cpu; | 129 | return false; |
130 | |||
131 | /* | ||
132 | * If we observed the decrement; ensure we see the entire critical | ||
133 | * section. | ||
134 | */ | ||
116 | 135 | ||
117 | for_each_possible_cpu(cpu) { | 136 | smp_mb(); /* C matches B */ |
118 | sum += per_cpu(*brw->fast_read_ctr, cpu); | ||
119 | per_cpu(*brw->fast_read_ctr, cpu) = 0; | ||
120 | } | ||
121 | 137 | ||
122 | return sum; | 138 | return true; |
123 | } | 139 | } |
124 | 140 | ||
125 | void percpu_down_write(struct percpu_rw_semaphore *brw) | 141 | void percpu_down_write(struct percpu_rw_semaphore *sem) |
126 | { | 142 | { |
143 | /* Notify readers to take the slow path. */ | ||
144 | rcu_sync_enter(&sem->rss); | ||
145 | |||
146 | down_write(&sem->rw_sem); | ||
147 | |||
127 | /* | 148 | /* |
128 | * Make rcu_sync_is_idle() == F and thus disable the fast-path in | 149 | * Notify new readers to block; up until now, and thus throughout the |
129 | * percpu_down_read() and percpu_up_read(), and wait for gp pass. | 150 | * longish rcu_sync_enter() above, new readers could still come in. |
130 | * | ||
131 | * The latter synchronises us with the preceding readers which used | ||
132 | * the fast-past, so we can not miss the result of __this_cpu_add() | ||
133 | * or anything else inside their criticial sections. | ||
134 | */ | 151 | */ |
135 | rcu_sync_enter(&brw->rss); | 152 | WRITE_ONCE(sem->readers_block, 1); |
136 | 153 | ||
137 | /* exclude other writers, and block the new readers completely */ | 154 | smp_mb(); /* D matches A */ |
138 | down_write(&brw->rw_sem); | ||
139 | 155 | ||
140 | /* nobody can use fast_read_ctr, move its sum into slow_read_ctr */ | 156 | /* |
141 | atomic_add(clear_fast_ctr(brw), &brw->slow_read_ctr); | 157 | * If they don't see our write of readers_block, then we are |
158 | * guaranteed to see their sem->read_count increment, and therefore | ||
159 | * will wait for them. | ||
160 | */ | ||
142 | 161 | ||
143 | /* wait for all readers to complete their percpu_up_read() */ | 162 | /* Wait for all now active readers to complete. */ |
144 | wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr)); | 163 | wait_event(sem->writer, readers_active_check(sem)); |
145 | } | 164 | } |
146 | EXPORT_SYMBOL_GPL(percpu_down_write); | 165 | EXPORT_SYMBOL_GPL(percpu_down_write); |
147 | 166 | ||
148 | void percpu_up_write(struct percpu_rw_semaphore *brw) | 167 | void percpu_up_write(struct percpu_rw_semaphore *sem) |
149 | { | 168 | { |
150 | /* release the lock, but the readers can't use the fast-path */ | ||
151 | up_write(&brw->rw_sem); | ||
152 | /* | 169 | /* |
153 | * Enable the fast-path in percpu_down_read() and percpu_up_read() | 170 | * Signal the writer is done, no fast path yet. |
154 | * but only after another gp pass; this adds the necessary barrier | 171 | * |
155 | * to ensure the reader can't miss the changes done by us. | 172 | * One reason that we cannot just immediately flip to readers_fast is |
173 | * that new readers might fail to see the results of this writer's | ||
174 | * critical section. | ||
175 | * | ||
176 | * Therefore we force it through the slow path which guarantees an | ||
177 | * acquire and thereby guarantees the critical section's consistency. | ||
178 | */ | ||
179 | smp_store_release(&sem->readers_block, 0); | ||
180 | |||
181 | /* | ||
182 | * Release the write lock, this will allow readers back in the game. | ||
183 | */ | ||
184 | up_write(&sem->rw_sem); | ||
185 | |||
186 | /* | ||
187 | * Once this completes (at least one RCU-sched grace period hence) the | ||
188 | * reader fast path will be available again. Safe to use outside the | ||
189 | * exclusive write lock because it is counting. | ||
156 | */ | 190 | */ |
157 | rcu_sync_exit(&brw->rss); | 191 | rcu_sync_exit(&sem->rss); |
158 | } | 192 | } |
159 | EXPORT_SYMBOL_GPL(percpu_up_write); | 193 | EXPORT_SYMBOL_GPL(percpu_up_write); |
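To make the reader/writer pairing concrete, a minimal usage sketch of the percpu_rw_semaphore API; the semaphore name and critical sections are illustrative, and percpu_down_read()/percpu_up_read() are the inline fast-path wrappers in <linux/percpu-rwsem.h> around the __percpu_down_read()/__percpu_up_read() slow paths above:

	#include <linux/percpu-rwsem.h>

	static struct percpu_rw_semaphore my_rwsem;

	static int my_setup(void)
	{
		return percpu_init_rwsem(&my_rwsem);	/* allocates the per-CPU read_count */
	}

	static void reader_side(void)
	{
		percpu_down_read(&my_rwsem);	/* fast path: per-CPU increment, no shared atomics */
		/* read-side critical section */
		percpu_up_read(&my_rwsem);
	}

	static void writer_side(void)
	{
		percpu_down_write(&my_rwsem);	/* flips readers to the slow path and waits for them */
		/* write-side critical section */
		percpu_up_write(&my_rwsem);
	}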
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 8a99abf58080..e3b5520005db 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h | |||
@@ -70,11 +70,14 @@ struct pv_node { | |||
70 | static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock) | 70 | static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock) |
71 | { | 71 | { |
72 | struct __qspinlock *l = (void *)lock; | 72 | struct __qspinlock *l = (void *)lock; |
73 | int ret = !(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) && | ||
74 | (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0); | ||
75 | 73 | ||
76 | qstat_inc(qstat_pv_lock_stealing, ret); | 74 | if (!(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) && |
77 | return ret; | 75 | (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0)) { |
76 | qstat_inc(qstat_pv_lock_stealing, true); | ||
77 | return true; | ||
78 | } | ||
79 | |||
80 | return false; | ||
78 | } | 81 | } |
79 | 82 | ||
80 | /* | 83 | /* |
@@ -257,7 +260,6 @@ static struct pv_node *pv_unhash(struct qspinlock *lock) | |||
257 | static inline bool | 260 | static inline bool |
258 | pv_wait_early(struct pv_node *prev, int loop) | 261 | pv_wait_early(struct pv_node *prev, int loop) |
259 | { | 262 | { |
260 | |||
261 | if ((loop & PV_PREV_CHECK_MASK) != 0) | 263 | if ((loop & PV_PREV_CHECK_MASK) != 0) |
262 | return false; | 264 | return false; |
263 | 265 | ||
@@ -286,12 +288,10 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev) | |||
286 | { | 288 | { |
287 | struct pv_node *pn = (struct pv_node *)node; | 289 | struct pv_node *pn = (struct pv_node *)node; |
288 | struct pv_node *pp = (struct pv_node *)prev; | 290 | struct pv_node *pp = (struct pv_node *)prev; |
289 | int waitcnt = 0; | ||
290 | int loop; | 291 | int loop; |
291 | bool wait_early; | 292 | bool wait_early; |
292 | 293 | ||
293 | /* waitcnt processing will be compiled out if !QUEUED_LOCK_STAT */ | 294 | for (;;) { |
294 | for (;; waitcnt++) { | ||
295 | for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) { | 295 | for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) { |
296 | if (READ_ONCE(node->locked)) | 296 | if (READ_ONCE(node->locked)) |
297 | return; | 297 | return; |
@@ -315,7 +315,6 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev) | |||
315 | 315 | ||
316 | if (!READ_ONCE(node->locked)) { | 316 | if (!READ_ONCE(node->locked)) { |
317 | qstat_inc(qstat_pv_wait_node, true); | 317 | qstat_inc(qstat_pv_wait_node, true); |
318 | qstat_inc(qstat_pv_wait_again, waitcnt); | ||
319 | qstat_inc(qstat_pv_wait_early, wait_early); | 318 | qstat_inc(qstat_pv_wait_early, wait_early); |
320 | pv_wait(&pn->state, vcpu_halted); | 319 | pv_wait(&pn->state, vcpu_halted); |
321 | } | 320 | } |
@@ -456,12 +455,9 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) | |||
456 | pv_wait(&l->locked, _Q_SLOW_VAL); | 455 | pv_wait(&l->locked, _Q_SLOW_VAL); |
457 | 456 | ||
458 | /* | 457 | /* |
459 | * The unlocker should have freed the lock before kicking the | 458 | * Because of lock stealing, the queue head vCPU may not be |
460 | * CPU. So if the lock is still not free, it is a spurious | 459 | * able to acquire the lock before it has to wait again. |
461 | * wakeup or another vCPU has stolen the lock. The current | ||
462 | * vCPU should spin again. | ||
463 | */ | 460 | */ |
464 | qstat_inc(qstat_pv_spurious_wakeup, READ_ONCE(l->locked)); | ||
465 | } | 461 | } |
466 | 462 | ||
467 | /* | 463 | /* |
@@ -544,7 +540,7 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock) | |||
544 | * unhash. Otherwise it would be possible to have multiple @lock | 540 | * unhash. Otherwise it would be possible to have multiple @lock |
545 | * entries, which would be BAD. | 541 | * entries, which would be BAD. |
546 | */ | 542 | */ |
547 | locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0); | 543 | locked = cmpxchg_release(&l->locked, _Q_LOCKED_VAL, 0); |
548 | if (likely(locked == _Q_LOCKED_VAL)) | 544 | if (likely(locked == _Q_LOCKED_VAL)) |
549 | return; | 545 | return; |
550 | 546 | ||
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index b9d031516254..eb0a599fcf58 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h | |||
@@ -24,8 +24,8 @@ | |||
24 | * pv_latency_wake - average latency (ns) from vCPU kick to wakeup | 24 | * pv_latency_wake - average latency (ns) from vCPU kick to wakeup |
25 | * pv_lock_slowpath - # of locking operations via the slowpath | 25 | * pv_lock_slowpath - # of locking operations via the slowpath |
26 | * pv_lock_stealing - # of lock stealing operations | 26 | * pv_lock_stealing - # of lock stealing operations |
27 | * pv_spurious_wakeup - # of spurious wakeups | 27 | * pv_spurious_wakeup - # of spurious wakeups in non-head vCPUs |
28 | * pv_wait_again - # of vCPU wait's that happened after a vCPU kick | 28 | * pv_wait_again - # of wait's after a queue head vCPU kick |
29 | * pv_wait_early - # of early vCPU wait's | 29 | * pv_wait_early - # of early vCPU wait's |
30 | * pv_wait_head - # of vCPU wait's at the queue head | 30 | * pv_wait_head - # of vCPU wait's at the queue head |
31 | * pv_wait_node - # of vCPU wait's at a non-head queue node | 31 | * pv_wait_node - # of vCPU wait's at a non-head queue node |
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 447e08de1fab..2337b4bb2366 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c | |||
@@ -121,16 +121,19 @@ enum rwsem_wake_type { | |||
121 | * - woken process blocks are discarded from the list after having task zeroed | 121 | * - woken process blocks are discarded from the list after having task zeroed |
122 | * - writers are only marked woken if downgrading is false | 122 | * - writers are only marked woken if downgrading is false |
123 | */ | 123 | */ |
124 | static struct rw_semaphore * | 124 | static void __rwsem_mark_wake(struct rw_semaphore *sem, |
125 | __rwsem_mark_wake(struct rw_semaphore *sem, | 125 | enum rwsem_wake_type wake_type, |
126 | enum rwsem_wake_type wake_type, struct wake_q_head *wake_q) | 126 | struct wake_q_head *wake_q) |
127 | { | 127 | { |
128 | struct rwsem_waiter *waiter; | 128 | struct rwsem_waiter *waiter, *tmp; |
129 | struct task_struct *tsk; | 129 | long oldcount, woken = 0, adjustment = 0; |
130 | struct list_head *next; | 130 | |
131 | long oldcount, woken, loop, adjustment; | 131 | /* |
132 | * Take a peek at the queue head waiter such that we can determine | ||
133 | * the wakeup(s) to perform. | ||
134 | */ | ||
135 | waiter = list_first_entry(&sem->wait_list, struct rwsem_waiter, list); | ||
132 | 136 | ||
133 | waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); | ||
134 | if (waiter->type == RWSEM_WAITING_FOR_WRITE) { | 137 | if (waiter->type == RWSEM_WAITING_FOR_WRITE) { |
135 | if (wake_type == RWSEM_WAKE_ANY) { | 138 | if (wake_type == RWSEM_WAKE_ANY) { |
136 | /* | 139 | /* |
@@ -142,19 +145,19 @@ __rwsem_mark_wake(struct rw_semaphore *sem, | |||
142 | */ | 145 | */ |
143 | wake_q_add(wake_q, waiter->task); | 146 | wake_q_add(wake_q, waiter->task); |
144 | } | 147 | } |
145 | goto out; | 148 | |
149 | return; | ||
146 | } | 150 | } |
147 | 151 | ||
148 | /* Writers might steal the lock before we grant it to the next reader. | 152 | /* |
153 | * Writers might steal the lock before we grant it to the next reader. | ||
149 | * We prefer to do the first reader grant before counting readers | 154 | * We prefer to do the first reader grant before counting readers |
150 | * so we can bail out early if a writer stole the lock. | 155 | * so we can bail out early if a writer stole the lock. |
151 | */ | 156 | */ |
152 | adjustment = 0; | ||
153 | if (wake_type != RWSEM_WAKE_READ_OWNED) { | 157 | if (wake_type != RWSEM_WAKE_READ_OWNED) { |
154 | adjustment = RWSEM_ACTIVE_READ_BIAS; | 158 | adjustment = RWSEM_ACTIVE_READ_BIAS; |
155 | try_reader_grant: | 159 | try_reader_grant: |
156 | oldcount = atomic_long_fetch_add(adjustment, &sem->count); | 160 | oldcount = atomic_long_fetch_add(adjustment, &sem->count); |
157 | |||
158 | if (unlikely(oldcount < RWSEM_WAITING_BIAS)) { | 161 | if (unlikely(oldcount < RWSEM_WAITING_BIAS)) { |
159 | /* | 162 | /* |
160 | * If the count is still less than RWSEM_WAITING_BIAS | 163 | * If the count is still less than RWSEM_WAITING_BIAS |
@@ -164,7 +167,8 @@ __rwsem_mark_wake(struct rw_semaphore *sem, | |||
164 | */ | 167 | */ |
165 | if (atomic_long_add_return(-adjustment, &sem->count) < | 168 | if (atomic_long_add_return(-adjustment, &sem->count) < |
166 | RWSEM_WAITING_BIAS) | 169 | RWSEM_WAITING_BIAS) |
167 | goto out; | 170 | return; |
171 | |||
168 | /* Last active locker left. Retry waking readers. */ | 172 | /* Last active locker left. Retry waking readers. */ |
169 | goto try_reader_grant; | 173 | goto try_reader_grant; |
170 | } | 174 | } |
@@ -176,38 +180,23 @@ __rwsem_mark_wake(struct rw_semaphore *sem, | |||
176 | rwsem_set_reader_owned(sem); | 180 | rwsem_set_reader_owned(sem); |
177 | } | 181 | } |
178 | 182 | ||
179 | /* Grant an infinite number of read locks to the readers at the front | 183 | /* |
180 | * of the queue. Note we increment the 'active part' of the count by | 184 | * Grant an infinite number of read locks to the readers at the front |
181 | * the number of readers before waking any processes up. | 185 | * of the queue. We know that woken will be at least 1 as we accounted |
186 | * for above. Note we increment the 'active part' of the count by the | ||
187 | * number of readers before waking any processes up. | ||
182 | */ | 188 | */ |
183 | woken = 0; | 189 | list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) { |
184 | do { | 190 | struct task_struct *tsk; |
185 | woken++; | ||
186 | 191 | ||
187 | if (waiter->list.next == &sem->wait_list) | 192 | if (waiter->type == RWSEM_WAITING_FOR_WRITE) |
188 | break; | 193 | break; |
189 | 194 | ||
190 | waiter = list_entry(waiter->list.next, | 195 | woken++; |
191 | struct rwsem_waiter, list); | ||
192 | |||
193 | } while (waiter->type != RWSEM_WAITING_FOR_WRITE); | ||
194 | |||
195 | adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; | ||
196 | if (waiter->type != RWSEM_WAITING_FOR_WRITE) | ||
197 | /* hit end of list above */ | ||
198 | adjustment -= RWSEM_WAITING_BIAS; | ||
199 | |||
200 | if (adjustment) | ||
201 | atomic_long_add(adjustment, &sem->count); | ||
202 | |||
203 | next = sem->wait_list.next; | ||
204 | loop = woken; | ||
205 | do { | ||
206 | waiter = list_entry(next, struct rwsem_waiter, list); | ||
207 | next = waiter->list.next; | ||
208 | tsk = waiter->task; | 196 | tsk = waiter->task; |
209 | 197 | ||
210 | wake_q_add(wake_q, tsk); | 198 | wake_q_add(wake_q, tsk); |
199 | list_del(&waiter->list); | ||
211 | /* | 200 | /* |
212 | * Ensure that the last operation is setting the reader | 201 | * Ensure that the last operation is setting the reader |
213 | * waiter to nil such that rwsem_down_read_failed() cannot | 202 | * waiter to nil such that rwsem_down_read_failed() cannot |
@@ -215,13 +204,16 @@ __rwsem_mark_wake(struct rw_semaphore *sem, | |||
215 | * to the task to wakeup. | 204 | * to the task to wakeup. |
216 | */ | 205 | */ |
217 | smp_store_release(&waiter->task, NULL); | 206 | smp_store_release(&waiter->task, NULL); |
218 | } while (--loop); | 207 | } |
219 | 208 | ||
220 | sem->wait_list.next = next; | 209 | adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; |
221 | next->prev = &sem->wait_list; | 210 | if (list_empty(&sem->wait_list)) { |
211 | /* hit end of list above */ | ||
212 | adjustment -= RWSEM_WAITING_BIAS; | ||
213 | } | ||
222 | 214 | ||
223 | out: | 215 | if (adjustment) |
224 | return sem; | 216 | atomic_long_add(adjustment, &sem->count); |
225 | } | 217 | } |
226 | 218 | ||
227 | /* | 219 | /* |
@@ -235,7 +227,6 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) | |||
235 | struct task_struct *tsk = current; | 227 | struct task_struct *tsk = current; |
236 | WAKE_Q(wake_q); | 228 | WAKE_Q(wake_q); |
237 | 229 | ||
238 | /* set up my own style of waitqueue */ | ||
239 | waiter.task = tsk; | 230 | waiter.task = tsk; |
240 | waiter.type = RWSEM_WAITING_FOR_READ; | 231 | waiter.type = RWSEM_WAITING_FOR_READ; |
241 | 232 | ||
@@ -247,7 +238,8 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) | |||
247 | /* we're now waiting on the lock, but no longer actively locking */ | 238 | /* we're now waiting on the lock, but no longer actively locking */ |
248 | count = atomic_long_add_return(adjustment, &sem->count); | 239 | count = atomic_long_add_return(adjustment, &sem->count); |
249 | 240 | ||
250 | /* If there are no active locks, wake the front queued process(es). | 241 | /* |
242 | * If there are no active locks, wake the front queued process(es). | ||
251 | * | 243 | * |
252 | * If there are no writers and we are first in the queue, | 244 | * If there are no writers and we are first in the queue, |
253 | * wake our own waiter to join the existing active readers ! | 245 | * wake our own waiter to join the existing active readers ! |
@@ -255,7 +247,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) | |||
255 | if (count == RWSEM_WAITING_BIAS || | 247 | if (count == RWSEM_WAITING_BIAS || |
256 | (count > RWSEM_WAITING_BIAS && | 248 | (count > RWSEM_WAITING_BIAS && |
257 | adjustment != -RWSEM_ACTIVE_READ_BIAS)) | 249 | adjustment != -RWSEM_ACTIVE_READ_BIAS)) |
258 | sem = __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); | 250 | __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); |
259 | 251 | ||
260 | raw_spin_unlock_irq(&sem->wait_lock); | 252 | raw_spin_unlock_irq(&sem->wait_lock); |
261 | wake_up_q(&wake_q); | 253 | wake_up_q(&wake_q); |
@@ -505,7 +497,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) | |||
505 | if (count > RWSEM_WAITING_BIAS) { | 497 | if (count > RWSEM_WAITING_BIAS) { |
506 | WAKE_Q(wake_q); | 498 | WAKE_Q(wake_q); |
507 | 499 | ||
508 | sem = __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q); | 500 | __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q); |
509 | /* | 501 | /* |
510 | * The wakeup is normally called _after_ the wait_lock | 502 | * The wakeup is normally called _after_ the wait_lock |
511 | * is released, but given that we are proactively waking | 503 | * is released, but given that we are proactively waking |
@@ -614,9 +606,8 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) | |||
614 | raw_spin_lock_irqsave(&sem->wait_lock, flags); | 606 | raw_spin_lock_irqsave(&sem->wait_lock, flags); |
615 | locked: | 607 | locked: |
616 | 608 | ||
617 | /* do nothing if list empty */ | ||
618 | if (!list_empty(&sem->wait_list)) | 609 | if (!list_empty(&sem->wait_list)) |
619 | sem = __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); | 610 | __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); |
620 | 611 | ||
621 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); | 612 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); |
622 | wake_up_q(&wake_q); | 613 | wake_up_q(&wake_q); |
@@ -638,9 +629,8 @@ struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) | |||
638 | 629 | ||
639 | raw_spin_lock_irqsave(&sem->wait_lock, flags); | 630 | raw_spin_lock_irqsave(&sem->wait_lock, flags); |
640 | 631 | ||
641 | /* do nothing if list empty */ | ||
642 | if (!list_empty(&sem->wait_list)) | 632 | if (!list_empty(&sem->wait_list)) |
643 | sem = __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q); | 633 | __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q); |
644 | 634 | ||
645 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); | 635 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); |
646 | wake_up_q(&wake_q); | 636 | wake_up_q(&wake_q); |
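Note on the rwsem hunks above: __rwsem_mark_wake() now only collects waiters onto an on-stack wake_q under sem->wait_lock (one list_for_each_entry_safe() pass that counts readers, queues them, and unlinks them), and the actual wakeups happen after the lock is dropped, so the function no longer needs to return the semaphore. A condensed sketch of that calling pattern, essentially rwsem_wake() as patched above minus the fast path that precedes its locked: label:

    struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
    {
            unsigned long flags;
            WAKE_Q(wake_q);                         /* on-stack wake queue */

            raw_spin_lock_irqsave(&sem->wait_lock, flags);
            if (!list_empty(&sem->wait_list))
                    __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
            raw_spin_unlock_irqrestore(&sem->wait_lock, flags);

            wake_up_q(&wake_q);                     /* wakeups outside the wait_lock */
            return sem;
    }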
diff --git a/kernel/module.c b/kernel/module.c index 529efae9f481..f57dd63186e6 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -1149,6 +1149,8 @@ static size_t module_flags_taint(struct module *mod, char *buf) | |||
1149 | buf[l++] = 'C'; | 1149 | buf[l++] = 'C'; |
1150 | if (mod->taints & (1 << TAINT_UNSIGNED_MODULE)) | 1150 | if (mod->taints & (1 << TAINT_UNSIGNED_MODULE)) |
1151 | buf[l++] = 'E'; | 1151 | buf[l++] = 'E'; |
1152 | if (mod->taints & (1 << TAINT_LIVEPATCH)) | ||
1153 | buf[l++] = 'K'; | ||
1152 | /* | 1154 | /* |
1153 | * TAINT_FORCED_RMMOD: could be added. | 1155 | * TAINT_FORCED_RMMOD: could be added. |
1154 | * TAINT_CPU_OUT_OF_SPEC, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't | 1156 | * TAINT_CPU_OUT_OF_SPEC, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't |
@@ -2792,14 +2794,17 @@ static int copy_chunked_from_user(void *dst, const void __user *usrc, unsigned l | |||
2792 | } | 2794 | } |
2793 | 2795 | ||
2794 | #ifdef CONFIG_LIVEPATCH | 2796 | #ifdef CONFIG_LIVEPATCH |
2795 | static int find_livepatch_modinfo(struct module *mod, struct load_info *info) | 2797 | static int check_modinfo_livepatch(struct module *mod, struct load_info *info) |
2796 | { | 2798 | { |
2797 | mod->klp = get_modinfo(info, "livepatch") ? true : false; | 2799 | if (get_modinfo(info, "livepatch")) { |
2800 | mod->klp = true; | ||
2801 | add_taint_module(mod, TAINT_LIVEPATCH, LOCKDEP_STILL_OK); | ||
2802 | } | ||
2798 | 2803 | ||
2799 | return 0; | 2804 | return 0; |
2800 | } | 2805 | } |
2801 | #else /* !CONFIG_LIVEPATCH */ | 2806 | #else /* !CONFIG_LIVEPATCH */ |
2802 | static int find_livepatch_modinfo(struct module *mod, struct load_info *info) | 2807 | static int check_modinfo_livepatch(struct module *mod, struct load_info *info) |
2803 | { | 2808 | { |
2804 | if (get_modinfo(info, "livepatch")) { | 2809 | if (get_modinfo(info, "livepatch")) { |
2805 | pr_err("%s: module is marked as livepatch module, but livepatch support is disabled", | 2810 | pr_err("%s: module is marked as livepatch module, but livepatch support is disabled", |
@@ -2969,7 +2974,7 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags) | |||
2969 | "is unknown, you have been warned.\n", mod->name); | 2974 | "is unknown, you have been warned.\n", mod->name); |
2970 | } | 2975 | } |
2971 | 2976 | ||
2972 | err = find_livepatch_modinfo(mod, info); | 2977 | err = check_modinfo_livepatch(mod, info); |
2973 | if (err) | 2978 | if (err) |
2974 | return err; | 2979 | return err; |
2975 | 2980 | ||
diff --git a/kernel/padata.c b/kernel/padata.c index 993278895ccc..7848f0566403 100644 --- a/kernel/padata.c +++ b/kernel/padata.c | |||
@@ -30,6 +30,7 @@ | |||
30 | #include <linux/slab.h> | 30 | #include <linux/slab.h> |
31 | #include <linux/sysfs.h> | 31 | #include <linux/sysfs.h> |
32 | #include <linux/rcupdate.h> | 32 | #include <linux/rcupdate.h> |
33 | #include <linux/module.h> | ||
33 | 34 | ||
34 | #define MAX_OBJ_NUM 1000 | 35 | #define MAX_OBJ_NUM 1000 |
35 | 36 | ||
@@ -769,52 +770,43 @@ static inline int pinst_has_cpu(struct padata_instance *pinst, int cpu) | |||
769 | cpumask_test_cpu(cpu, pinst->cpumask.cbcpu); | 770 | cpumask_test_cpu(cpu, pinst->cpumask.cbcpu); |
770 | } | 771 | } |
771 | 772 | ||
772 | 773 | static int padata_cpu_online(unsigned int cpu, struct hlist_node *node) | |
773 | static int padata_cpu_callback(struct notifier_block *nfb, | ||
774 | unsigned long action, void *hcpu) | ||
775 | { | 774 | { |
776 | int err; | ||
777 | struct padata_instance *pinst; | 775 | struct padata_instance *pinst; |
778 | int cpu = (unsigned long)hcpu; | 776 | int ret; |
779 | 777 | ||
780 | pinst = container_of(nfb, struct padata_instance, cpu_notifier); | 778 | pinst = hlist_entry_safe(node, struct padata_instance, node); |
779 | if (!pinst_has_cpu(pinst, cpu)) | ||
780 | return 0; | ||
781 | 781 | ||
782 | switch (action) { | 782 | mutex_lock(&pinst->lock); |
783 | case CPU_ONLINE: | 783 | ret = __padata_add_cpu(pinst, cpu); |
784 | case CPU_ONLINE_FROZEN: | 784 | mutex_unlock(&pinst->lock); |
785 | case CPU_DOWN_FAILED: | 785 | return ret; |
786 | case CPU_DOWN_FAILED_FROZEN: | 786 | } |
787 | if (!pinst_has_cpu(pinst, cpu)) | ||
788 | break; | ||
789 | mutex_lock(&pinst->lock); | ||
790 | err = __padata_add_cpu(pinst, cpu); | ||
791 | mutex_unlock(&pinst->lock); | ||
792 | if (err) | ||
793 | return notifier_from_errno(err); | ||
794 | break; | ||
795 | 787 | ||
796 | case CPU_DOWN_PREPARE: | 788 | static int padata_cpu_prep_down(unsigned int cpu, struct hlist_node *node) |
797 | case CPU_DOWN_PREPARE_FROZEN: | 789 | { |
798 | case CPU_UP_CANCELED: | 790 | struct padata_instance *pinst; |
799 | case CPU_UP_CANCELED_FROZEN: | 791 | int ret; |
800 | if (!pinst_has_cpu(pinst, cpu)) | 792 | |
801 | break; | 793 | pinst = hlist_entry_safe(node, struct padata_instance, node); |
802 | mutex_lock(&pinst->lock); | 794 | if (!pinst_has_cpu(pinst, cpu)) |
803 | err = __padata_remove_cpu(pinst, cpu); | 795 | return 0; |
804 | mutex_unlock(&pinst->lock); | ||
805 | if (err) | ||
806 | return notifier_from_errno(err); | ||
807 | break; | ||
808 | } | ||
809 | 796 | ||
810 | return NOTIFY_OK; | 797 | mutex_lock(&pinst->lock); |
798 | ret = __padata_remove_cpu(pinst, cpu); | ||
799 | mutex_unlock(&pinst->lock); | ||
800 | return ret; | ||
811 | } | 801 | } |
802 | |||
803 | static enum cpuhp_state hp_online; | ||
812 | #endif | 804 | #endif |
813 | 805 | ||
814 | static void __padata_free(struct padata_instance *pinst) | 806 | static void __padata_free(struct padata_instance *pinst) |
815 | { | 807 | { |
816 | #ifdef CONFIG_HOTPLUG_CPU | 808 | #ifdef CONFIG_HOTPLUG_CPU |
817 | unregister_hotcpu_notifier(&pinst->cpu_notifier); | 809 | cpuhp_state_remove_instance_nocalls(hp_online, &pinst->node); |
818 | #endif | 810 | #endif |
819 | 811 | ||
820 | padata_stop(pinst); | 812 | padata_stop(pinst); |
@@ -1012,11 +1004,8 @@ struct padata_instance *padata_alloc(struct workqueue_struct *wq, | |||
1012 | mutex_init(&pinst->lock); | 1004 | mutex_init(&pinst->lock); |
1013 | 1005 | ||
1014 | #ifdef CONFIG_HOTPLUG_CPU | 1006 | #ifdef CONFIG_HOTPLUG_CPU |
1015 | pinst->cpu_notifier.notifier_call = padata_cpu_callback; | 1007 | cpuhp_state_add_instance_nocalls(hp_online, &pinst->node); |
1016 | pinst->cpu_notifier.priority = 0; | ||
1017 | register_hotcpu_notifier(&pinst->cpu_notifier); | ||
1018 | #endif | 1008 | #endif |
1019 | |||
1020 | return pinst; | 1009 | return pinst; |
1021 | 1010 | ||
1022 | err_free_masks: | 1011 | err_free_masks: |
@@ -1039,3 +1028,26 @@ void padata_free(struct padata_instance *pinst) | |||
1039 | kobject_put(&pinst->kobj); | 1028 | kobject_put(&pinst->kobj); |
1040 | } | 1029 | } |
1041 | EXPORT_SYMBOL(padata_free); | 1030 | EXPORT_SYMBOL(padata_free); |
1031 | |||
1032 | #ifdef CONFIG_HOTPLUG_CPU | ||
1033 | |||
1034 | static __init int padata_driver_init(void) | ||
1035 | { | ||
1036 | int ret; | ||
1037 | |||
1038 | ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "padata:online", | ||
1039 | padata_cpu_online, | ||
1040 | padata_cpu_prep_down); | ||
1041 | if (ret < 0) | ||
1042 | return ret; | ||
1043 | hp_online = ret; | ||
1044 | return 0; | ||
1045 | } | ||
1046 | module_init(padata_driver_init); | ||
1047 | |||
1048 | static __exit void padata_driver_exit(void) | ||
1049 | { | ||
1050 | cpuhp_remove_multi_state(hp_online); | ||
1051 | } | ||
1052 | module_exit(padata_driver_exit); | ||
1053 | #endif | ||
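The padata conversion above is an instance of the multi-instance cpuhp pattern: the state is registered once with cpuhp_setup_state_multi(), each padata_instance is attached through its hlist_node, and the callbacks get that node handed back. A generic sketch of the same pattern, with hypothetical "foo" names standing in for any subsystem that keeps several instances:

    #include <linux/cpuhotplug.h>
    #include <linux/list.h>

    static enum cpuhp_state foo_hp_online;          /* dynamic state id */

    struct foo_instance {
            struct hlist_node node;
            /* per-instance data */
    };

    static int foo_cpu_online(unsigned int cpu, struct hlist_node *node)
    {
            struct foo_instance *inst;

            inst = hlist_entry_safe(node, struct foo_instance, node);
            /* per-instance bring-up work for @cpu goes here */
            return 0;
    }

    static int foo_cpu_prep_down(unsigned int cpu, struct hlist_node *node)
    {
            /* per-instance teardown before @cpu goes away */
            return 0;
    }

    static int __init foo_init(void)
    {
            int ret;

            ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "foo:online",
                                          foo_cpu_online, foo_cpu_prep_down);
            if (ret < 0)
                    return ret;
            foo_hp_online = ret;    /* CPUHP_AP_ONLINE_DYN hands back the state */
            return 0;
    }

    /* per instance, mirroring padata_alloc()/__padata_free() above */
    static int foo_instance_register(struct foo_instance *inst)
    {
            return cpuhp_state_add_instance_nocalls(foo_hp_online, &inst->node);
    }

    static void foo_instance_unregister(struct foo_instance *inst)
    {
            cpuhp_state_remove_instance_nocalls(foo_hp_online, &inst->node);
    }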
diff --git a/kernel/panic.c b/kernel/panic.c index ca8cea1ef673..e6480e20379e 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
@@ -71,6 +71,32 @@ void __weak nmi_panic_self_stop(struct pt_regs *regs) | |||
71 | panic_smp_self_stop(); | 71 | panic_smp_self_stop(); |
72 | } | 72 | } |
73 | 73 | ||
74 | /* | ||
75 | * Stop other CPUs in panic. Architecture dependent code may override this | ||
76 | * with a more suitable version. For example, if the architecture supports | ||
77 | * crash dump, it should save registers of each stopped CPU and disable | ||
78 | * per-CPU features such as virtualization extensions. | ||
79 | */ | ||
80 | void __weak crash_smp_send_stop(void) | ||
81 | { | ||
82 | static int cpus_stopped; | ||
83 | |||
84 | /* | ||
85 | * This function can be called twice in panic path, but obviously | ||
86 | * we execute this only once. | ||
87 | */ | ||
88 | if (cpus_stopped) | ||
89 | return; | ||
90 | |||
91 | /* | ||
92 | * Note smp_send_stop is the usual smp shutdown function, which | ||
93 | * unfortunately means it may not be hardened to work in a panic | ||
94 | * situation. | ||
95 | */ | ||
96 | smp_send_stop(); | ||
97 | cpus_stopped = 1; | ||
98 | } | ||
99 | |||
74 | atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID); | 100 | atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID); |
75 | 101 | ||
76 | /* | 102 | /* |
@@ -164,14 +190,21 @@ void panic(const char *fmt, ...) | |||
164 | if (!_crash_kexec_post_notifiers) { | 190 | if (!_crash_kexec_post_notifiers) { |
165 | printk_nmi_flush_on_panic(); | 191 | printk_nmi_flush_on_panic(); |
166 | __crash_kexec(NULL); | 192 | __crash_kexec(NULL); |
167 | } | ||
168 | 193 | ||
169 | /* | 194 | /* |
170 | * Note smp_send_stop is the usual smp shutdown function, which | 195 | * Note smp_send_stop is the usual smp shutdown function, which |
171 | * unfortunately means it may not be hardened to work in a panic | 196 | * unfortunately means it may not be hardened to work in a |
172 | * situation. | 197 | * panic situation. |
173 | */ | 198 | */ |
174 | smp_send_stop(); | 199 | smp_send_stop(); |
200 | } else { | ||
201 | /* | ||
202 | * If we want to do crash dump after notifier calls and | ||
203 | * kmsg_dump, we will need architecture dependent extra | ||
204 | * works in addition to stopping other CPUs. | ||
205 | */ | ||
206 | crash_smp_send_stop(); | ||
207 | } | ||
175 | 208 | ||
176 | /* | 209 | /* |
177 | * Run any panic handlers, including those that might need to | 210 | * Run any panic handlers, including those that might need to |
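The new crash_smp_send_stop() is deliberately a __weak stub so that architectures with crash-dump support can replace it; the generic version just falls back to smp_send_stop(). A hedged sketch of what an architecture override might look like; arch_crash_stop_other_cpus() is a placeholder name, not a real kernel API:

    void crash_smp_send_stop(void)
    {
            static int cpus_stopped;

            if (cpus_stopped)
                    return;

            /*
             * Placeholder for the arch-specific part: typically an IPI or NMI
             * that makes every other CPU save its registers for the crash dump
             * and disable features such as virtualization extensions before
             * parking itself.
             */
            arch_crash_stop_other_cpus();

            cpus_stopped = 1;
    }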
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index a65ba137fd15..df9e8e9e0be7 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c | |||
@@ -79,23 +79,36 @@ static void proc_cleanup_work(struct work_struct *work) | |||
79 | /* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ | 79 | /* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ |
80 | #define MAX_PID_NS_LEVEL 32 | 80 | #define MAX_PID_NS_LEVEL 32 |
81 | 81 | ||
82 | static struct ucounts *inc_pid_namespaces(struct user_namespace *ns) | ||
83 | { | ||
84 | return inc_ucount(ns, current_euid(), UCOUNT_PID_NAMESPACES); | ||
85 | } | ||
86 | |||
87 | static void dec_pid_namespaces(struct ucounts *ucounts) | ||
88 | { | ||
89 | dec_ucount(ucounts, UCOUNT_PID_NAMESPACES); | ||
90 | } | ||
91 | |||
82 | static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns, | 92 | static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns, |
83 | struct pid_namespace *parent_pid_ns) | 93 | struct pid_namespace *parent_pid_ns) |
84 | { | 94 | { |
85 | struct pid_namespace *ns; | 95 | struct pid_namespace *ns; |
86 | unsigned int level = parent_pid_ns->level + 1; | 96 | unsigned int level = parent_pid_ns->level + 1; |
97 | struct ucounts *ucounts; | ||
87 | int i; | 98 | int i; |
88 | int err; | 99 | int err; |
89 | 100 | ||
90 | if (level > MAX_PID_NS_LEVEL) { | 101 | err = -ENOSPC; |
91 | err = -EINVAL; | 102 | if (level > MAX_PID_NS_LEVEL) |
103 | goto out; | ||
104 | ucounts = inc_pid_namespaces(user_ns); | ||
105 | if (!ucounts) | ||
92 | goto out; | 106 | goto out; |
93 | } | ||
94 | 107 | ||
95 | err = -ENOMEM; | 108 | err = -ENOMEM; |
96 | ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL); | 109 | ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL); |
97 | if (ns == NULL) | 110 | if (ns == NULL) |
98 | goto out; | 111 | goto out_dec; |
99 | 112 | ||
100 | ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); | 113 | ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); |
101 | if (!ns->pidmap[0].page) | 114 | if (!ns->pidmap[0].page) |
@@ -114,6 +127,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns | |||
114 | ns->level = level; | 127 | ns->level = level; |
115 | ns->parent = get_pid_ns(parent_pid_ns); | 128 | ns->parent = get_pid_ns(parent_pid_ns); |
116 | ns->user_ns = get_user_ns(user_ns); | 129 | ns->user_ns = get_user_ns(user_ns); |
130 | ns->ucounts = ucounts; | ||
117 | ns->nr_hashed = PIDNS_HASH_ADDING; | 131 | ns->nr_hashed = PIDNS_HASH_ADDING; |
118 | INIT_WORK(&ns->proc_work, proc_cleanup_work); | 132 | INIT_WORK(&ns->proc_work, proc_cleanup_work); |
119 | 133 | ||
@@ -129,6 +143,8 @@ out_free_map: | |||
129 | kfree(ns->pidmap[0].page); | 143 | kfree(ns->pidmap[0].page); |
130 | out_free: | 144 | out_free: |
131 | kmem_cache_free(pid_ns_cachep, ns); | 145 | kmem_cache_free(pid_ns_cachep, ns); |
146 | out_dec: | ||
147 | dec_pid_namespaces(ucounts); | ||
132 | out: | 148 | out: |
133 | return ERR_PTR(err); | 149 | return ERR_PTR(err); |
134 | } | 150 | } |
@@ -146,6 +162,7 @@ static void destroy_pid_namespace(struct pid_namespace *ns) | |||
146 | ns_free_inum(&ns->ns); | 162 | ns_free_inum(&ns->ns); |
147 | for (i = 0; i < PIDMAP_ENTRIES; i++) | 163 | for (i = 0; i < PIDMAP_ENTRIES; i++) |
148 | kfree(ns->pidmap[i].page); | 164 | kfree(ns->pidmap[i].page); |
165 | dec_pid_namespaces(ns->ucounts); | ||
149 | put_user_ns(ns->user_ns); | 166 | put_user_ns(ns->user_ns); |
150 | call_rcu(&ns->rcu, delayed_free_pidns); | 167 | call_rcu(&ns->rcu, delayed_free_pidns); |
151 | } | 168 | } |
@@ -388,12 +405,37 @@ static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns) | |||
388 | return 0; | 405 | return 0; |
389 | } | 406 | } |
390 | 407 | ||
408 | static struct ns_common *pidns_get_parent(struct ns_common *ns) | ||
409 | { | ||
410 | struct pid_namespace *active = task_active_pid_ns(current); | ||
411 | struct pid_namespace *pid_ns, *p; | ||
412 | |||
413 | /* See if the parent is in the current namespace */ | ||
414 | pid_ns = p = to_pid_ns(ns)->parent; | ||
415 | for (;;) { | ||
416 | if (!p) | ||
417 | return ERR_PTR(-EPERM); | ||
418 | if (p == active) | ||
419 | break; | ||
420 | p = p->parent; | ||
421 | } | ||
422 | |||
423 | return &get_pid_ns(pid_ns)->ns; | ||
424 | } | ||
425 | |||
426 | static struct user_namespace *pidns_owner(struct ns_common *ns) | ||
427 | { | ||
428 | return to_pid_ns(ns)->user_ns; | ||
429 | } | ||
430 | |||
391 | const struct proc_ns_operations pidns_operations = { | 431 | const struct proc_ns_operations pidns_operations = { |
392 | .name = "pid", | 432 | .name = "pid", |
393 | .type = CLONE_NEWPID, | 433 | .type = CLONE_NEWPID, |
394 | .get = pidns_get, | 434 | .get = pidns_get, |
395 | .put = pidns_put, | 435 | .put = pidns_put, |
396 | .install = pidns_install, | 436 | .install = pidns_install, |
437 | .owner = pidns_owner, | ||
438 | .get_parent = pidns_get_parent, | ||
397 | }; | 439 | }; |
398 | 440 | ||
399 | static __init int pid_namespaces_init(void) | 441 | static __init int pid_namespaces_init(void) |
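The ucount hooks above follow a strict ordering: the per-user count is charged before anything is allocated, and every later error path has to uncharge it, which is why the new out_dec label sits between out_free and out. A minimal sketch of the same charge/uncharge pairing, with a hypothetical "foo" namespace standing in for any namespace type that gets a UCOUNT_* limit:

    struct foo_namespace {
            struct ucounts *ucounts;
            /* other namespace state */
    };

    static struct foo_namespace *create_foo_ns(struct user_namespace *user_ns)
    {
            struct ucounts *ucounts;
            struct foo_namespace *ns;

            ucounts = inc_ucount(user_ns, current_euid(), UCOUNT_PID_NAMESPACES);
            if (!ucounts)
                    return ERR_PTR(-ENOSPC);        /* per-user limit hit */

            ns = kzalloc(sizeof(*ns), GFP_KERNEL);
            if (!ns) {
                    dec_ucount(ucounts, UCOUNT_PID_NAMESPACES); /* unwind the charge */
                    return ERR_PTR(-ENOMEM);
            }

            ns->ucounts = ucounts;  /* dec_ucount()ed again when the ns is destroyed */
            return ns;
    }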
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 68d3ebc12601..e8517b63eb37 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
@@ -186,7 +186,7 @@ config PM_SLEEP_DEBUG | |||
186 | 186 | ||
187 | config DPM_WATCHDOG | 187 | config DPM_WATCHDOG |
188 | bool "Device suspend/resume watchdog" | 188 | bool "Device suspend/resume watchdog" |
189 | depends on PM_DEBUG && PSTORE | 189 | depends on PM_DEBUG && PSTORE && EXPERT |
190 | ---help--- | 190 | ---help--- |
191 | Sets up a watchdog timer to capture drivers that are | 191 | Sets up a watchdog timer to capture drivers that are |
192 | locked up attempting to suspend/resume a device. | 192 | locked up attempting to suspend/resume a device. |
@@ -197,7 +197,7 @@ config DPM_WATCHDOG | |||
197 | config DPM_WATCHDOG_TIMEOUT | 197 | config DPM_WATCHDOG_TIMEOUT |
198 | int "Watchdog timeout in seconds" | 198 | int "Watchdog timeout in seconds" |
199 | range 1 120 | 199 | range 1 120 |
200 | default 60 | 200 | default 120 |
201 | depends on DPM_WATCHDOG | 201 | depends on DPM_WATCHDOG |
202 | 202 | ||
203 | config PM_TRACE | 203 | config PM_TRACE |
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 33c79b6105c5..b26dbc48c75b 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c | |||
@@ -306,8 +306,10 @@ static int create_image(int platform_mode) | |||
306 | if (error) | 306 | if (error) |
307 | printk(KERN_ERR "PM: Error %d creating hibernation image\n", | 307 | printk(KERN_ERR "PM: Error %d creating hibernation image\n", |
308 | error); | 308 | error); |
309 | if (!in_suspend) | 309 | if (!in_suspend) { |
310 | events_check_enabled = false; | 310 | events_check_enabled = false; |
311 | clear_free_pages(); | ||
312 | } | ||
311 | 313 | ||
312 | platform_leave(platform_mode); | 314 | platform_leave(platform_mode); |
313 | 315 | ||
@@ -1189,22 +1191,6 @@ static int __init nohibernate_setup(char *str) | |||
1189 | return 1; | 1191 | return 1; |
1190 | } | 1192 | } |
1191 | 1193 | ||
1192 | static int __init page_poison_nohibernate_setup(char *str) | ||
1193 | { | ||
1194 | #ifdef CONFIG_PAGE_POISONING_ZERO | ||
1195 | /* | ||
1196 | * The zeroing option for page poison skips the checks on alloc. | ||
1197 | * since hibernation doesn't save free pages there's no way to | ||
1198 | * guarantee the pages will still be zeroed. | ||
1199 | */ | ||
1200 | if (!strcmp(str, "on")) { | ||
1201 | pr_info("Disabling hibernation due to page poisoning\n"); | ||
1202 | return nohibernate_setup(str); | ||
1203 | } | ||
1204 | #endif | ||
1205 | return 1; | ||
1206 | } | ||
1207 | |||
1208 | __setup("noresume", noresume_setup); | 1194 | __setup("noresume", noresume_setup); |
1209 | __setup("resume_offset=", resume_offset_setup); | 1195 | __setup("resume_offset=", resume_offset_setup); |
1210 | __setup("resume=", resume_setup); | 1196 | __setup("resume=", resume_setup); |
@@ -1212,4 +1198,3 @@ __setup("hibernate=", hibernate_setup); | |||
1212 | __setup("resumewait", resumewait_setup); | 1198 | __setup("resumewait", resumewait_setup); |
1213 | __setup("resumedelay=", resumedelay_setup); | 1199 | __setup("resumedelay=", resumedelay_setup); |
1214 | __setup("nohibernate", nohibernate_setup); | 1200 | __setup("nohibernate", nohibernate_setup); |
1215 | __setup("page_poison=", page_poison_nohibernate_setup); | ||
diff --git a/kernel/power/main.c b/kernel/power/main.c index 5ea50b1b7595..281a697fd458 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
@@ -644,6 +644,7 @@ static int __init pm_init(void) | |||
644 | return error; | 644 | return error; |
645 | hibernate_image_size_init(); | 645 | hibernate_image_size_init(); |
646 | hibernate_reserved_size_init(); | 646 | hibernate_reserved_size_init(); |
647 | pm_states_init(); | ||
647 | power_kobj = kobject_create_and_add("power", NULL); | 648 | power_kobj = kobject_create_and_add("power", NULL); |
648 | if (!power_kobj) | 649 | if (!power_kobj) |
649 | return -ENOMEM; | 650 | return -ENOMEM; |
diff --git a/kernel/power/power.h b/kernel/power/power.h index 242d8b827dd5..56d1d0dedf76 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h | |||
@@ -110,6 +110,8 @@ extern int create_basic_memory_bitmaps(void); | |||
110 | extern void free_basic_memory_bitmaps(void); | 110 | extern void free_basic_memory_bitmaps(void); |
111 | extern int hibernate_preallocate_memory(void); | 111 | extern int hibernate_preallocate_memory(void); |
112 | 112 | ||
113 | extern void clear_free_pages(void); | ||
114 | |||
113 | /** | 115 | /** |
114 | * Auxiliary structure used for reading the snapshot image data and | 116 | * Auxiliary structure used for reading the snapshot image data and |
115 | * metadata from and writing them to the list of page backup entries | 117 | * metadata from and writing them to the list of page backup entries |
diff --git a/kernel/power/process.c b/kernel/power/process.c index 8f27d5a8adf6..2fba066e125f 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c | |||
@@ -144,23 +144,12 @@ int freeze_processes(void) | |||
144 | /* | 144 | /* |
145 | * Now that the whole userspace is frozen we need to disable | 145 | * Now that the whole userspace is frozen we need to disable |
146 | * the OOM killer to disallow any further interference with | 146 | * the OOM killer to disallow any further interference with |
147 | * killable tasks. | 147 | * killable tasks. There is no guarantee oom victims will |
148 | * ever reach a point where they go away, so we have to wait with a timeout. | ||
148 | */ | 149 | */ |
149 | if (!error && !oom_killer_disable()) | 150 | if (!error && !oom_killer_disable(msecs_to_jiffies(freeze_timeout_msecs))) |
150 | error = -EBUSY; | 151 | error = -EBUSY; |
151 | 152 | ||
152 | /* | ||
153 | * There is a hard to fix race between oom_reaper kernel thread | ||
154 | * and oom_killer_disable. oom_reaper calls exit_oom_victim | ||
155 | * before the victim reaches exit_mm so try to freeze all the tasks | ||
156 | * again and catch such a left over task. | ||
157 | */ | ||
158 | if (!error) { | ||
159 | pr_info("Double checking all user space processes after OOM killer disable... "); | ||
160 | error = try_to_freeze_tasks(true); | ||
161 | pr_cont("\n"); | ||
162 | } | ||
163 | |||
164 | if (error) | 153 | if (error) |
165 | thaw_processes(); | 154 | thaw_processes(); |
166 | return error; | 155 | return error; |
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index b02228411d57..4f0f0604f1c4 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
@@ -1132,6 +1132,28 @@ void free_basic_memory_bitmaps(void) | |||
1132 | pr_debug("PM: Basic memory bitmaps freed\n"); | 1132 | pr_debug("PM: Basic memory bitmaps freed\n"); |
1133 | } | 1133 | } |
1134 | 1134 | ||
1135 | void clear_free_pages(void) | ||
1136 | { | ||
1137 | #ifdef CONFIG_PAGE_POISONING_ZERO | ||
1138 | struct memory_bitmap *bm = free_pages_map; | ||
1139 | unsigned long pfn; | ||
1140 | |||
1141 | if (WARN_ON(!(free_pages_map))) | ||
1142 | return; | ||
1143 | |||
1144 | memory_bm_position_reset(bm); | ||
1145 | pfn = memory_bm_next_pfn(bm); | ||
1146 | while (pfn != BM_END_OF_MAP) { | ||
1147 | if (pfn_valid(pfn)) | ||
1148 | clear_highpage(pfn_to_page(pfn)); | ||
1149 | |||
1150 | pfn = memory_bm_next_pfn(bm); | ||
1151 | } | ||
1152 | memory_bm_position_reset(bm); | ||
1153 | pr_info("PM: free pages cleared after restore\n"); | ||
1154 | #endif /* PAGE_POISONING_ZERO */ | ||
1155 | } | ||
1156 | |||
1135 | /** | 1157 | /** |
1136 | * snapshot_additional_pages - Estimate the number of extra pages needed. | 1158 | * snapshot_additional_pages - Estimate the number of extra pages needed. |
1137 | * @zone: Memory zone to carry out the computation for. | 1159 | * @zone: Memory zone to carry out the computation for. |
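clear_free_pages() is what lets the page_poison=on special case removed from hibernate.c above go away: instead of refusing to hibernate, the kernel now re-zeroes every page in free_pages_map on the restore side, so the poison-on-free invariant still holds for pages that were freed before the snapshot. The walk uses the snapshot.c bitmap iterator shown above; the same idiom factored into a helper looks roughly like this (memory_bm_position_reset(), memory_bm_next_pfn() and BM_END_OF_MAP are the snapshot.c internals used in the hunk):

    static void for_each_free_pfn(struct memory_bitmap *bm,
                                  void (*fn)(unsigned long pfn))
    {
            unsigned long pfn;

            memory_bm_position_reset(bm);
            for (pfn = memory_bm_next_pfn(bm); pfn != BM_END_OF_MAP;
                 pfn = memory_bm_next_pfn(bm))
                    if (pfn_valid(pfn))
                            fn(pfn);
            memory_bm_position_reset(bm);
    }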
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 0acab9d7f96f..6ccb08f57fcb 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c | |||
@@ -118,10 +118,18 @@ static bool valid_state(suspend_state_t state) | |||
118 | */ | 118 | */ |
119 | static bool relative_states; | 119 | static bool relative_states; |
120 | 120 | ||
121 | void __init pm_states_init(void) | ||
122 | { | ||
123 | /* | ||
124 | * freeze state should be supported even without any suspend_ops, | ||
125 | * initialize pm_states accordingly here | ||
126 | */ | ||
127 | pm_states[PM_SUSPEND_FREEZE] = pm_labels[relative_states ? 0 : 2]; | ||
128 | } | ||
129 | |||
121 | static int __init sleep_states_setup(char *str) | 130 | static int __init sleep_states_setup(char *str) |
122 | { | 131 | { |
123 | relative_states = !strncmp(str, "1", 1); | 132 | relative_states = !strncmp(str, "1", 1); |
124 | pm_states[PM_SUSPEND_FREEZE] = pm_labels[relative_states ? 0 : 2]; | ||
125 | return 1; | 133 | return 1; |
126 | } | 134 | } |
127 | 135 | ||
@@ -211,7 +219,7 @@ static int platform_suspend_begin(suspend_state_t state) | |||
211 | { | 219 | { |
212 | if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin) | 220 | if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin) |
213 | return freeze_ops->begin(); | 221 | return freeze_ops->begin(); |
214 | else if (suspend_ops->begin) | 222 | else if (suspend_ops && suspend_ops->begin) |
215 | return suspend_ops->begin(state); | 223 | return suspend_ops->begin(state); |
216 | else | 224 | else |
217 | return 0; | 225 | return 0; |
@@ -221,7 +229,7 @@ static void platform_resume_end(suspend_state_t state) | |||
221 | { | 229 | { |
222 | if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end) | 230 | if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end) |
223 | freeze_ops->end(); | 231 | freeze_ops->end(); |
224 | else if (suspend_ops->end) | 232 | else if (suspend_ops && suspend_ops->end) |
225 | suspend_ops->end(); | 233 | suspend_ops->end(); |
226 | } | 234 | } |
227 | 235 | ||
@@ -490,9 +498,9 @@ static int enter_state(suspend_state_t state) | |||
490 | 498 | ||
491 | #ifndef CONFIG_SUSPEND_SKIP_SYNC | 499 | #ifndef CONFIG_SUSPEND_SKIP_SYNC |
492 | trace_suspend_resume(TPS("sync_filesystems"), 0, true); | 500 | trace_suspend_resume(TPS("sync_filesystems"), 0, true); |
493 | printk(KERN_INFO "PM: Syncing filesystems ... "); | 501 | pr_info("PM: Syncing filesystems ... "); |
494 | sys_sync(); | 502 | sys_sync(); |
495 | printk("done.\n"); | 503 | pr_cont("done.\n"); |
496 | trace_suspend_resume(TPS("sync_filesystems"), 0, false); | 504 | trace_suspend_resume(TPS("sync_filesystems"), 0, false); |
497 | #endif | 505 | #endif |
498 | 506 | ||
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index 084452e34a12..bdff5ed57f10 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c | |||
@@ -203,8 +203,10 @@ static int __init test_suspend(void) | |||
203 | 203 | ||
204 | /* RTCs have initialized by now too ... can we use one? */ | 204 | /* RTCs have initialized by now too ... can we use one? */ |
205 | dev = class_find_device(rtc_class, NULL, NULL, has_wakealarm); | 205 | dev = class_find_device(rtc_class, NULL, NULL, has_wakealarm); |
206 | if (dev) | 206 | if (dev) { |
207 | rtc = rtc_class_open(dev_name(dev)); | 207 | rtc = rtc_class_open(dev_name(dev)); |
208 | put_device(dev); | ||
209 | } | ||
208 | if (!rtc) { | 210 | if (!rtc) { |
209 | printk(warn_no_rtc); | 211 | printk(warn_no_rtc); |
210 | return 0; | 212 | return 0; |
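The suspend_test fix is a plain reference-count repair: class_find_device() returns its match with a reference held, so the lookup must be balanced with put_device() once the device name has been handed to rtc_class_open(). In isolation, the pairing reads:

    dev = class_find_device(rtc_class, NULL, NULL, has_wakealarm);
    if (dev) {
            rtc = rtc_class_open(dev_name(dev));
            put_device(dev);        /* drop the reference class_find_device() took */
    }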
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index eea6dbc2d8cf..f7a55e9ff2f7 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c | |||
@@ -655,11 +655,8 @@ static ssize_t msg_print_ext_header(char *buf, size_t size, | |||
655 | * better readable output. 'c' in the record flags mark the first | 655 | * better readable output. 'c' in the record flags mark the first |
656 | * fragment of a line, '+' the following. | 656 | * fragment of a line, '+' the following. |
657 | */ | 657 | */ |
658 | if (msg->flags & LOG_CONT && !(prev_flags & LOG_CONT)) | 658 | if (msg->flags & LOG_CONT) |
659 | cont = 'c'; | 659 | cont = (prev_flags & LOG_CONT) ? '+' : 'c'; |
660 | else if ((msg->flags & LOG_CONT) || | ||
661 | ((prev_flags & LOG_CONT) && !(msg->flags & LOG_PREFIX))) | ||
662 | cont = '+'; | ||
663 | 660 | ||
664 | return scnprintf(buf, size, "%u,%llu,%llu,%c;", | 661 | return scnprintf(buf, size, "%u,%llu,%llu,%c;", |
665 | (msg->facility << 3) | msg->level, seq, ts_usec, cont); | 662 | (msg->facility << 3) | msg->level, seq, ts_usec, cont); |
@@ -1643,35 +1640,33 @@ static struct cont { | |||
1643 | bool flushed:1; /* buffer sealed and committed */ | 1640 | bool flushed:1; /* buffer sealed and committed */ |
1644 | } cont; | 1641 | } cont; |
1645 | 1642 | ||
1646 | static void cont_flush(enum log_flags flags) | 1643 | static void cont_flush(void) |
1647 | { | 1644 | { |
1648 | if (cont.flushed) | 1645 | if (cont.flushed) |
1649 | return; | 1646 | return; |
1650 | if (cont.len == 0) | 1647 | if (cont.len == 0) |
1651 | return; | 1648 | return; |
1652 | |||
1653 | if (cont.cons) { | 1649 | if (cont.cons) { |
1654 | /* | 1650 | /* |
1655 | * If a fragment of this line was directly flushed to the | 1651 | * If a fragment of this line was directly flushed to the |
1656 | * console; wait for the console to pick up the rest of the | 1652 | * console; wait for the console to pick up the rest of the |
1657 | * line. LOG_NOCONS suppresses a duplicated output. | 1653 | * line. LOG_NOCONS suppresses a duplicated output. |
1658 | */ | 1654 | */ |
1659 | log_store(cont.facility, cont.level, flags | LOG_NOCONS, | 1655 | log_store(cont.facility, cont.level, cont.flags | LOG_NOCONS, |
1660 | cont.ts_nsec, NULL, 0, cont.buf, cont.len); | 1656 | cont.ts_nsec, NULL, 0, cont.buf, cont.len); |
1661 | cont.flags = flags; | ||
1662 | cont.flushed = true; | 1657 | cont.flushed = true; |
1663 | } else { | 1658 | } else { |
1664 | /* | 1659 | /* |
1665 | * If no fragment of this line ever reached the console, | 1660 | * If no fragment of this line ever reached the console, |
1666 | * just submit it to the store and free the buffer. | 1661 | * just submit it to the store and free the buffer. |
1667 | */ | 1662 | */ |
1668 | log_store(cont.facility, cont.level, flags, 0, | 1663 | log_store(cont.facility, cont.level, cont.flags, 0, |
1669 | NULL, 0, cont.buf, cont.len); | 1664 | NULL, 0, cont.buf, cont.len); |
1670 | cont.len = 0; | 1665 | cont.len = 0; |
1671 | } | 1666 | } |
1672 | } | 1667 | } |
1673 | 1668 | ||
1674 | static bool cont_add(int facility, int level, const char *text, size_t len) | 1669 | static bool cont_add(int facility, int level, enum log_flags flags, const char *text, size_t len) |
1675 | { | 1670 | { |
1676 | if (cont.len && cont.flushed) | 1671 | if (cont.len && cont.flushed) |
1677 | return false; | 1672 | return false; |
@@ -1682,7 +1677,7 @@ static bool cont_add(int facility, int level, const char *text, size_t len) | |||
1682 | * the line gets too long, split it up in separate records. | 1677 | * the line gets too long, split it up in separate records. |
1683 | */ | 1678 | */ |
1684 | if (nr_ext_console_drivers || cont.len + len > sizeof(cont.buf)) { | 1679 | if (nr_ext_console_drivers || cont.len + len > sizeof(cont.buf)) { |
1685 | cont_flush(LOG_CONT); | 1680 | cont_flush(); |
1686 | return false; | 1681 | return false; |
1687 | } | 1682 | } |
1688 | 1683 | ||
@@ -1691,7 +1686,7 @@ static bool cont_add(int facility, int level, const char *text, size_t len) | |||
1691 | cont.level = level; | 1686 | cont.level = level; |
1692 | cont.owner = current; | 1687 | cont.owner = current; |
1693 | cont.ts_nsec = local_clock(); | 1688 | cont.ts_nsec = local_clock(); |
1694 | cont.flags = 0; | 1689 | cont.flags = flags; |
1695 | cont.cons = 0; | 1690 | cont.cons = 0; |
1696 | cont.flushed = false; | 1691 | cont.flushed = false; |
1697 | } | 1692 | } |
@@ -1699,8 +1694,15 @@ static bool cont_add(int facility, int level, const char *text, size_t len) | |||
1699 | memcpy(cont.buf + cont.len, text, len); | 1694 | memcpy(cont.buf + cont.len, text, len); |
1700 | cont.len += len; | 1695 | cont.len += len; |
1701 | 1696 | ||
1697 | // The original flags come from the first line, | ||
1698 | // but later continuations can add a newline. | ||
1699 | if (flags & LOG_NEWLINE) { | ||
1700 | cont.flags |= LOG_NEWLINE; | ||
1701 | cont_flush(); | ||
1702 | } | ||
1703 | |||
1702 | if (cont.len > (sizeof(cont.buf) * 80) / 100) | 1704 | if (cont.len > (sizeof(cont.buf) * 80) / 100) |
1703 | cont_flush(LOG_CONT); | 1705 | cont_flush(); |
1704 | 1706 | ||
1705 | return true; | 1707 | return true; |
1706 | } | 1708 | } |
@@ -1733,6 +1735,35 @@ static size_t cont_print_text(char *text, size_t size) | |||
1733 | return textlen; | 1735 | return textlen; |
1734 | } | 1736 | } |
1735 | 1737 | ||
1738 | static size_t log_output(int facility, int level, enum log_flags lflags, const char *dict, size_t dictlen, char *text, size_t text_len) | ||
1739 | { | ||
1740 | /* | ||
1741 | * If an earlier line was buffered, and we're a continuation | ||
1742 | * write from the same process, try to add it to the buffer. | ||
1743 | */ | ||
1744 | if (cont.len) { | ||
1745 | if (cont.owner == current && (lflags & LOG_CONT)) { | ||
1746 | if (cont_add(facility, level, lflags, text, text_len)) | ||
1747 | return text_len; | ||
1748 | } | ||
1749 | /* Otherwise, make sure it's flushed */ | ||
1750 | cont_flush(); | ||
1751 | } | ||
1752 | |||
1753 | /* Skip empty continuation lines that couldn't be added - they just flush */ | ||
1754 | if (!text_len && (lflags & LOG_CONT)) | ||
1755 | return 0; | ||
1756 | |||
1757 | /* If it doesn't end in a newline, try to buffer the current line */ | ||
1758 | if (!(lflags & LOG_NEWLINE)) { | ||
1759 | if (cont_add(facility, level, lflags, text, text_len)) | ||
1760 | return text_len; | ||
1761 | } | ||
1762 | |||
1763 | /* Store it in the record log */ | ||
1764 | return log_store(facility, level, lflags, 0, dict, dictlen, text, text_len); | ||
1765 | } | ||
1766 | |||
1736 | asmlinkage int vprintk_emit(int facility, int level, | 1767 | asmlinkage int vprintk_emit(int facility, int level, |
1737 | const char *dict, size_t dictlen, | 1768 | const char *dict, size_t dictlen, |
1738 | const char *fmt, va_list args) | 1769 | const char *fmt, va_list args) |
@@ -1819,10 +1850,9 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
1819 | 1850 | ||
1820 | /* strip kernel syslog prefix and extract log level or control flags */ | 1851 | /* strip kernel syslog prefix and extract log level or control flags */ |
1821 | if (facility == 0) { | 1852 | if (facility == 0) { |
1822 | int kern_level = printk_get_level(text); | 1853 | int kern_level; |
1823 | 1854 | ||
1824 | if (kern_level) { | 1855 | while ((kern_level = printk_get_level(text)) != 0) { |
1825 | const char *end_of_header = printk_skip_level(text); | ||
1826 | switch (kern_level) { | 1856 | switch (kern_level) { |
1827 | case '0' ... '7': | 1857 | case '0' ... '7': |
1828 | if (level == LOGLEVEL_DEFAULT) | 1858 | if (level == LOGLEVEL_DEFAULT) |
@@ -1830,14 +1860,13 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
1830 | /* fallthrough */ | 1860 | /* fallthrough */ |
1831 | case 'd': /* KERN_DEFAULT */ | 1861 | case 'd': /* KERN_DEFAULT */ |
1832 | lflags |= LOG_PREFIX; | 1862 | lflags |= LOG_PREFIX; |
1863 | break; | ||
1864 | case 'c': /* KERN_CONT */ | ||
1865 | lflags |= LOG_CONT; | ||
1833 | } | 1866 | } |
1834 | /* | 1867 | |
1835 | * No need to check length here because vscnprintf | 1868 | text_len -= 2; |
1836 | * put '\0' at the end of the string. Only valid and | 1869 | text += 2; |
1837 | * newly printed level is detected. | ||
1838 | */ | ||
1839 | text_len -= end_of_header - text; | ||
1840 | text = (char *)end_of_header; | ||
1841 | } | 1870 | } |
1842 | } | 1871 | } |
1843 | 1872 | ||
@@ -1847,45 +1876,7 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
1847 | if (dict) | 1876 | if (dict) |
1848 | lflags |= LOG_PREFIX|LOG_NEWLINE; | 1877 | lflags |= LOG_PREFIX|LOG_NEWLINE; |
1849 | 1878 | ||
1850 | if (!(lflags & LOG_NEWLINE)) { | 1879 | printed_len += log_output(facility, level, lflags, dict, dictlen, text, text_len); |
1851 | /* | ||
1852 | * Flush the conflicting buffer. An earlier newline was missing, | ||
1853 | * or another task also prints continuation lines. | ||
1854 | */ | ||
1855 | if (cont.len && (lflags & LOG_PREFIX || cont.owner != current)) | ||
1856 | cont_flush(LOG_NEWLINE); | ||
1857 | |||
1858 | /* buffer line if possible, otherwise store it right away */ | ||
1859 | if (cont_add(facility, level, text, text_len)) | ||
1860 | printed_len += text_len; | ||
1861 | else | ||
1862 | printed_len += log_store(facility, level, | ||
1863 | lflags | LOG_CONT, 0, | ||
1864 | dict, dictlen, text, text_len); | ||
1865 | } else { | ||
1866 | bool stored = false; | ||
1867 | |||
1868 | /* | ||
1869 | * If an earlier newline was missing and it was the same task, | ||
1870 | * either merge it with the current buffer and flush, or if | ||
1871 | * there was a race with interrupts (prefix == true) then just | ||
1872 | * flush it out and store this line separately. | ||
1873 | * If the preceding printk was from a different task and missed | ||
1874 | * a newline, flush and append the newline. | ||
1875 | */ | ||
1876 | if (cont.len) { | ||
1877 | if (cont.owner == current && !(lflags & LOG_PREFIX)) | ||
1878 | stored = cont_add(facility, level, text, | ||
1879 | text_len); | ||
1880 | cont_flush(LOG_NEWLINE); | ||
1881 | } | ||
1882 | |||
1883 | if (stored) | ||
1884 | printed_len += text_len; | ||
1885 | else | ||
1886 | printed_len += log_store(facility, level, lflags, 0, | ||
1887 | dict, dictlen, text, text_len); | ||
1888 | } | ||
1889 | 1880 | ||
1890 | logbuf_cpu = UINT_MAX; | 1881 | logbuf_cpu = UINT_MAX; |
1891 | raw_spin_unlock(&logbuf_lock); | 1882 | raw_spin_unlock(&logbuf_lock); |
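The printk rework above moves all continuation-line policy into log_output(): an unterminated line is parked in the cont buffer, a KERN_CONT fragment from the same task is appended, and a newline (or a conflicting writer) seals the buffer into a single record. In caller terms, something like the following ends up as one line in the log, provided no other task interleaves its own output; the names are examples only:

    static void example_probe_message(int id, const char *fw_name)
    {
            pr_info("device %d: probing", id);      /* no '\n': parked in the cont buffer */
            pr_cont(" firmware %s", fw_name);       /* KERN_CONT, same task: appended */
            pr_cont(" ok\n");                       /* newline seals and stores one record */
    }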
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 1d3b7665d0be..e6474f7272ec 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
@@ -73,6 +73,8 @@ void __ptrace_unlink(struct task_struct *child) | |||
73 | { | 73 | { |
74 | BUG_ON(!child->ptrace); | 74 | BUG_ON(!child->ptrace); |
75 | 75 | ||
76 | clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); | ||
77 | |||
76 | child->parent = child->real_parent; | 78 | child->parent = child->real_parent; |
77 | list_del_init(&child->ptrace_entry); | 79 | list_del_init(&child->ptrace_entry); |
78 | 80 | ||
@@ -489,7 +491,6 @@ static int ptrace_detach(struct task_struct *child, unsigned int data) | |||
489 | 491 | ||
490 | /* Architecture-specific hardware disable .. */ | 492 | /* Architecture-specific hardware disable .. */ |
491 | ptrace_disable(child); | 493 | ptrace_disable(child); |
492 | clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); | ||
493 | 494 | ||
494 | write_lock_irq(&tasklist_lock); | 495 | write_lock_irq(&tasklist_lock); |
495 | /* | 496 | /* |
@@ -536,7 +537,7 @@ int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst | |||
536 | int this_len, retval; | 537 | int this_len, retval; |
537 | 538 | ||
538 | this_len = (len > sizeof(buf)) ? sizeof(buf) : len; | 539 | this_len = (len > sizeof(buf)) ? sizeof(buf) : len; |
539 | retval = access_process_vm(tsk, src, buf, this_len, 0); | 540 | retval = access_process_vm(tsk, src, buf, this_len, FOLL_FORCE); |
540 | if (!retval) { | 541 | if (!retval) { |
541 | if (copied) | 542 | if (copied) |
542 | break; | 543 | break; |
@@ -563,7 +564,8 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds | |||
563 | this_len = (len > sizeof(buf)) ? sizeof(buf) : len; | 564 | this_len = (len > sizeof(buf)) ? sizeof(buf) : len; |
564 | if (copy_from_user(buf, src, this_len)) | 565 | if (copy_from_user(buf, src, this_len)) |
565 | return -EFAULT; | 566 | return -EFAULT; |
566 | retval = access_process_vm(tsk, dst, buf, this_len, 1); | 567 | retval = access_process_vm(tsk, dst, buf, this_len, |
568 | FOLL_FORCE | FOLL_WRITE); | ||
567 | if (!retval) { | 569 | if (!retval) { |
568 | if (copied) | 570 | if (copied) |
569 | break; | 571 | break; |
@@ -1126,7 +1128,7 @@ int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr, | |||
1126 | unsigned long tmp; | 1128 | unsigned long tmp; |
1127 | int copied; | 1129 | int copied; |
1128 | 1130 | ||
1129 | copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), 0); | 1131 | copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), FOLL_FORCE); |
1130 | if (copied != sizeof(tmp)) | 1132 | if (copied != sizeof(tmp)) |
1131 | return -EIO; | 1133 | return -EIO; |
1132 | return put_user(tmp, (unsigned long __user *)data); | 1134 | return put_user(tmp, (unsigned long __user *)data); |
@@ -1137,7 +1139,8 @@ int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr, | |||
1137 | { | 1139 | { |
1138 | int copied; | 1140 | int copied; |
1139 | 1141 | ||
1140 | copied = access_process_vm(tsk, addr, &data, sizeof(data), 1); | 1142 | copied = access_process_vm(tsk, addr, &data, sizeof(data), |
1143 | FOLL_FORCE | FOLL_WRITE); | ||
1141 | return (copied == sizeof(data)) ? 0 : -EIO; | 1144 | return (copied == sizeof(data)) ? 0 : -EIO; |
1142 | } | 1145 | } |
1143 | 1146 | ||
@@ -1154,7 +1157,8 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, | |||
1154 | switch (request) { | 1157 | switch (request) { |
1155 | case PTRACE_PEEKTEXT: | 1158 | case PTRACE_PEEKTEXT: |
1156 | case PTRACE_PEEKDATA: | 1159 | case PTRACE_PEEKDATA: |
1157 | ret = access_process_vm(child, addr, &word, sizeof(word), 0); | 1160 | ret = access_process_vm(child, addr, &word, sizeof(word), |
1161 | FOLL_FORCE); | ||
1158 | if (ret != sizeof(word)) | 1162 | if (ret != sizeof(word)) |
1159 | ret = -EIO; | 1163 | ret = -EIO; |
1160 | else | 1164 | else |
@@ -1163,7 +1167,8 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, | |||
1163 | 1167 | ||
1164 | case PTRACE_POKETEXT: | 1168 | case PTRACE_POKETEXT: |
1165 | case PTRACE_POKEDATA: | 1169 | case PTRACE_POKEDATA: |
1166 | ret = access_process_vm(child, addr, &data, sizeof(data), 1); | 1170 | ret = access_process_vm(child, addr, &data, sizeof(data), |
1171 | FOLL_FORCE | FOLL_WRITE); | ||
1167 | ret = (ret != sizeof(data) ? -EIO : 0); | 1172 | ret = (ret != sizeof(data) ? -EIO : 0); |
1168 | break; | 1173 | break; |
1169 | 1174 | ||
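All of the ptrace hunks above are the same mechanical change: access_process_vm() now takes a gup_flags bitmask as its last argument instead of a 0/1 write flag, with FOLL_FORCE preserving ptrace's traditional ability to override page protections and FOLL_WRITE marking write access. The resulting calling convention, condensed from generic_ptrace_peekdata()/pokedata() above:

    static int peek_word(struct task_struct *tsk, unsigned long addr,
                         unsigned long __user *datap)
    {
            unsigned long tmp;

            if (access_process_vm(tsk, addr, &tmp, sizeof(tmp),
                                  FOLL_FORCE) != sizeof(tmp))
                    return -EIO;
            return put_user(tmp, datap);
    }

    static int poke_word(struct task_struct *tsk, unsigned long addr,
                         unsigned long data)
    {
            if (access_process_vm(tsk, addr, &data, sizeof(data),
                                  FOLL_FORCE | FOLL_WRITE) != sizeof(data))
                    return -EIO;
            return 0;
    }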
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index d38ab08a3fe7..123ccbd22449 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c | |||
@@ -52,7 +52,7 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>"); | |||
52 | 52 | ||
53 | #define PERF_FLAG "-perf:" | 53 | #define PERF_FLAG "-perf:" |
54 | #define PERFOUT_STRING(s) \ | 54 | #define PERFOUT_STRING(s) \ |
55 | pr_alert("%s" PERF_FLAG s "\n", perf_type) | 55 | pr_alert("%s" PERF_FLAG " %s\n", perf_type, s) |
56 | #define VERBOSE_PERFOUT_STRING(s) \ | 56 | #define VERBOSE_PERFOUT_STRING(s) \ |
57 | do { if (verbose) pr_alert("%s" PERF_FLAG " %s\n", perf_type, s); } while (0) | 57 | do { if (verbose) pr_alert("%s" PERF_FLAG " %s\n", perf_type, s); } while (0) |
58 | #define VERBOSE_PERFOUT_ERRSTRING(s) \ | 58 | #define VERBOSE_PERFOUT_ERRSTRING(s) \ |
@@ -400,9 +400,8 @@ rcu_perf_writer(void *arg) | |||
400 | sp.sched_priority = 0; | 400 | sp.sched_priority = 0; |
401 | sched_setscheduler_nocheck(current, | 401 | sched_setscheduler_nocheck(current, |
402 | SCHED_NORMAL, &sp); | 402 | SCHED_NORMAL, &sp); |
403 | pr_alert("%s" PERF_FLAG | 403 | pr_alert("%s%s rcu_perf_writer %ld has %d measurements\n", |
404 | "rcu_perf_writer %ld has %d measurements\n", | 404 | perf_type, PERF_FLAG, me, MIN_MEAS); |
405 | perf_type, me, MIN_MEAS); | ||
406 | if (atomic_inc_return(&n_rcu_perf_writer_finished) >= | 405 | if (atomic_inc_return(&n_rcu_perf_writer_finished) >= |
407 | nrealwriters) { | 406 | nrealwriters) { |
408 | schedule_timeout_interruptible(10); | 407 | schedule_timeout_interruptible(10); |
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 971e2b138063..bf08fee53dc7 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c | |||
@@ -1238,6 +1238,7 @@ rcu_torture_stats_print(void) | |||
1238 | long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; | 1238 | long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; |
1239 | long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; | 1239 | long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; |
1240 | static unsigned long rtcv_snap = ULONG_MAX; | 1240 | static unsigned long rtcv_snap = ULONG_MAX; |
1241 | struct task_struct *wtp; | ||
1241 | 1242 | ||
1242 | for_each_possible_cpu(cpu) { | 1243 | for_each_possible_cpu(cpu) { |
1243 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { | 1244 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { |
@@ -1258,8 +1259,9 @@ rcu_torture_stats_print(void) | |||
1258 | atomic_read(&n_rcu_torture_alloc), | 1259 | atomic_read(&n_rcu_torture_alloc), |
1259 | atomic_read(&n_rcu_torture_alloc_fail), | 1260 | atomic_read(&n_rcu_torture_alloc_fail), |
1260 | atomic_read(&n_rcu_torture_free)); | 1261 | atomic_read(&n_rcu_torture_free)); |
1261 | pr_cont("rtmbe: %d rtbke: %ld rtbre: %ld ", | 1262 | pr_cont("rtmbe: %d rtbe: %ld rtbke: %ld rtbre: %ld ", |
1262 | atomic_read(&n_rcu_torture_mberror), | 1263 | atomic_read(&n_rcu_torture_mberror), |
1264 | n_rcu_torture_barrier_error, | ||
1263 | n_rcu_torture_boost_ktrerror, | 1265 | n_rcu_torture_boost_ktrerror, |
1264 | n_rcu_torture_boost_rterror); | 1266 | n_rcu_torture_boost_rterror); |
1265 | pr_cont("rtbf: %ld rtb: %ld nt: %ld ", | 1267 | pr_cont("rtbf: %ld rtb: %ld nt: %ld ", |
@@ -1312,10 +1314,12 @@ rcu_torture_stats_print(void) | |||
1312 | 1314 | ||
1313 | rcutorture_get_gp_data(cur_ops->ttype, | 1315 | rcutorture_get_gp_data(cur_ops->ttype, |
1314 | &flags, &gpnum, &completed); | 1316 | &flags, &gpnum, &completed); |
1315 | pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x\n", | 1317 | wtp = READ_ONCE(writer_task); |
1318 | pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx\n", | ||
1316 | rcu_torture_writer_state_getname(), | 1319 | rcu_torture_writer_state_getname(), |
1317 | rcu_torture_writer_state, | 1320 | rcu_torture_writer_state, |
1318 | gpnum, completed, flags); | 1321 | gpnum, completed, flags, |
1322 | wtp == NULL ? ~0UL : wtp->state); | ||
1319 | show_rcu_gp_kthreads(); | 1323 | show_rcu_gp_kthreads(); |
1320 | rcu_ftrace_dump(DUMP_ALL); | 1324 | rcu_ftrace_dump(DUMP_ALL); |
1321 | } | 1325 | } |
@@ -1362,12 +1366,12 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) | |||
1362 | onoff_interval, onoff_holdoff); | 1366 | onoff_interval, onoff_holdoff); |
1363 | } | 1367 | } |
1364 | 1368 | ||
1365 | static void rcutorture_booster_cleanup(int cpu) | 1369 | static int rcutorture_booster_cleanup(unsigned int cpu) |
1366 | { | 1370 | { |
1367 | struct task_struct *t; | 1371 | struct task_struct *t; |
1368 | 1372 | ||
1369 | if (boost_tasks[cpu] == NULL) | 1373 | if (boost_tasks[cpu] == NULL) |
1370 | return; | 1374 | return 0; |
1371 | mutex_lock(&boost_mutex); | 1375 | mutex_lock(&boost_mutex); |
1372 | t = boost_tasks[cpu]; | 1376 | t = boost_tasks[cpu]; |
1373 | boost_tasks[cpu] = NULL; | 1377 | boost_tasks[cpu] = NULL; |
@@ -1375,9 +1379,10 @@ static void rcutorture_booster_cleanup(int cpu) | |||
1375 | 1379 | ||
1376 | /* This must be outside of the mutex, otherwise deadlock! */ | 1380 | /* This must be outside of the mutex, otherwise deadlock! */ |
1377 | torture_stop_kthread(rcu_torture_boost, t); | 1381 | torture_stop_kthread(rcu_torture_boost, t); |
1382 | return 0; | ||
1378 | } | 1383 | } |
1379 | 1384 | ||
1380 | static int rcutorture_booster_init(int cpu) | 1385 | static int rcutorture_booster_init(unsigned int cpu) |
1381 | { | 1386 | { |
1382 | int retval; | 1387 | int retval; |
1383 | 1388 | ||
@@ -1577,28 +1582,7 @@ static void rcu_torture_barrier_cleanup(void) | |||
1577 | } | 1582 | } |
1578 | } | 1583 | } |
1579 | 1584 | ||
1580 | static int rcutorture_cpu_notify(struct notifier_block *self, | 1585 | static enum cpuhp_state rcutor_hp; |
1581 | unsigned long action, void *hcpu) | ||
1582 | { | ||
1583 | long cpu = (long)hcpu; | ||
1584 | |||
1585 | switch (action & ~CPU_TASKS_FROZEN) { | ||
1586 | case CPU_ONLINE: | ||
1587 | case CPU_DOWN_FAILED: | ||
1588 | (void)rcutorture_booster_init(cpu); | ||
1589 | break; | ||
1590 | case CPU_DOWN_PREPARE: | ||
1591 | rcutorture_booster_cleanup(cpu); | ||
1592 | break; | ||
1593 | default: | ||
1594 | break; | ||
1595 | } | ||
1596 | return NOTIFY_OK; | ||
1597 | } | ||
1598 | |||
1599 | static struct notifier_block rcutorture_cpu_nb = { | ||
1600 | .notifier_call = rcutorture_cpu_notify, | ||
1601 | }; | ||
1602 | 1586 | ||
1603 | static void | 1587 | static void |
1604 | rcu_torture_cleanup(void) | 1588 | rcu_torture_cleanup(void) |
@@ -1638,11 +1622,8 @@ rcu_torture_cleanup(void) | |||
1638 | for (i = 0; i < ncbflooders; i++) | 1622 | for (i = 0; i < ncbflooders; i++) |
1639 | torture_stop_kthread(rcu_torture_cbflood, cbflood_task[i]); | 1623 | torture_stop_kthread(rcu_torture_cbflood, cbflood_task[i]); |
1640 | if ((test_boost == 1 && cur_ops->can_boost) || | 1624 | if ((test_boost == 1 && cur_ops->can_boost) || |
1641 | test_boost == 2) { | 1625 | test_boost == 2) |
1642 | unregister_cpu_notifier(&rcutorture_cpu_nb); | 1626 | cpuhp_remove_state(rcutor_hp); |
1643 | for_each_possible_cpu(i) | ||
1644 | rcutorture_booster_cleanup(i); | ||
1645 | } | ||
1646 | 1627 | ||
1647 | /* | 1628 | /* |
1648 | * Wait for all RCU callbacks to fire, then do flavor-specific | 1629 | * Wait for all RCU callbacks to fire, then do flavor-specific |
@@ -1869,14 +1850,13 @@ rcu_torture_init(void) | |||
1869 | test_boost == 2) { | 1850 | test_boost == 2) { |
1870 | 1851 | ||
1871 | boost_starttime = jiffies + test_boost_interval * HZ; | 1852 | boost_starttime = jiffies + test_boost_interval * HZ; |
1872 | register_cpu_notifier(&rcutorture_cpu_nb); | 1853 | |
1873 | for_each_possible_cpu(i) { | 1854 | firsterr = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "RCU_TORTURE", |
1874 | if (cpu_is_offline(i)) | 1855 | rcutorture_booster_init, |
1875 | continue; /* Heuristic: CPU can go offline. */ | 1856 | rcutorture_booster_cleanup); |
1876 | firsterr = rcutorture_booster_init(i); | 1857 | if (firsterr < 0) |
1877 | if (firsterr) | 1858 | goto unwind; |
1878 | goto unwind; | 1859 | rcutor_hp = firsterr; |
1879 | } | ||
1880 | } | 1860 | } |
1881 | firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup); | 1861 | firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup); |
1882 | if (firsterr) | 1862 | if (firsterr) |
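
The rcutorture hunks above replace the old CPU-notifier registration with a dynamically allocated hotplug state. A minimal sketch of that pattern follows, assuming a throwaway module; the demo_* names are invented for illustration and are not part of this patch.

#include <linux/cpuhotplug.h>
#include <linux/module.h>

static enum cpuhp_state demo_hp_state;

static int demo_cpu_online(unsigned int cpu)
{
        pr_info("demo: CPU %u coming online\n", cpu);
        return 0;                       /* nonzero aborts the bring-up */
}

static int demo_cpu_offline(unsigned int cpu)
{
        pr_info("demo: CPU %u going offline\n", cpu);
        return 0;
}

static int __init demo_init(void)
{
        int ret;

        /* Also invokes demo_cpu_online() for CPUs that are already up. */
        ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "demo:online",
                                demo_cpu_online, demo_cpu_offline);
        if (ret < 0)
                return ret;
        demo_hp_state = ret;            /* dynamic state, needed for removal */
        return 0;
}

static void __exit demo_exit(void)
{
        cpuhp_remove_state(demo_hp_state);  /* runs the teardown on online CPUs */
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

As in the rcutorture conversion, the value returned by cpuhp_setup_state() for a dynamic state is remembered so a single cpuhp_remove_state() can later undo the per-CPU setup, replacing the hand-rolled for_each_possible_cpu() cleanup loop.
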
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index be922c9f3d37..50d1861f7759 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c | |||
@@ -68,6 +68,8 @@ void rcu_sync_lockdep_assert(struct rcu_sync *rsp) | |||
68 | RCU_LOCKDEP_WARN(!gp_ops[rsp->gp_type].held(), | 68 | RCU_LOCKDEP_WARN(!gp_ops[rsp->gp_type].held(), |
69 | "suspicious rcu_sync_is_idle() usage"); | 69 | "suspicious rcu_sync_is_idle() usage"); |
70 | } | 70 | } |
71 | |||
72 | EXPORT_SYMBOL_GPL(rcu_sync_lockdep_assert); | ||
71 | #endif | 73 | #endif |
72 | 74 | ||
73 | /** | 75 | /** |
@@ -83,6 +85,18 @@ void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type) | |||
83 | } | 85 | } |
84 | 86 | ||
85 | /** | 87 | /** |
88 | * Must be called after rcu_sync_init() and before first use. | ||
89 | * | ||
90 | * Ensures rcu_sync_is_idle() returns false and rcu_sync_{enter,exit}() | ||
91 | * pairs turn into NO-OPs. | ||
92 | */ | ||
93 | void rcu_sync_enter_start(struct rcu_sync *rsp) | ||
94 | { | ||
95 | rsp->gp_count++; | ||
96 | rsp->gp_state = GP_PASSED; | ||
97 | } | ||
98 | |||
99 | /** | ||
86 | * rcu_sync_enter() - Force readers onto slowpath | 100 | * rcu_sync_enter() - Force readers onto slowpath |
87 | * @rsp: Pointer to rcu_sync structure to use for synchronization | 101 | * @rsp: Pointer to rcu_sync structure to use for synchronization |
88 | * | 102 | * |
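
rcu_sync_enter_start() lets a subsystem mark an rcu_sync structure as already write-held before any readers can run, without paying for a grace period. A hedged sketch of the intended usage follows, assuming the two-argument rcu_sync_init() used in this tree; the demo_* names are invented.

#include <linux/rcu_sync.h>

static struct rcu_sync demo_rss;

static void __init demo_early_setup(void)
{
        rcu_sync_init(&demo_rss, RCU_SYNC);
        /* Nothing can be in a read-side section yet, so no GP is needed. */
        rcu_sync_enter_start(&demo_rss);
}

static void demo_reenable_fast_path(void)
{
        /* Balances the _start() above once readers may run again. */
        rcu_sync_exit(&demo_rss);
}
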
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 944b1b491ed8..1898559e6b60 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c | |||
@@ -170,7 +170,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) | |||
170 | false)); | 170 | false)); |
171 | } | 171 | } |
172 | 172 | ||
173 | static void rcu_process_callbacks(struct softirq_action *unused) | 173 | static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) |
174 | { | 174 | { |
175 | __rcu_process_callbacks(&rcu_sched_ctrlblk); | 175 | __rcu_process_callbacks(&rcu_sched_ctrlblk); |
176 | __rcu_process_callbacks(&rcu_bh_ctrlblk); | 176 | __rcu_process_callbacks(&rcu_bh_ctrlblk); |
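
For reference, __latent_entropy (also applied to the tree variant further down) only has an effect when the latent_entropy GCC plugin is built in; otherwise it expands to nothing. A condensed sketch of the assumed fallback definition, roughly as it appears in the compiler headers:

/* Condensed sketch; the real definitions live in include/linux/compiler*.h. */
#ifdef LATENT_ENTROPY_PLUGIN
#define __latent_entropy __attribute__((latent_entropy))
#else
#define __latent_entropy
#endif
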
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5d80925e7fc8..69a5611a7e7c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c | |||
@@ -41,7 +41,6 @@ | |||
41 | #include <linux/export.h> | 41 | #include <linux/export.h> |
42 | #include <linux/completion.h> | 42 | #include <linux/completion.h> |
43 | #include <linux/moduleparam.h> | 43 | #include <linux/moduleparam.h> |
44 | #include <linux/module.h> | ||
45 | #include <linux/percpu.h> | 44 | #include <linux/percpu.h> |
46 | #include <linux/notifier.h> | 45 | #include <linux/notifier.h> |
47 | #include <linux/cpu.h> | 46 | #include <linux/cpu.h> |
@@ -60,7 +59,6 @@ | |||
60 | #include "tree.h" | 59 | #include "tree.h" |
61 | #include "rcu.h" | 60 | #include "rcu.h" |
62 | 61 | ||
63 | MODULE_ALIAS("rcutree"); | ||
64 | #ifdef MODULE_PARAM_PREFIX | 62 | #ifdef MODULE_PARAM_PREFIX |
65 | #undef MODULE_PARAM_PREFIX | 63 | #undef MODULE_PARAM_PREFIX |
66 | #endif | 64 | #endif |
@@ -1848,6 +1846,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, | |||
1848 | struct rcu_data *rdp) | 1846 | struct rcu_data *rdp) |
1849 | { | 1847 | { |
1850 | bool ret; | 1848 | bool ret; |
1849 | bool need_gp; | ||
1851 | 1850 | ||
1852 | /* Handle the ends of any preceding grace periods first. */ | 1851 | /* Handle the ends of any preceding grace periods first. */ |
1853 | if (rdp->completed == rnp->completed && | 1852 | if (rdp->completed == rnp->completed && |
@@ -1874,9 +1873,10 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, | |||
1874 | */ | 1873 | */ |
1875 | rdp->gpnum = rnp->gpnum; | 1874 | rdp->gpnum = rnp->gpnum; |
1876 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); | 1875 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); |
1877 | rdp->cpu_no_qs.b.norm = true; | 1876 | need_gp = !!(rnp->qsmask & rdp->grpmask); |
1877 | rdp->cpu_no_qs.b.norm = need_gp; | ||
1878 | rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); | 1878 | rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); |
1879 | rdp->core_needs_qs = !!(rnp->qsmask & rdp->grpmask); | 1879 | rdp->core_needs_qs = need_gp; |
1880 | zero_cpu_stall_ticks(rdp); | 1880 | zero_cpu_stall_ticks(rdp); |
1881 | WRITE_ONCE(rdp->gpwrap, false); | 1881 | WRITE_ONCE(rdp->gpwrap, false); |
1882 | } | 1882 | } |
@@ -2344,7 +2344,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) | |||
2344 | WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); | 2344 | WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); |
2345 | WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); | 2345 | WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); |
2346 | raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags); | 2346 | raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags); |
2347 | swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */ | 2347 | rcu_gp_kthread_wake(rsp); |
2348 | } | 2348 | } |
2349 | 2349 | ||
2350 | /* | 2350 | /* |
@@ -2970,7 +2970,7 @@ static void force_quiescent_state(struct rcu_state *rsp) | |||
2970 | } | 2970 | } |
2971 | WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); | 2971 | WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); |
2972 | raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags); | 2972 | raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags); |
2973 | swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */ | 2973 | rcu_gp_kthread_wake(rsp); |
2974 | } | 2974 | } |
2975 | 2975 | ||
2976 | /* | 2976 | /* |
@@ -3013,7 +3013,7 @@ __rcu_process_callbacks(struct rcu_state *rsp) | |||
3013 | /* | 3013 | /* |
3014 | * Do RCU core processing for the current CPU. | 3014 | * Do RCU core processing for the current CPU. |
3015 | */ | 3015 | */ |
3016 | static void rcu_process_callbacks(struct softirq_action *unused) | 3016 | static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) |
3017 | { | 3017 | { |
3018 | struct rcu_state *rsp; | 3018 | struct rcu_state *rsp; |
3019 | 3019 | ||
@@ -3792,8 +3792,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) | |||
3792 | rnp = rdp->mynode; | 3792 | rnp = rdp->mynode; |
3793 | mask = rdp->grpmask; | 3793 | mask = rdp->grpmask; |
3794 | raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ | 3794 | raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ |
3795 | rnp->qsmaskinitnext |= mask; | ||
3796 | rnp->expmaskinitnext |= mask; | ||
3797 | if (!rdp->beenonline) | 3795 | if (!rdp->beenonline) |
3798 | WRITE_ONCE(rsp->ncpus, READ_ONCE(rsp->ncpus) + 1); | 3796 | WRITE_ONCE(rsp->ncpus, READ_ONCE(rsp->ncpus) + 1); |
3799 | rdp->beenonline = true; /* We have now been online. */ | 3797 | rdp->beenonline = true; /* We have now been online. */ |
@@ -3860,6 +3858,32 @@ int rcutree_dead_cpu(unsigned int cpu) | |||
3860 | return 0; | 3858 | return 0; |
3861 | } | 3859 | } |
3862 | 3860 | ||
3861 | /* | ||
3862 | * Mark the specified CPU as being online so that subsequent grace periods | ||
3863 | * (both expedited and normal) will wait on it. Note that this means that | ||
3864 | * incoming CPUs are not allowed to use RCU read-side critical sections | ||
3865 | * until this function is called. Failing to observe this restriction | ||
3866 | * will result in lockdep splats. | ||
3867 | */ | ||
3868 | void rcu_cpu_starting(unsigned int cpu) | ||
3869 | { | ||
3870 | unsigned long flags; | ||
3871 | unsigned long mask; | ||
3872 | struct rcu_data *rdp; | ||
3873 | struct rcu_node *rnp; | ||
3874 | struct rcu_state *rsp; | ||
3875 | |||
3876 | for_each_rcu_flavor(rsp) { | ||
3877 | rdp = this_cpu_ptr(rsp->rda); | ||
3878 | rnp = rdp->mynode; | ||
3879 | mask = rdp->grpmask; | ||
3880 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | ||
3881 | rnp->qsmaskinitnext |= mask; | ||
3882 | rnp->expmaskinitnext |= mask; | ||
3883 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | ||
3884 | } | ||
3885 | } | ||
3886 | |||
3863 | #ifdef CONFIG_HOTPLUG_CPU | 3887 | #ifdef CONFIG_HOTPLUG_CPU |
3864 | /* | 3888 | /* |
3865 | * The CPU is exiting the idle loop into the arch_cpu_idle_dead() | 3889 | * The CPU is exiting the idle loop into the arch_cpu_idle_dead() |
@@ -4209,8 +4233,10 @@ void __init rcu_init(void) | |||
4209 | * or the scheduler are operational. | 4233 | * or the scheduler are operational. |
4210 | */ | 4234 | */ |
4211 | pm_notifier(rcu_pm_notify, 0); | 4235 | pm_notifier(rcu_pm_notify, 0); |
4212 | for_each_online_cpu(cpu) | 4236 | for_each_online_cpu(cpu) { |
4213 | rcutree_prepare_cpu(cpu); | 4237 | rcutree_prepare_cpu(cpu); |
4238 | rcu_cpu_starting(cpu); | ||
4239 | } | ||
4214 | } | 4240 | } |
4215 | 4241 | ||
4216 | #include "tree_exp.h" | 4242 | #include "tree_exp.h" |
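
The new rcu_cpu_starting() hook moves the qsmaskinitnext/expmaskinitnext updates out of rcutree_prepare_cpu() and onto the incoming CPU's own bring-up path, so grace periods wait on the CPU before it enters its first read-side critical section. A hedged sketch of a caller; demo_secondary_start() is invented, and the actual call site in this series is the arch/hotplug bring-up code.

#include <linux/rcupdate.h>

/* Runs early on the incoming CPU, typically with interrupts still disabled. */
static void demo_secondary_start(unsigned int cpu)
{
        /* ... low-level per-CPU setup ... */

        rcu_cpu_starting(cpu);  /* grace periods now wait on this CPU */

        rcu_read_lock();        /* only legal after the call above */
        /* ... walk RCU-protected bring-up data ... */
        rcu_read_unlock();
}
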
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index f714f873bf9d..e99a5234d9ed 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h | |||
@@ -400,6 +400,7 @@ struct rcu_data { | |||
400 | #ifdef CONFIG_RCU_FAST_NO_HZ | 400 | #ifdef CONFIG_RCU_FAST_NO_HZ |
401 | struct rcu_head oom_head; | 401 | struct rcu_head oom_head; |
402 | #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ | 402 | #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ |
403 | atomic_long_t exp_workdone0; /* # done by workqueue. */ | ||
403 | atomic_long_t exp_workdone1; /* # done by others #1. */ | 404 | atomic_long_t exp_workdone1; /* # done by others #1. */ |
404 | atomic_long_t exp_workdone2; /* # done by others #2. */ | 405 | atomic_long_t exp_workdone2; /* # done by others #2. */ |
405 | atomic_long_t exp_workdone3; /* # done by others #3. */ | 406 | atomic_long_t exp_workdone3; /* # done by others #3. */ |
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 6d86ab6ec2c9..24343eb87b58 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h | |||
@@ -359,7 +359,8 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, | |||
359 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); | 359 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); |
360 | 360 | ||
361 | if (raw_smp_processor_id() == cpu || | 361 | if (raw_smp_processor_id() == cpu || |
362 | !(atomic_add_return(0, &rdtp->dynticks) & 0x1)) | 362 | !(atomic_add_return(0, &rdtp->dynticks) & 0x1) || |
363 | !(rnp->qsmaskinitnext & rdp->grpmask)) | ||
363 | mask_ofl_test |= rdp->grpmask; | 364 | mask_ofl_test |= rdp->grpmask; |
364 | } | 365 | } |
365 | mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; | 366 | mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; |
@@ -384,17 +385,16 @@ retry_ipi: | |||
384 | mask_ofl_ipi &= ~mask; | 385 | mask_ofl_ipi &= ~mask; |
385 | continue; | 386 | continue; |
386 | } | 387 | } |
387 | /* Failed, raced with offline. */ | 388 | /* Failed, raced with CPU hotplug operation. */ |
388 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | 389 | raw_spin_lock_irqsave_rcu_node(rnp, flags); |
389 | if (cpu_online(cpu) && | 390 | if ((rnp->qsmaskinitnext & mask) && |
390 | (rnp->expmask & mask)) { | 391 | (rnp->expmask & mask)) { |
392 | /* Online, so delay for a bit and try again. */ | ||
391 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | 393 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); |
392 | schedule_timeout_uninterruptible(1); | 394 | schedule_timeout_uninterruptible(1); |
393 | if (cpu_online(cpu) && | 395 | goto retry_ipi; |
394 | (rnp->expmask & mask)) | ||
395 | goto retry_ipi; | ||
396 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | ||
397 | } | 396 | } |
397 | /* CPU really is offline, so we can ignore it. */ | ||
398 | if (!(rnp->expmask & mask)) | 398 | if (!(rnp->expmask & mask)) |
399 | mask_ofl_ipi &= ~mask; | 399 | mask_ofl_ipi &= ~mask; |
400 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | 400 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); |
@@ -427,12 +427,10 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) | |||
427 | jiffies_stall); | 427 | jiffies_stall); |
428 | if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root)) | 428 | if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root)) |
429 | return; | 429 | return; |
430 | if (ret < 0) { | 430 | WARN_ON(ret < 0); /* workqueues should not be signaled. */ |
431 | /* Hit a signal, disable CPU stall warnings. */ | 431 | if (rcu_cpu_stall_suppress) |
432 | swait_event(rsp->expedited_wq, | 432 | continue; |
433 | sync_rcu_preempt_exp_done(rnp_root)); | 433 | panic_on_rcu_stall(); |
434 | return; | ||
435 | } | ||
436 | pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", | 434 | pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", |
437 | rsp->name); | 435 | rsp->name); |
438 | ndetected = 0; | 436 | ndetected = 0; |
@@ -500,7 +498,6 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) | |||
500 | * next GP, to proceed. | 498 | * next GP, to proceed. |
501 | */ | 499 | */ |
502 | mutex_lock(&rsp->exp_wake_mutex); | 500 | mutex_lock(&rsp->exp_wake_mutex); |
503 | mutex_unlock(&rsp->exp_mutex); | ||
504 | 501 | ||
505 | rcu_for_each_node_breadth_first(rsp, rnp) { | 502 | rcu_for_each_node_breadth_first(rsp, rnp) { |
506 | if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) { | 503 | if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) { |
@@ -516,6 +513,70 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) | |||
516 | mutex_unlock(&rsp->exp_wake_mutex); | 513 | mutex_unlock(&rsp->exp_wake_mutex); |
517 | } | 514 | } |
518 | 515 | ||
516 | /* Let the workqueue handler know what it is supposed to do. */ | ||
517 | struct rcu_exp_work { | ||
518 | smp_call_func_t rew_func; | ||
519 | struct rcu_state *rew_rsp; | ||
520 | unsigned long rew_s; | ||
521 | struct work_struct rew_work; | ||
522 | }; | ||
523 | |||
524 | /* | ||
525 | * Work-queue handler to drive an expedited grace period forward. | ||
526 | */ | ||
527 | static void wait_rcu_exp_gp(struct work_struct *wp) | ||
528 | { | ||
529 | struct rcu_exp_work *rewp; | ||
530 | |||
531 | /* Initialize the rcu_node tree in preparation for the wait. */ | ||
532 | rewp = container_of(wp, struct rcu_exp_work, rew_work); | ||
533 | sync_rcu_exp_select_cpus(rewp->rew_rsp, rewp->rew_func); | ||
534 | |||
535 | /* Wait and clean up, including waking everyone. */ | ||
536 | rcu_exp_wait_wake(rewp->rew_rsp, rewp->rew_s); | ||
537 | } | ||
538 | |||
539 | /* | ||
540 | * Given an rcu_state pointer and a smp_call_function() handler, kick | ||
541 | * off the specified flavor of expedited grace period. | ||
542 | */ | ||
543 | static void _synchronize_rcu_expedited(struct rcu_state *rsp, | ||
544 | smp_call_func_t func) | ||
545 | { | ||
546 | struct rcu_data *rdp; | ||
547 | struct rcu_exp_work rew; | ||
548 | struct rcu_node *rnp; | ||
549 | unsigned long s; | ||
550 | |||
551 | /* If expedited grace periods are prohibited, fall back to normal. */ | ||
552 | if (rcu_gp_is_normal()) { | ||
553 | wait_rcu_gp(rsp->call); | ||
554 | return; | ||
555 | } | ||
556 | |||
557 | /* Take a snapshot of the sequence number. */ | ||
558 | s = rcu_exp_gp_seq_snap(rsp); | ||
559 | if (exp_funnel_lock(rsp, s)) | ||
560 | return; /* Someone else did our work for us. */ | ||
561 | |||
562 | /* Marshall arguments and schedule the expedited grace period. */ | ||
563 | rew.rew_func = func; | ||
564 | rew.rew_rsp = rsp; | ||
565 | rew.rew_s = s; | ||
566 | INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp); | ||
567 | schedule_work(&rew.rew_work); | ||
568 | |||
569 | /* Wait for expedited grace period to complete. */ | ||
570 | rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); | ||
571 | rnp = rcu_get_root(rsp); | ||
572 | wait_event(rnp->exp_wq[(s >> 1) & 0x3], | ||
573 | sync_exp_work_done(rsp, | ||
574 | &rdp->exp_workdone0, s)); | ||
575 | |||
576 | /* Let the next expedited grace period start. */ | ||
577 | mutex_unlock(&rsp->exp_mutex); | ||
578 | } | ||
579 | |||
519 | /** | 580 | /** |
520 | * synchronize_sched_expedited - Brute-force RCU-sched grace period | 581 | * synchronize_sched_expedited - Brute-force RCU-sched grace period |
521 | * | 582 | * |
@@ -534,29 +595,13 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) | |||
534 | */ | 595 | */ |
535 | void synchronize_sched_expedited(void) | 596 | void synchronize_sched_expedited(void) |
536 | { | 597 | { |
537 | unsigned long s; | ||
538 | struct rcu_state *rsp = &rcu_sched_state; | 598 | struct rcu_state *rsp = &rcu_sched_state; |
539 | 599 | ||
540 | /* If only one CPU, this is automatically a grace period. */ | 600 | /* If only one CPU, this is automatically a grace period. */ |
541 | if (rcu_blocking_is_gp()) | 601 | if (rcu_blocking_is_gp()) |
542 | return; | 602 | return; |
543 | 603 | ||
544 | /* If expedited grace periods are prohibited, fall back to normal. */ | 604 | _synchronize_rcu_expedited(rsp, sync_sched_exp_handler); |
545 | if (rcu_gp_is_normal()) { | ||
546 | wait_rcu_gp(call_rcu_sched); | ||
547 | return; | ||
548 | } | ||
549 | |||
550 | /* Take a snapshot of the sequence number. */ | ||
551 | s = rcu_exp_gp_seq_snap(rsp); | ||
552 | if (exp_funnel_lock(rsp, s)) | ||
553 | return; /* Someone else did our work for us. */ | ||
554 | |||
555 | /* Initialize the rcu_node tree in preparation for the wait. */ | ||
556 | sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler); | ||
557 | |||
558 | /* Wait and clean up, including waking everyone. */ | ||
559 | rcu_exp_wait_wake(rsp, s); | ||
560 | } | 605 | } |
561 | EXPORT_SYMBOL_GPL(synchronize_sched_expedited); | 606 | EXPORT_SYMBOL_GPL(synchronize_sched_expedited); |
562 | 607 | ||
@@ -620,23 +665,8 @@ static void sync_rcu_exp_handler(void *info) | |||
620 | void synchronize_rcu_expedited(void) | 665 | void synchronize_rcu_expedited(void) |
621 | { | 666 | { |
622 | struct rcu_state *rsp = rcu_state_p; | 667 | struct rcu_state *rsp = rcu_state_p; |
623 | unsigned long s; | ||
624 | |||
625 | /* If expedited grace periods are prohibited, fall back to normal. */ | ||
626 | if (rcu_gp_is_normal()) { | ||
627 | wait_rcu_gp(call_rcu); | ||
628 | return; | ||
629 | } | ||
630 | |||
631 | s = rcu_exp_gp_seq_snap(rsp); | ||
632 | if (exp_funnel_lock(rsp, s)) | ||
633 | return; /* Someone else did our work for us. */ | ||
634 | |||
635 | /* Initialize the rcu_node tree in preparation for the wait. */ | ||
636 | sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler); | ||
637 | 668 | ||
638 | /* Wait for ->blkd_tasks lists to drain, then wake everyone up. */ | 669 | _synchronize_rcu_expedited(rsp, sync_rcu_exp_handler); |
639 | rcu_exp_wait_wake(rsp, s); | ||
640 | } | 670 | } |
641 | EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); | 671 | EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); |
642 | 672 | ||
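
The _synchronize_rcu_expedited() helper above drives the grace period from a workqueue while the requester sleeps on the result. The generic shape of that hand-off, sketched with invented demo_* names (the real code waits on the rcu_node exp_wq and a sequence-number test rather than a flag):

#include <linux/kernel.h>
#include <linux/wait.h>
#include <linux/workqueue.h>

struct demo_req {
        struct work_struct work;
        int done;
        wait_queue_head_t wq;
};

static void demo_handler(struct work_struct *w)
{
        struct demo_req *req = container_of(w, struct demo_req, work);

        /* ... do the heavy lifting in process context ... */
        req->done = 1;
        wake_up(&req->wq);
}

static void demo_run_and_wait(void)
{
        struct demo_req req = { .done = 0 };

        init_waitqueue_head(&req.wq);
        INIT_WORK_ONSTACK(&req.work, demo_handler);
        schedule_work(&req.work);
        wait_event(req.wq, req.done);           /* sleep until the handler finishes */
        destroy_work_on_stack(&req.work);
}

Running the grace-period machinery from a workqueue rather than in the caller is also what allows the stall-wait code above to WARN on signals instead of handling them: workqueue workers are never signaled.
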
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0082fce402a0..85c5a883c6e3 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h | |||
@@ -2173,6 +2173,7 @@ static int rcu_nocb_kthread(void *arg) | |||
2173 | cl++; | 2173 | cl++; |
2174 | c++; | 2174 | c++; |
2175 | local_bh_enable(); | 2175 | local_bh_enable(); |
2176 | cond_resched_rcu_qs(); | ||
2176 | list = next; | 2177 | list = next; |
2177 | } | 2178 | } |
2178 | trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1); | 2179 | trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1); |
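
The one-line tree_plugin.h change lets the no-CBs kthread report a quiescent state between callbacks rather than relying on the scheduler. The usual shape of such a loop, as a sketch; demo_handle_one_item() is a hypothetical work item.

#include <linux/kthread.h>
#include <linux/rcupdate.h>

static void demo_handle_one_item(void)  /* hypothetical per-iteration work */
{
}

static int demo_worker(void *arg)
{
        while (!kthread_should_stop()) {
                demo_handle_one_item();
                /* Yield if needed and note a voluntary RCU quiescent state. */
                cond_resched_rcu_qs();
        }
        return 0;
}
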
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 86782f9a4604..b1f28972872c 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c | |||
@@ -185,16 +185,17 @@ static int show_rcuexp(struct seq_file *m, void *v) | |||
185 | int cpu; | 185 | int cpu; |
186 | struct rcu_state *rsp = (struct rcu_state *)m->private; | 186 | struct rcu_state *rsp = (struct rcu_state *)m->private; |
187 | struct rcu_data *rdp; | 187 | struct rcu_data *rdp; |
188 | unsigned long s1 = 0, s2 = 0, s3 = 0; | 188 | unsigned long s0 = 0, s1 = 0, s2 = 0, s3 = 0; |
189 | 189 | ||
190 | for_each_possible_cpu(cpu) { | 190 | for_each_possible_cpu(cpu) { |
191 | rdp = per_cpu_ptr(rsp->rda, cpu); | 191 | rdp = per_cpu_ptr(rsp->rda, cpu); |
192 | s0 += atomic_long_read(&rdp->exp_workdone0); | ||
192 | s1 += atomic_long_read(&rdp->exp_workdone1); | 193 | s1 += atomic_long_read(&rdp->exp_workdone1); |
193 | s2 += atomic_long_read(&rdp->exp_workdone2); | 194 | s2 += atomic_long_read(&rdp->exp_workdone2); |
194 | s3 += atomic_long_read(&rdp->exp_workdone3); | 195 | s3 += atomic_long_read(&rdp->exp_workdone3); |
195 | } | 196 | } |
196 | seq_printf(m, "s=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", | 197 | seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", |
197 | rsp->expedited_sequence, s1, s2, s3, | 198 | rsp->expedited_sequence, s0, s1, s2, s3, |
198 | atomic_long_read(&rsp->expedited_normal), | 199 | atomic_long_read(&rsp->expedited_normal), |
199 | atomic_read(&rsp->expedited_need_qs), | 200 | atomic_read(&rsp->expedited_need_qs), |
200 | rsp->expedited_sequence / 2); | 201 | rsp->expedited_sequence / 2); |
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index f0d8322bc3ec..f19271dce0a9 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c | |||
@@ -46,7 +46,7 @@ | |||
46 | #include <linux/export.h> | 46 | #include <linux/export.h> |
47 | #include <linux/hardirq.h> | 47 | #include <linux/hardirq.h> |
48 | #include <linux/delay.h> | 48 | #include <linux/delay.h> |
49 | #include <linux/module.h> | 49 | #include <linux/moduleparam.h> |
50 | #include <linux/kthread.h> | 50 | #include <linux/kthread.h> |
51 | #include <linux/tick.h> | 51 | #include <linux/tick.h> |
52 | 52 | ||
@@ -54,7 +54,6 @@ | |||
54 | 54 | ||
55 | #include "rcu.h" | 55 | #include "rcu.h" |
56 | 56 | ||
57 | MODULE_ALIAS("rcupdate"); | ||
58 | #ifdef MODULE_PARAM_PREFIX | 57 | #ifdef MODULE_PARAM_PREFIX |
59 | #undef MODULE_PARAM_PREFIX | 58 | #undef MODULE_PARAM_PREFIX |
60 | #endif | 59 | #endif |
diff --git a/kernel/relay.c b/kernel/relay.c index d797502140b9..da79a109dbeb 100644 --- a/kernel/relay.c +++ b/kernel/relay.c | |||
@@ -214,7 +214,7 @@ static void relay_destroy_buf(struct rchan_buf *buf) | |||
214 | __free_page(buf->page_array[i]); | 214 | __free_page(buf->page_array[i]); |
215 | relay_free_page_array(buf->page_array); | 215 | relay_free_page_array(buf->page_array); |
216 | } | 216 | } |
217 | chan->buf[buf->cpu] = NULL; | 217 | *per_cpu_ptr(chan->buf, buf->cpu) = NULL; |
218 | kfree(buf->padding); | 218 | kfree(buf->padding); |
219 | kfree(buf); | 219 | kfree(buf); |
220 | kref_put(&chan->kref, relay_destroy_channel); | 220 | kref_put(&chan->kref, relay_destroy_channel); |
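
The relay.c conversion replaces the fixed chan->buf[NR_CPUS] array with an alloc_percpu'd array of buffer pointers, accessed via *per_cpu_ptr(). A reduced sketch of that layout with invented demo_* types:

#include <linux/percpu.h>

struct demo_buf {
        unsigned int cpu;
};

struct demo_chan {
        struct demo_buf * __percpu *buf;        /* one pointer slot per CPU */
};

static int demo_chan_init(struct demo_chan *chan)
{
        chan->buf = alloc_percpu(struct demo_buf *);
        if (!chan->buf)
                return -ENOMEM;
        /* alloc_percpu() zero-fills the area, so every CPU starts out NULL. */
        return 0;
}

static struct demo_buf *demo_chan_buf(struct demo_chan *chan, unsigned int cpu)
{
        return *per_cpu_ptr(chan->buf, cpu);
}

static void demo_chan_free(struct demo_chan *chan)
{
        free_percpu(chan->buf);
}
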
@@ -328,13 +328,15 @@ static struct rchan_callbacks default_channel_callbacks = { | |||
328 | 328 | ||
329 | /** | 329 | /** |
330 | * wakeup_readers - wake up readers waiting on a channel | 330 | * wakeup_readers - wake up readers waiting on a channel |
331 | * @data: contains the channel buffer | 331 | * @work: contains the channel buffer |
332 | * | 332 | * |
333 | * This is the timer function used to defer reader waking. | 333 | * This is the function used to defer reader waking |
334 | */ | 334 | */ |
335 | static void wakeup_readers(unsigned long data) | 335 | static void wakeup_readers(struct irq_work *work) |
336 | { | 336 | { |
337 | struct rchan_buf *buf = (struct rchan_buf *)data; | 337 | struct rchan_buf *buf; |
338 | |||
339 | buf = container_of(work, struct rchan_buf, wakeup_work); | ||
338 | wake_up_interruptible(&buf->read_wait); | 340 | wake_up_interruptible(&buf->read_wait); |
339 | } | 341 | } |
340 | 342 | ||
@@ -352,9 +354,10 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init) | |||
352 | if (init) { | 354 | if (init) { |
353 | init_waitqueue_head(&buf->read_wait); | 355 | init_waitqueue_head(&buf->read_wait); |
354 | kref_init(&buf->kref); | 356 | kref_init(&buf->kref); |
355 | setup_timer(&buf->timer, wakeup_readers, (unsigned long)buf); | 357 | init_irq_work(&buf->wakeup_work, wakeup_readers); |
356 | } else | 358 | } else { |
357 | del_timer_sync(&buf->timer); | 359 | irq_work_sync(&buf->wakeup_work); |
360 | } | ||
358 | 361 | ||
359 | buf->subbufs_produced = 0; | 362 | buf->subbufs_produced = 0; |
360 | buf->subbufs_consumed = 0; | 363 | buf->subbufs_consumed = 0; |
@@ -382,20 +385,21 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init) | |||
382 | */ | 385 | */ |
383 | void relay_reset(struct rchan *chan) | 386 | void relay_reset(struct rchan *chan) |
384 | { | 387 | { |
388 | struct rchan_buf *buf; | ||
385 | unsigned int i; | 389 | unsigned int i; |
386 | 390 | ||
387 | if (!chan) | 391 | if (!chan) |
388 | return; | 392 | return; |
389 | 393 | ||
390 | if (chan->is_global && chan->buf[0]) { | 394 | if (chan->is_global && (buf = *per_cpu_ptr(chan->buf, 0))) { |
391 | __relay_reset(chan->buf[0], 0); | 395 | __relay_reset(buf, 0); |
392 | return; | 396 | return; |
393 | } | 397 | } |
394 | 398 | ||
395 | mutex_lock(&relay_channels_mutex); | 399 | mutex_lock(&relay_channels_mutex); |
396 | for_each_possible_cpu(i) | 400 | for_each_possible_cpu(i) |
397 | if (chan->buf[i]) | 401 | if ((buf = *per_cpu_ptr(chan->buf, i))) |
398 | __relay_reset(chan->buf[i], 0); | 402 | __relay_reset(buf, 0); |
399 | mutex_unlock(&relay_channels_mutex); | 403 | mutex_unlock(&relay_channels_mutex); |
400 | } | 404 | } |
401 | EXPORT_SYMBOL_GPL(relay_reset); | 405 | EXPORT_SYMBOL_GPL(relay_reset); |
@@ -440,7 +444,7 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu) | |||
440 | struct dentry *dentry; | 444 | struct dentry *dentry; |
441 | 445 | ||
442 | if (chan->is_global) | 446 | if (chan->is_global) |
443 | return chan->buf[0]; | 447 | return *per_cpu_ptr(chan->buf, 0); |
444 | 448 | ||
445 | buf = relay_create_buf(chan); | 449 | buf = relay_create_buf(chan); |
446 | if (!buf) | 450 | if (!buf) |
@@ -464,7 +468,7 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu) | |||
464 | __relay_reset(buf, 1); | 468 | __relay_reset(buf, 1); |
465 | 469 | ||
466 | if(chan->is_global) { | 470 | if(chan->is_global) { |
467 | chan->buf[0] = buf; | 471 | *per_cpu_ptr(chan->buf, 0) = buf; |
468 | buf->cpu = 0; | 472 | buf->cpu = 0; |
469 | } | 473 | } |
470 | 474 | ||
@@ -486,7 +490,7 @@ free_buf: | |||
486 | static void relay_close_buf(struct rchan_buf *buf) | 490 | static void relay_close_buf(struct rchan_buf *buf) |
487 | { | 491 | { |
488 | buf->finalized = 1; | 492 | buf->finalized = 1; |
489 | del_timer_sync(&buf->timer); | 493 | irq_work_sync(&buf->wakeup_work); |
490 | buf->chan->cb->remove_buf_file(buf->dentry); | 494 | buf->chan->cb->remove_buf_file(buf->dentry); |
491 | kref_put(&buf->kref, relay_remove_buf); | 495 | kref_put(&buf->kref, relay_remove_buf); |
492 | } | 496 | } |
@@ -512,46 +516,25 @@ static void setup_callbacks(struct rchan *chan, | |||
512 | chan->cb = cb; | 516 | chan->cb = cb; |
513 | } | 517 | } |
514 | 518 | ||
515 | /** | 519 | int relay_prepare_cpu(unsigned int cpu) |
516 | * relay_hotcpu_callback - CPU hotplug callback | ||
517 | * @nb: notifier block | ||
518 | * @action: hotplug action to take | ||
519 | * @hcpu: CPU number | ||
520 | * | ||
521 | * Returns the success/failure of the operation. (%NOTIFY_OK, %NOTIFY_BAD) | ||
522 | */ | ||
523 | static int relay_hotcpu_callback(struct notifier_block *nb, | ||
524 | unsigned long action, | ||
525 | void *hcpu) | ||
526 | { | 520 | { |
527 | unsigned int hotcpu = (unsigned long)hcpu; | ||
528 | struct rchan *chan; | 521 | struct rchan *chan; |
522 | struct rchan_buf *buf; | ||
529 | 523 | ||
530 | switch(action) { | 524 | mutex_lock(&relay_channels_mutex); |
531 | case CPU_UP_PREPARE: | 525 | list_for_each_entry(chan, &relay_channels, list) { |
532 | case CPU_UP_PREPARE_FROZEN: | 526 | if ((buf = *per_cpu_ptr(chan->buf, cpu))) |
533 | mutex_lock(&relay_channels_mutex); | 527 | continue; |
534 | list_for_each_entry(chan, &relay_channels, list) { | 528 | buf = relay_open_buf(chan, cpu); |
535 | if (chan->buf[hotcpu]) | 529 | if (!buf) { |
536 | continue; | 530 | pr_err("relay: cpu %d buffer creation failed\n", cpu); |
537 | chan->buf[hotcpu] = relay_open_buf(chan, hotcpu); | 531 | mutex_unlock(&relay_channels_mutex); |
538 | if(!chan->buf[hotcpu]) { | 532 | return -ENOMEM; |
539 | printk(KERN_ERR | ||
540 | "relay_hotcpu_callback: cpu %d buffer " | ||
541 | "creation failed\n", hotcpu); | ||
542 | mutex_unlock(&relay_channels_mutex); | ||
543 | return notifier_from_errno(-ENOMEM); | ||
544 | } | ||
545 | } | 533 | } |
546 | mutex_unlock(&relay_channels_mutex); | 534 | *per_cpu_ptr(chan->buf, cpu) = buf; |
547 | break; | ||
548 | case CPU_DEAD: | ||
549 | case CPU_DEAD_FROZEN: | ||
550 | /* No need to flush the cpu : will be flushed upon | ||
551 | * final relay_flush() call. */ | ||
552 | break; | ||
553 | } | 535 | } |
554 | return NOTIFY_OK; | 536 | mutex_unlock(&relay_channels_mutex); |
537 | return 0; | ||
555 | } | 538 | } |
556 | 539 | ||
557 | /** | 540 | /** |
@@ -583,6 +566,7 @@ struct rchan *relay_open(const char *base_filename, | |||
583 | { | 566 | { |
584 | unsigned int i; | 567 | unsigned int i; |
585 | struct rchan *chan; | 568 | struct rchan *chan; |
569 | struct rchan_buf *buf; | ||
586 | 570 | ||
587 | if (!(subbuf_size && n_subbufs)) | 571 | if (!(subbuf_size && n_subbufs)) |
588 | return NULL; | 572 | return NULL; |
@@ -593,6 +577,7 @@ struct rchan *relay_open(const char *base_filename, | |||
593 | if (!chan) | 577 | if (!chan) |
594 | return NULL; | 578 | return NULL; |
595 | 579 | ||
580 | chan->buf = alloc_percpu(struct rchan_buf *); | ||
596 | chan->version = RELAYFS_CHANNEL_VERSION; | 581 | chan->version = RELAYFS_CHANNEL_VERSION; |
597 | chan->n_subbufs = n_subbufs; | 582 | chan->n_subbufs = n_subbufs; |
598 | chan->subbuf_size = subbuf_size; | 583 | chan->subbuf_size = subbuf_size; |
@@ -608,9 +593,10 @@ struct rchan *relay_open(const char *base_filename, | |||
608 | 593 | ||
609 | mutex_lock(&relay_channels_mutex); | 594 | mutex_lock(&relay_channels_mutex); |
610 | for_each_online_cpu(i) { | 595 | for_each_online_cpu(i) { |
611 | chan->buf[i] = relay_open_buf(chan, i); | 596 | buf = relay_open_buf(chan, i); |
612 | if (!chan->buf[i]) | 597 | if (!buf) |
613 | goto free_bufs; | 598 | goto free_bufs; |
599 | *per_cpu_ptr(chan->buf, i) = buf; | ||
614 | } | 600 | } |
615 | list_add(&chan->list, &relay_channels); | 601 | list_add(&chan->list, &relay_channels); |
616 | mutex_unlock(&relay_channels_mutex); | 602 | mutex_unlock(&relay_channels_mutex); |
@@ -619,8 +605,8 @@ struct rchan *relay_open(const char *base_filename, | |||
619 | 605 | ||
620 | free_bufs: | 606 | free_bufs: |
621 | for_each_possible_cpu(i) { | 607 | for_each_possible_cpu(i) { |
622 | if (chan->buf[i]) | 608 | if ((buf = *per_cpu_ptr(chan->buf, i))) |
623 | relay_close_buf(chan->buf[i]); | 609 | relay_close_buf(buf); |
624 | } | 610 | } |
625 | 611 | ||
626 | kref_put(&chan->kref, relay_destroy_channel); | 612 | kref_put(&chan->kref, relay_destroy_channel); |
@@ -666,6 +652,7 @@ int relay_late_setup_files(struct rchan *chan, | |||
666 | unsigned int i, curr_cpu; | 652 | unsigned int i, curr_cpu; |
667 | unsigned long flags; | 653 | unsigned long flags; |
668 | struct dentry *dentry; | 654 | struct dentry *dentry; |
655 | struct rchan_buf *buf; | ||
669 | struct rchan_percpu_buf_dispatcher disp; | 656 | struct rchan_percpu_buf_dispatcher disp; |
670 | 657 | ||
671 | if (!chan || !base_filename) | 658 | if (!chan || !base_filename) |
@@ -684,10 +671,11 @@ int relay_late_setup_files(struct rchan *chan, | |||
684 | 671 | ||
685 | if (chan->is_global) { | 672 | if (chan->is_global) { |
686 | err = -EINVAL; | 673 | err = -EINVAL; |
687 | if (!WARN_ON_ONCE(!chan->buf[0])) { | 674 | buf = *per_cpu_ptr(chan->buf, 0); |
688 | dentry = relay_create_buf_file(chan, chan->buf[0], 0); | 675 | if (!WARN_ON_ONCE(!buf)) { |
676 | dentry = relay_create_buf_file(chan, buf, 0); | ||
689 | if (dentry && !WARN_ON_ONCE(!chan->is_global)) { | 677 | if (dentry && !WARN_ON_ONCE(!chan->is_global)) { |
690 | relay_set_buf_dentry(chan->buf[0], dentry); | 678 | relay_set_buf_dentry(buf, dentry); |
691 | err = 0; | 679 | err = 0; |
692 | } | 680 | } |
693 | } | 681 | } |
@@ -702,13 +690,14 @@ int relay_late_setup_files(struct rchan *chan, | |||
702 | * on all currently online CPUs. | 690 | * on all currently online CPUs. |
703 | */ | 691 | */ |
704 | for_each_online_cpu(i) { | 692 | for_each_online_cpu(i) { |
705 | if (unlikely(!chan->buf[i])) { | 693 | buf = *per_cpu_ptr(chan->buf, i); |
694 | if (unlikely(!buf)) { | ||
706 | WARN_ONCE(1, KERN_ERR "CPU has no buffer!\n"); | 695 | WARN_ONCE(1, KERN_ERR "CPU has no buffer!\n"); |
707 | err = -EINVAL; | 696 | err = -EINVAL; |
708 | break; | 697 | break; |
709 | } | 698 | } |
710 | 699 | ||
711 | dentry = relay_create_buf_file(chan, chan->buf[i], i); | 700 | dentry = relay_create_buf_file(chan, buf, i); |
712 | if (unlikely(!dentry)) { | 701 | if (unlikely(!dentry)) { |
713 | err = -EINVAL; | 702 | err = -EINVAL; |
714 | break; | 703 | break; |
@@ -716,10 +705,10 @@ int relay_late_setup_files(struct rchan *chan, | |||
716 | 705 | ||
717 | if (curr_cpu == i) { | 706 | if (curr_cpu == i) { |
718 | local_irq_save(flags); | 707 | local_irq_save(flags); |
719 | relay_set_buf_dentry(chan->buf[i], dentry); | 708 | relay_set_buf_dentry(buf, dentry); |
720 | local_irq_restore(flags); | 709 | local_irq_restore(flags); |
721 | } else { | 710 | } else { |
722 | disp.buf = chan->buf[i]; | 711 | disp.buf = buf; |
723 | disp.dentry = dentry; | 712 | disp.dentry = dentry; |
724 | smp_mb(); | 713 | smp_mb(); |
725 | /* relay_channels_mutex must be held, so wait. */ | 714 | /* relay_channels_mutex must be held, so wait. */ |
@@ -768,14 +757,15 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) | |||
768 | buf->early_bytes += buf->chan->subbuf_size - | 757 | buf->early_bytes += buf->chan->subbuf_size - |
769 | buf->padding[old_subbuf]; | 758 | buf->padding[old_subbuf]; |
770 | smp_mb(); | 759 | smp_mb(); |
771 | if (waitqueue_active(&buf->read_wait)) | 760 | if (waitqueue_active(&buf->read_wait)) { |
772 | /* | 761 | /* |
773 | * Calling wake_up_interruptible() from here | 762 | * Calling wake_up_interruptible() from here |
774 | * will deadlock if we happen to be logging | 763 | * will deadlock if we happen to be logging |
775 | * from the scheduler (trying to re-grab | 764 | * from the scheduler (trying to re-grab |
776 | * rq->lock), so defer it. | 765 | * rq->lock), so defer it. |
777 | */ | 766 | */ |
778 | mod_timer(&buf->timer, jiffies + 1); | 767 | irq_work_queue(&buf->wakeup_work); |
768 | } | ||
779 | } | 769 | } |
780 | 770 | ||
781 | old = buf->data; | 771 | old = buf->data; |
@@ -822,11 +812,10 @@ void relay_subbufs_consumed(struct rchan *chan, | |||
822 | if (!chan) | 812 | if (!chan) |
823 | return; | 813 | return; |
824 | 814 | ||
825 | if (cpu >= NR_CPUS || !chan->buf[cpu] || | 815 | buf = *per_cpu_ptr(chan->buf, cpu); |
826 | subbufs_consumed > chan->n_subbufs) | 816 | if (cpu >= NR_CPUS || !buf || subbufs_consumed > chan->n_subbufs) |
827 | return; | 817 | return; |
828 | 818 | ||
829 | buf = chan->buf[cpu]; | ||
830 | if (subbufs_consumed > buf->subbufs_produced - buf->subbufs_consumed) | 819 | if (subbufs_consumed > buf->subbufs_produced - buf->subbufs_consumed) |
831 | buf->subbufs_consumed = buf->subbufs_produced; | 820 | buf->subbufs_consumed = buf->subbufs_produced; |
832 | else | 821 | else |
@@ -842,18 +831,19 @@ EXPORT_SYMBOL_GPL(relay_subbufs_consumed); | |||
842 | */ | 831 | */ |
843 | void relay_close(struct rchan *chan) | 832 | void relay_close(struct rchan *chan) |
844 | { | 833 | { |
834 | struct rchan_buf *buf; | ||
845 | unsigned int i; | 835 | unsigned int i; |
846 | 836 | ||
847 | if (!chan) | 837 | if (!chan) |
848 | return; | 838 | return; |
849 | 839 | ||
850 | mutex_lock(&relay_channels_mutex); | 840 | mutex_lock(&relay_channels_mutex); |
851 | if (chan->is_global && chan->buf[0]) | 841 | if (chan->is_global && (buf = *per_cpu_ptr(chan->buf, 0))) |
852 | relay_close_buf(chan->buf[0]); | 842 | relay_close_buf(buf); |
853 | else | 843 | else |
854 | for_each_possible_cpu(i) | 844 | for_each_possible_cpu(i) |
855 | if (chan->buf[i]) | 845 | if ((buf = *per_cpu_ptr(chan->buf, i))) |
856 | relay_close_buf(chan->buf[i]); | 846 | relay_close_buf(buf); |
857 | 847 | ||
858 | if (chan->last_toobig) | 848 | if (chan->last_toobig) |
859 | printk(KERN_WARNING "relay: one or more items not logged " | 849 | printk(KERN_WARNING "relay: one or more items not logged " |
@@ -874,20 +864,21 @@ EXPORT_SYMBOL_GPL(relay_close); | |||
874 | */ | 864 | */ |
875 | void relay_flush(struct rchan *chan) | 865 | void relay_flush(struct rchan *chan) |
876 | { | 866 | { |
867 | struct rchan_buf *buf; | ||
877 | unsigned int i; | 868 | unsigned int i; |
878 | 869 | ||
879 | if (!chan) | 870 | if (!chan) |
880 | return; | 871 | return; |
881 | 872 | ||
882 | if (chan->is_global && chan->buf[0]) { | 873 | if (chan->is_global && (buf = *per_cpu_ptr(chan->buf, 0))) { |
883 | relay_switch_subbuf(chan->buf[0], 0); | 874 | relay_switch_subbuf(buf, 0); |
884 | return; | 875 | return; |
885 | } | 876 | } |
886 | 877 | ||
887 | mutex_lock(&relay_channels_mutex); | 878 | mutex_lock(&relay_channels_mutex); |
888 | for_each_possible_cpu(i) | 879 | for_each_possible_cpu(i) |
889 | if (chan->buf[i]) | 880 | if ((buf = *per_cpu_ptr(chan->buf, i))) |
890 | relay_switch_subbuf(chan->buf[i], 0); | 881 | relay_switch_subbuf(buf, 0); |
891 | mutex_unlock(&relay_channels_mutex); | 882 | mutex_unlock(&relay_channels_mutex); |
892 | } | 883 | } |
893 | EXPORT_SYMBOL_GPL(relay_flush); | 884 | EXPORT_SYMBOL_GPL(relay_flush); |
@@ -1121,51 +1112,23 @@ static size_t relay_file_read_end_pos(struct rchan_buf *buf, | |||
1121 | return end_pos; | 1112 | return end_pos; |
1122 | } | 1113 | } |
1123 | 1114 | ||
1124 | /* | 1115 | static ssize_t relay_file_read(struct file *filp, |
1125 | * subbuf_read_actor - read up to one subbuf's worth of data | 1116 | char __user *buffer, |
1126 | */ | 1117 | size_t count, |
1127 | static int subbuf_read_actor(size_t read_start, | 1118 | loff_t *ppos) |
1128 | struct rchan_buf *buf, | ||
1129 | size_t avail, | ||
1130 | read_descriptor_t *desc) | ||
1131 | { | ||
1132 | void *from; | ||
1133 | int ret = 0; | ||
1134 | |||
1135 | from = buf->start + read_start; | ||
1136 | ret = avail; | ||
1137 | if (copy_to_user(desc->arg.buf, from, avail)) { | ||
1138 | desc->error = -EFAULT; | ||
1139 | ret = 0; | ||
1140 | } | ||
1141 | desc->arg.data += ret; | ||
1142 | desc->written += ret; | ||
1143 | desc->count -= ret; | ||
1144 | |||
1145 | return ret; | ||
1146 | } | ||
1147 | |||
1148 | typedef int (*subbuf_actor_t) (size_t read_start, | ||
1149 | struct rchan_buf *buf, | ||
1150 | size_t avail, | ||
1151 | read_descriptor_t *desc); | ||
1152 | |||
1153 | /* | ||
1154 | * relay_file_read_subbufs - read count bytes, bridging subbuf boundaries | ||
1155 | */ | ||
1156 | static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, | ||
1157 | subbuf_actor_t subbuf_actor, | ||
1158 | read_descriptor_t *desc) | ||
1159 | { | 1119 | { |
1160 | struct rchan_buf *buf = filp->private_data; | 1120 | struct rchan_buf *buf = filp->private_data; |
1161 | size_t read_start, avail; | 1121 | size_t read_start, avail; |
1122 | size_t written = 0; | ||
1162 | int ret; | 1123 | int ret; |
1163 | 1124 | ||
1164 | if (!desc->count) | 1125 | if (!count) |
1165 | return 0; | 1126 | return 0; |
1166 | 1127 | ||
1167 | inode_lock(file_inode(filp)); | 1128 | inode_lock(file_inode(filp)); |
1168 | do { | 1129 | do { |
1130 | void *from; | ||
1131 | |||
1169 | if (!relay_file_read_avail(buf, *ppos)) | 1132 | if (!relay_file_read_avail(buf, *ppos)) |
1170 | break; | 1133 | break; |
1171 | 1134 | ||
@@ -1174,32 +1137,22 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, | |||
1174 | if (!avail) | 1137 | if (!avail) |
1175 | break; | 1138 | break; |
1176 | 1139 | ||
1177 | avail = min(desc->count, avail); | 1140 | avail = min(count, avail); |
1178 | ret = subbuf_actor(read_start, buf, avail, desc); | 1141 | from = buf->start + read_start; |
1179 | if (desc->error < 0) | 1142 | ret = avail; |
1143 | if (copy_to_user(buffer, from, avail)) | ||
1180 | break; | 1144 | break; |
1181 | 1145 | ||
1182 | if (ret) { | 1146 | buffer += ret; |
1183 | relay_file_read_consume(buf, read_start, ret); | 1147 | written += ret; |
1184 | *ppos = relay_file_read_end_pos(buf, read_start, ret); | 1148 | count -= ret; |
1185 | } | ||
1186 | } while (desc->count && ret); | ||
1187 | inode_unlock(file_inode(filp)); | ||
1188 | 1149 | ||
1189 | return desc->written; | 1150 | relay_file_read_consume(buf, read_start, ret); |
1190 | } | 1151 | *ppos = relay_file_read_end_pos(buf, read_start, ret); |
1152 | } while (count); | ||
1153 | inode_unlock(file_inode(filp)); | ||
1191 | 1154 | ||
1192 | static ssize_t relay_file_read(struct file *filp, | 1155 | return written; |
1193 | char __user *buffer, | ||
1194 | size_t count, | ||
1195 | loff_t *ppos) | ||
1196 | { | ||
1197 | read_descriptor_t desc; | ||
1198 | desc.written = 0; | ||
1199 | desc.count = count; | ||
1200 | desc.arg.buf = buffer; | ||
1201 | desc.error = 0; | ||
1202 | return relay_file_read_subbufs(filp, ppos, subbuf_read_actor, &desc); | ||
1203 | } | 1156 | } |
1204 | 1157 | ||
1205 | static void relay_consume_bytes(struct rchan_buf *rbuf, int bytes_consumed) | 1158 | static void relay_consume_bytes(struct rchan_buf *rbuf, int bytes_consumed) |
@@ -1377,12 +1330,3 @@ const struct file_operations relay_file_operations = { | |||
1377 | .splice_read = relay_file_splice_read, | 1330 | .splice_read = relay_file_splice_read, |
1378 | }; | 1331 | }; |
1379 | EXPORT_SYMBOL_GPL(relay_file_operations); | 1332 | EXPORT_SYMBOL_GPL(relay_file_operations); |
1380 | |||
1381 | static __init int relay_init(void) | ||
1382 | { | ||
1383 | |||
1384 | hotcpu_notifier(relay_hotcpu_callback, 0); | ||
1385 | return 0; | ||
1386 | } | ||
1387 | |||
1388 | early_initcall(relay_init); | ||
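
relay's deferred reader wakeup moves from a one-jiffy timer to an irq_work, which fires as soon as interrupts are enabled and is safe to queue while holding scheduler locks. A reduced sketch of the pattern; struct and function names are invented for illustration.

#include <linux/irq_work.h>
#include <linux/kernel.h>
#include <linux/wait.h>

struct demo_buf {
        wait_queue_head_t read_wait;
        struct irq_work wakeup_work;
};

static void demo_wakeup(struct irq_work *work)
{
        struct demo_buf *buf = container_of(work, struct demo_buf, wakeup_work);

        wake_up_interruptible(&buf->read_wait);
}

static void demo_buf_init(struct demo_buf *buf)
{
        init_waitqueue_head(&buf->read_wait);
        init_irq_work(&buf->wakeup_work, demo_wakeup);
}

/* May be called with rq->lock (or other scheduler locks) held. */
static void demo_data_ready(struct demo_buf *buf)
{
        if (waitqueue_active(&buf->read_wait))
                irq_work_queue(&buf->wakeup_work);      /* wakeup runs once IRQs are on */
}

static void demo_buf_close(struct demo_buf *buf)
{
        irq_work_sync(&buf->wakeup_work);       /* wait out any pending wakeup */
}
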
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index a5d966cb8891..f1c8fd566246 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c | |||
@@ -111,10 +111,13 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg) | |||
111 | { | 111 | { |
112 | if (tg != &root_task_group) | 112 | if (tg != &root_task_group) |
113 | return false; | 113 | return false; |
114 | |||
115 | /* | 114 | /* |
116 | * We can only assume the task group can't go away on us if | 115 | * If we race with autogroup_move_group() the caller can use the old |
117 | * autogroup_move_group() can see us on ->thread_group list. | 116 | * value of signal->autogroup but in this case sched_move_task() will |
117 | * be called again before autogroup_kref_put(). | ||
118 | * | ||
119 | * However, there is no way sched_autogroup_exit_task() could tell us | ||
120 | * to avoid autogroup->tg, so we abuse PF_EXITING flag for this case. | ||
118 | */ | 121 | */ |
119 | if (p->flags & PF_EXITING) | 122 | if (p->flags & PF_EXITING) |
120 | return false; | 123 | return false; |
@@ -122,6 +125,16 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg) | |||
122 | return true; | 125 | return true; |
123 | } | 126 | } |
124 | 127 | ||
128 | void sched_autogroup_exit_task(struct task_struct *p) | ||
129 | { | ||
130 | /* | ||
131 | * We are going to call exit_notify() and autogroup_move_group() can't | ||
132 | * see this thread after that: we can no longer use signal->autogroup. | ||
133 | * See the PF_EXITING check in task_wants_autogroup(). | ||
134 | */ | ||
135 | sched_move_task(p); | ||
136 | } | ||
137 | |||
125 | static void | 138 | static void |
126 | autogroup_move_group(struct task_struct *p, struct autogroup *ag) | 139 | autogroup_move_group(struct task_struct *p, struct autogroup *ag) |
127 | { | 140 | { |
@@ -138,13 +151,20 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) | |||
138 | } | 151 | } |
139 | 152 | ||
140 | p->signal->autogroup = autogroup_kref_get(ag); | 153 | p->signal->autogroup = autogroup_kref_get(ag); |
141 | 154 | /* | |
142 | if (!READ_ONCE(sysctl_sched_autogroup_enabled)) | 155 | * We can't avoid sched_move_task() after we changed signal->autogroup, |
143 | goto out; | 156 | * this process can already run with task_group() == prev->tg or we can |
144 | 157 | * race with cgroup code which can read autogroup = prev under rq->lock. | |
158 | * In the latter case for_each_thread() can not miss a migrating thread, | ||
159 | * cpu_cgroup_attach() must not be possible after cgroup_exit() and it | ||
160 | * can't be removed from thread list, we hold ->siglock. | ||
161 | * | ||
162 | * If an exiting thread was already removed from thread list we rely on | ||
163 | * sched_autogroup_exit_task(). | ||
164 | */ | ||
145 | for_each_thread(p, t) | 165 | for_each_thread(p, t) |
146 | sched_move_task(t); | 166 | sched_move_task(t); |
147 | out: | 167 | |
148 | unlock_task_sighand(p, &flags); | 168 | unlock_task_sighand(p, &flags); |
149 | autogroup_kref_put(prev); | 169 | autogroup_kref_put(prev); |
150 | } | 170 | } |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 44817c640e99..154fd689fe02 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -581,6 +581,8 @@ static bool wake_up_full_nohz_cpu(int cpu) | |||
581 | * If needed we can still optimize that later with an | 581 | * If needed we can still optimize that later with an |
582 | * empty IRQ. | 582 | * empty IRQ. |
583 | */ | 583 | */ |
584 | if (cpu_is_offline(cpu)) | ||
585 | return true; /* Don't try to wake offline CPUs. */ | ||
584 | if (tick_nohz_full_cpu(cpu)) { | 586 | if (tick_nohz_full_cpu(cpu)) { |
585 | if (cpu != smp_processor_id() || | 587 | if (cpu != smp_processor_id() || |
586 | tick_nohz_tick_stopped()) | 588 | tick_nohz_tick_stopped()) |
@@ -591,6 +593,11 @@ static bool wake_up_full_nohz_cpu(int cpu) | |||
591 | return false; | 593 | return false; |
592 | } | 594 | } |
593 | 595 | ||
596 | /* | ||
597 | * Wake up the specified CPU. If the CPU is going offline, it is the | ||
598 | * caller's responsibility to deal with the lost wakeup, for example, | ||
599 | * by hooking into the CPU_DEAD notifier like timers and hrtimers do. | ||
600 | */ | ||
594 | void wake_up_nohz_cpu(int cpu) | 601 | void wake_up_nohz_cpu(int cpu) |
595 | { | 602 | { |
596 | if (!wake_up_full_nohz_cpu(cpu)) | 603 | if (!wake_up_full_nohz_cpu(cpu)) |
@@ -1063,8 +1070,12 @@ static int migration_cpu_stop(void *data) | |||
1063 | * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because | 1070 | * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because |
1064 | * we're holding p->pi_lock. | 1071 | * we're holding p->pi_lock. |
1065 | */ | 1072 | */ |
1066 | if (task_rq(p) == rq && task_on_rq_queued(p)) | 1073 | if (task_rq(p) == rq) { |
1067 | rq = __migrate_task(rq, p, arg->dest_cpu); | 1074 | if (task_on_rq_queued(p)) |
1075 | rq = __migrate_task(rq, p, arg->dest_cpu); | ||
1076 | else | ||
1077 | p->wake_cpu = arg->dest_cpu; | ||
1078 | } | ||
1068 | raw_spin_unlock(&rq->lock); | 1079 | raw_spin_unlock(&rq->lock); |
1069 | raw_spin_unlock(&p->pi_lock); | 1080 | raw_spin_unlock(&p->pi_lock); |
1070 | 1081 | ||
@@ -1105,10 +1116,10 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) | |||
1105 | 1116 | ||
1106 | p->sched_class->set_cpus_allowed(p, new_mask); | 1117 | p->sched_class->set_cpus_allowed(p, new_mask); |
1107 | 1118 | ||
1108 | if (running) | ||
1109 | p->sched_class->set_curr_task(rq); | ||
1110 | if (queued) | 1119 | if (queued) |
1111 | enqueue_task(rq, p, ENQUEUE_RESTORE); | 1120 | enqueue_task(rq, p, ENQUEUE_RESTORE); |
1121 | if (running) | ||
1122 | set_curr_task(rq, p); | ||
1112 | } | 1123 | } |
1113 | 1124 | ||
1114 | /* | 1125 | /* |
@@ -1265,7 +1276,7 @@ static void __migrate_swap_task(struct task_struct *p, int cpu) | |||
1265 | /* | 1276 | /* |
1266 | * Task isn't running anymore; make it appear like we migrated | 1277 | * Task isn't running anymore; make it appear like we migrated |
1267 | * it before it went to sleep. This means on wakeup we make the | 1278 | * it before it went to sleep. This means on wakeup we make the |
1268 | * previous cpu our targer instead of where it really is. | 1279 | * previous cpu our target instead of where it really is. |
1269 | */ | 1280 | */ |
1270 | p->wake_cpu = cpu; | 1281 | p->wake_cpu = cpu; |
1271 | } | 1282 | } |
@@ -1629,23 +1640,25 @@ static inline int __set_cpus_allowed_ptr(struct task_struct *p, | |||
1629 | static void | 1640 | static void |
1630 | ttwu_stat(struct task_struct *p, int cpu, int wake_flags) | 1641 | ttwu_stat(struct task_struct *p, int cpu, int wake_flags) |
1631 | { | 1642 | { |
1632 | #ifdef CONFIG_SCHEDSTATS | 1643 | struct rq *rq; |
1633 | struct rq *rq = this_rq(); | ||
1634 | 1644 | ||
1635 | #ifdef CONFIG_SMP | 1645 | if (!schedstat_enabled()) |
1636 | int this_cpu = smp_processor_id(); | 1646 | return; |
1637 | 1647 | ||
1638 | if (cpu == this_cpu) { | 1648 | rq = this_rq(); |
1639 | schedstat_inc(rq, ttwu_local); | 1649 | |
1640 | schedstat_inc(p, se.statistics.nr_wakeups_local); | 1650 | #ifdef CONFIG_SMP |
1651 | if (cpu == rq->cpu) { | ||
1652 | schedstat_inc(rq->ttwu_local); | ||
1653 | schedstat_inc(p->se.statistics.nr_wakeups_local); | ||
1641 | } else { | 1654 | } else { |
1642 | struct sched_domain *sd; | 1655 | struct sched_domain *sd; |
1643 | 1656 | ||
1644 | schedstat_inc(p, se.statistics.nr_wakeups_remote); | 1657 | schedstat_inc(p->se.statistics.nr_wakeups_remote); |
1645 | rcu_read_lock(); | 1658 | rcu_read_lock(); |
1646 | for_each_domain(this_cpu, sd) { | 1659 | for_each_domain(rq->cpu, sd) { |
1647 | if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { | 1660 | if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { |
1648 | schedstat_inc(sd, ttwu_wake_remote); | 1661 | schedstat_inc(sd->ttwu_wake_remote); |
1649 | break; | 1662 | break; |
1650 | } | 1663 | } |
1651 | } | 1664 | } |
@@ -1653,17 +1666,14 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) | |||
1653 | } | 1666 | } |
1654 | 1667 | ||
1655 | if (wake_flags & WF_MIGRATED) | 1668 | if (wake_flags & WF_MIGRATED) |
1656 | schedstat_inc(p, se.statistics.nr_wakeups_migrate); | 1669 | schedstat_inc(p->se.statistics.nr_wakeups_migrate); |
1657 | |||
1658 | #endif /* CONFIG_SMP */ | 1670 | #endif /* CONFIG_SMP */ |
1659 | 1671 | ||
1660 | schedstat_inc(rq, ttwu_count); | 1672 | schedstat_inc(rq->ttwu_count); |
1661 | schedstat_inc(p, se.statistics.nr_wakeups); | 1673 | schedstat_inc(p->se.statistics.nr_wakeups); |
1662 | 1674 | ||
1663 | if (wake_flags & WF_SYNC) | 1675 | if (wake_flags & WF_SYNC) |
1664 | schedstat_inc(p, se.statistics.nr_wakeups_sync); | 1676 | schedstat_inc(p->se.statistics.nr_wakeups_sync); |
1665 | |||
1666 | #endif /* CONFIG_SCHEDSTATS */ | ||
1667 | } | 1677 | } |
1668 | 1678 | ||
1669 | static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) | 1679 | static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) |
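
The ttwu_stat() rewrite above reflects the reworked schedstat helpers, which now take the counter expression directly and compile away when schedstats are off; callers bail out early via schedstat_enabled() instead of wrapping everything in #ifdef CONFIG_SCHEDSTATS. An in-tree-style fragment sketching the calling convention; demo_account_wakeup() is invented and struct rq is scheduler-internal, so this is illustrative only.

/* Illustrative fragment in the style of kernel/sched/ internals. */
static void demo_account_wakeup(struct rq *rq, struct task_struct *p)
{
        if (!schedstat_enabled())
                return;                 /* static-key test, cheap when stats are off */

        schedstat_inc(rq->ttwu_count);
        schedstat_inc(p->se.statistics.nr_wakeups);
}
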
@@ -2084,8 +2094,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) | |||
2084 | 2094 | ||
2085 | ttwu_queue(p, cpu, wake_flags); | 2095 | ttwu_queue(p, cpu, wake_flags); |
2086 | stat: | 2096 | stat: |
2087 | if (schedstat_enabled()) | 2097 | ttwu_stat(p, cpu, wake_flags); |
2088 | ttwu_stat(p, cpu, wake_flags); | ||
2089 | out: | 2098 | out: |
2090 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | 2099 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
2091 | 2100 | ||
@@ -2095,6 +2104,7 @@ out: | |||
2095 | /** | 2104 | /** |
2096 | * try_to_wake_up_local - try to wake up a local task with rq lock held | 2105 | * try_to_wake_up_local - try to wake up a local task with rq lock held |
2097 | * @p: the thread to be awakened | 2106 | * @p: the thread to be awakened |
2107 | * @cookie: context's cookie for pinning | ||
2098 | * | 2108 | * |
2099 | * Put @p on the run-queue if it's not already there. The caller must | 2109 | * Put @p on the run-queue if it's not already there. The caller must |
2100 | * ensure that this_rq() is locked, @p is bound to this_rq() and not | 2110 | * ensure that this_rq() is locked, @p is bound to this_rq() and not |
@@ -2133,8 +2143,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie | |||
2133 | ttwu_activate(rq, p, ENQUEUE_WAKEUP); | 2143 | ttwu_activate(rq, p, ENQUEUE_WAKEUP); |
2134 | 2144 | ||
2135 | ttwu_do_wakeup(rq, p, 0, cookie); | 2145 | ttwu_do_wakeup(rq, p, 0, cookie); |
2136 | if (schedstat_enabled()) | 2146 | ttwu_stat(p, smp_processor_id(), 0); |
2137 | ttwu_stat(p, smp_processor_id(), 0); | ||
2138 | out: | 2147 | out: |
2139 | raw_spin_unlock(&p->pi_lock); | 2148 | raw_spin_unlock(&p->pi_lock); |
2140 | } | 2149 | } |
@@ -2772,6 +2781,10 @@ static struct rq *finish_task_switch(struct task_struct *prev) | |||
2772 | * task and put them back on the free list. | 2781 | * task and put them back on the free list. |
2773 | */ | 2782 | */ |
2774 | kprobe_flush_task(prev); | 2783 | kprobe_flush_task(prev); |
2784 | |||
2785 | /* Task is done with its stack. */ | ||
2786 | put_task_stack(prev); | ||
2787 | |||
2775 | put_task_struct(prev); | 2788 | put_task_struct(prev); |
2776 | } | 2789 | } |
2777 | 2790 | ||
@@ -3192,6 +3205,9 @@ static inline void preempt_latency_stop(int val) { } | |||
3192 | */ | 3205 | */ |
3193 | static noinline void __schedule_bug(struct task_struct *prev) | 3206 | static noinline void __schedule_bug(struct task_struct *prev) |
3194 | { | 3207 | { |
3208 | /* Save this before calling printk(), since that will clobber it */ | ||
3209 | unsigned long preempt_disable_ip = get_preempt_disable_ip(current); | ||
3210 | |||
3195 | if (oops_in_progress) | 3211 | if (oops_in_progress) |
3196 | return; | 3212 | return; |
3197 | 3213 | ||
@@ -3202,13 +3218,12 @@ static noinline void __schedule_bug(struct task_struct *prev) | |||
3202 | print_modules(); | 3218 | print_modules(); |
3203 | if (irqs_disabled()) | 3219 | if (irqs_disabled()) |
3204 | print_irqtrace_events(prev); | 3220 | print_irqtrace_events(prev); |
3205 | #ifdef CONFIG_DEBUG_PREEMPT | 3221 | if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) |
3206 | if (in_atomic_preempt_off()) { | 3222 | && in_atomic_preempt_off()) { |
3207 | pr_err("Preemption disabled at:"); | 3223 | pr_err("Preemption disabled at:"); |
3208 | print_ip_sym(current->preempt_disable_ip); | 3224 | print_ip_sym(preempt_disable_ip); |
3209 | pr_cont("\n"); | 3225 | pr_cont("\n"); |
3210 | } | 3226 | } |
3211 | #endif | ||
3212 | if (panic_on_warn) | 3227 | if (panic_on_warn) |
3213 | panic("scheduling while atomic\n"); | 3228 | panic("scheduling while atomic\n"); |
3214 | 3229 | ||
@@ -3234,7 +3249,7 @@ static inline void schedule_debug(struct task_struct *prev) | |||
3234 | 3249 | ||
3235 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); | 3250 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); |
3236 | 3251 | ||
3237 | schedstat_inc(this_rq(), sched_count); | 3252 | schedstat_inc(this_rq()->sched_count); |
3238 | } | 3253 | } |
3239 | 3254 | ||
3240 | /* | 3255 | /* |
@@ -3327,17 +3342,6 @@ static void __sched notrace __schedule(bool preempt) | |||
3327 | rq = cpu_rq(cpu); | 3342 | rq = cpu_rq(cpu); |
3328 | prev = rq->curr; | 3343 | prev = rq->curr; |
3329 | 3344 | ||
3330 | /* | ||
3331 | * do_exit() calls schedule() with preemption disabled as an exception; | ||
3332 | * however we must fix that up, otherwise the next task will see an | ||
3333 | * inconsistent (higher) preempt count. | ||
3334 | * | ||
3335 | * It also avoids the below schedule_debug() test from complaining | ||
3336 | * about this. | ||
3337 | */ | ||
3338 | if (unlikely(prev->state == TASK_DEAD)) | ||
3339 | preempt_enable_no_resched_notrace(); | ||
3340 | |||
3341 | schedule_debug(prev); | 3345 | schedule_debug(prev); |
3342 | 3346 | ||
3343 | if (sched_feat(HRTICK)) | 3347 | if (sched_feat(HRTICK)) |
@@ -3403,7 +3407,33 @@ static void __sched notrace __schedule(bool preempt) | |||
3403 | 3407 | ||
3404 | balance_callback(rq); | 3408 | balance_callback(rq); |
3405 | } | 3409 | } |
3406 | STACK_FRAME_NON_STANDARD(__schedule); /* switch_to() */ | 3410 | |
3411 | void __noreturn do_task_dead(void) | ||
3412 | { | ||
3413 | /* | ||
3414 | * The setting of TASK_RUNNING by try_to_wake_up() may be delayed | ||
3415 | * when both of the following conditions are true: | ||
3416 | * - there is a race on mmap_sem (it is acquired by | ||
3417 | * exit_mm()), and | ||
3418 | * - an SMI occurs before TASK_RUNNING is set | ||
3419 | * (or the hypervisor of a virtual machine switches to another guest). | ||
3420 | * As a result, we may become TASK_RUNNING after becoming TASK_DEAD. | ||
3421 | * | ||
3422 | * To avoid this, we have to wait until tsk->pi_lock, which may be | ||
3423 | * held by try_to_wake_up(), is released. | ||
3424 | */ | ||
3425 | smp_mb(); | ||
3426 | raw_spin_unlock_wait(¤t->pi_lock); | ||
3427 | |||
3428 | /* causes final put_task_struct in finish_task_switch(). */ | ||
3429 | __set_current_state(TASK_DEAD); | ||
3430 | current->flags |= PF_NOFREEZE; /* tell freezer to ignore us */ | ||
3431 | __schedule(false); | ||
3432 | BUG(); | ||
3433 | /* Avoid "noreturn function does return". */ | ||
3434 | for (;;) | ||
3435 | cpu_relax(); /* In case BUG() is a no-op */ | ||
3436 | } | ||
3407 | 3437 | ||
3408 | static inline void sched_submit_work(struct task_struct *tsk) | 3438 | static inline void sched_submit_work(struct task_struct *tsk) |
3409 | { | 3439 | { |
@@ -3687,10 +3717,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
3687 | 3717 | ||
3688 | p->prio = prio; | 3718 | p->prio = prio; |
3689 | 3719 | ||
3690 | if (running) | ||
3691 | p->sched_class->set_curr_task(rq); | ||
3692 | if (queued) | 3720 | if (queued) |
3693 | enqueue_task(rq, p, queue_flag); | 3721 | enqueue_task(rq, p, queue_flag); |
3722 | if (running) | ||
3723 | set_curr_task(rq, p); | ||
3694 | 3724 | ||
3695 | check_class_changed(rq, p, prev_class, oldprio); | 3725 | check_class_changed(rq, p, prev_class, oldprio); |
3696 | out_unlock: | 3726 | out_unlock: |
@@ -3704,7 +3734,8 @@ out_unlock: | |||
3704 | 3734 | ||
3705 | void set_user_nice(struct task_struct *p, long nice) | 3735 | void set_user_nice(struct task_struct *p, long nice) |
3706 | { | 3736 | { |
3707 | int old_prio, delta, queued; | 3737 | bool queued, running; |
3738 | int old_prio, delta; | ||
3708 | struct rq_flags rf; | 3739 | struct rq_flags rf; |
3709 | struct rq *rq; | 3740 | struct rq *rq; |
3710 | 3741 | ||
@@ -3726,8 +3757,11 @@ void set_user_nice(struct task_struct *p, long nice) | |||
3726 | goto out_unlock; | 3757 | goto out_unlock; |
3727 | } | 3758 | } |
3728 | queued = task_on_rq_queued(p); | 3759 | queued = task_on_rq_queued(p); |
3760 | running = task_current(rq, p); | ||
3729 | if (queued) | 3761 | if (queued) |
3730 | dequeue_task(rq, p, DEQUEUE_SAVE); | 3762 | dequeue_task(rq, p, DEQUEUE_SAVE); |
3763 | if (running) | ||
3764 | put_prev_task(rq, p); | ||
3731 | 3765 | ||
3732 | p->static_prio = NICE_TO_PRIO(nice); | 3766 | p->static_prio = NICE_TO_PRIO(nice); |
3733 | set_load_weight(p); | 3767 | set_load_weight(p); |
@@ -3744,6 +3778,8 @@ void set_user_nice(struct task_struct *p, long nice) | |||
3744 | if (delta < 0 || (delta > 0 && task_running(rq, p))) | 3778 | if (delta < 0 || (delta > 0 && task_running(rq, p))) |
3745 | resched_curr(rq); | 3779 | resched_curr(rq); |
3746 | } | 3780 | } |
3781 | if (running) | ||
3782 | set_curr_task(rq, p); | ||
3747 | out_unlock: | 3783 | out_unlock: |
3748 | task_rq_unlock(rq, p, &rf); | 3784 | task_rq_unlock(rq, p, &rf); |
3749 | } | 3785 | } |
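rt_mutex_setprio() and set_user_nice() above (and sched_setscheduler(), sched_setnuma() and sched_move_task() later in this diff) all converge on the same shape, with set_curr_task() now called after the re-enqueue instead of before it. A condensed sketch of the idiom; change_task_attribute() is a made-up name and the attribute update in the middle is elided:

	static void change_task_attribute(struct rq *rq, struct task_struct *p)
	{
		bool queued = task_on_rq_queued(p);
		bool running = task_current(rq, p);

		if (queued)
			dequeue_task(rq, p, DEQUEUE_SAVE);
		if (running)
			put_prev_task(rq, p);

		/* ... update priority, nice value, NUMA node or task group ... */

		if (queued)
			enqueue_task(rq, p, ENQUEUE_RESTORE);
		if (running)
			set_curr_task(rq, p);	/* after enqueue, per this series */
	}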
@@ -4243,8 +4279,6 @@ change: | |||
4243 | prev_class = p->sched_class; | 4279 | prev_class = p->sched_class; |
4244 | __setscheduler(rq, p, attr, pi); | 4280 | __setscheduler(rq, p, attr, pi); |
4245 | 4281 | ||
4246 | if (running) | ||
4247 | p->sched_class->set_curr_task(rq); | ||
4248 | if (queued) { | 4282 | if (queued) { |
4249 | /* | 4283 | /* |
4250 | * We enqueue to tail when the priority of a task is | 4284 | * We enqueue to tail when the priority of a task is |
@@ -4255,6 +4289,8 @@ change: | |||
4255 | 4289 | ||
4256 | enqueue_task(rq, p, queue_flags); | 4290 | enqueue_task(rq, p, queue_flags); |
4257 | } | 4291 | } |
4292 | if (running) | ||
4293 | set_curr_task(rq, p); | ||
4258 | 4294 | ||
4259 | check_class_changed(rq, p, prev_class, oldprio); | 4295 | check_class_changed(rq, p, prev_class, oldprio); |
4260 | preempt_disable(); /* avoid rq from going away on us */ | 4296 | preempt_disable(); /* avoid rq from going away on us */ |
@@ -4846,7 +4882,7 @@ SYSCALL_DEFINE0(sched_yield) | |||
4846 | { | 4882 | { |
4847 | struct rq *rq = this_rq_lock(); | 4883 | struct rq *rq = this_rq_lock(); |
4848 | 4884 | ||
4849 | schedstat_inc(rq, yld_count); | 4885 | schedstat_inc(rq->yld_count); |
4850 | current->sched_class->yield_task(rq); | 4886 | current->sched_class->yield_task(rq); |
4851 | 4887 | ||
4852 | /* | 4888 | /* |
@@ -4863,6 +4899,7 @@ SYSCALL_DEFINE0(sched_yield) | |||
4863 | return 0; | 4899 | return 0; |
4864 | } | 4900 | } |
4865 | 4901 | ||
4902 | #ifndef CONFIG_PREEMPT | ||
4866 | int __sched _cond_resched(void) | 4903 | int __sched _cond_resched(void) |
4867 | { | 4904 | { |
4868 | if (should_resched(0)) { | 4905 | if (should_resched(0)) { |
@@ -4872,6 +4909,7 @@ int __sched _cond_resched(void) | |||
4872 | return 0; | 4909 | return 0; |
4873 | } | 4910 | } |
4874 | EXPORT_SYMBOL(_cond_resched); | 4911 | EXPORT_SYMBOL(_cond_resched); |
4912 | #endif | ||
4875 | 4913 | ||
4876 | /* | 4914 | /* |
4877 | * __cond_resched_lock() - if a reschedule is pending, drop the given lock, | 4915 | * __cond_resched_lock() - if a reschedule is pending, drop the given lock, |
@@ -4997,7 +5035,7 @@ again: | |||
4997 | 5035 | ||
4998 | yielded = curr->sched_class->yield_to_task(rq, p, preempt); | 5036 | yielded = curr->sched_class->yield_to_task(rq, p, preempt); |
4999 | if (yielded) { | 5037 | if (yielded) { |
5000 | schedstat_inc(rq, yld_count); | 5038 | schedstat_inc(rq->yld_count); |
5001 | /* | 5039 | /* |
5002 | * Make p's CPU reschedule; pick_next_entity takes care of | 5040 | * Make p's CPU reschedule; pick_next_entity takes care of |
5003 | * fairness. | 5041 | * fairness. |
@@ -5154,21 +5192,14 @@ void sched_show_task(struct task_struct *p) | |||
5154 | int ppid; | 5192 | int ppid; |
5155 | unsigned long state = p->state; | 5193 | unsigned long state = p->state; |
5156 | 5194 | ||
5195 | if (!try_get_task_stack(p)) | ||
5196 | return; | ||
5157 | if (state) | 5197 | if (state) |
5158 | state = __ffs(state) + 1; | 5198 | state = __ffs(state) + 1; |
5159 | printk(KERN_INFO "%-15.15s %c", p->comm, | 5199 | printk(KERN_INFO "%-15.15s %c", p->comm, |
5160 | state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); | 5200 | state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); |
5161 | #if BITS_PER_LONG == 32 | ||
5162 | if (state == TASK_RUNNING) | ||
5163 | printk(KERN_CONT " running "); | ||
5164 | else | ||
5165 | printk(KERN_CONT " %08lx ", thread_saved_pc(p)); | ||
5166 | #else | ||
5167 | if (state == TASK_RUNNING) | 5201 | if (state == TASK_RUNNING) |
5168 | printk(KERN_CONT " running task "); | 5202 | printk(KERN_CONT " running task "); |
5169 | else | ||
5170 | printk(KERN_CONT " %016lx ", thread_saved_pc(p)); | ||
5171 | #endif | ||
5172 | #ifdef CONFIG_DEBUG_STACK_USAGE | 5203 | #ifdef CONFIG_DEBUG_STACK_USAGE |
5173 | free = stack_not_used(p); | 5204 | free = stack_not_used(p); |
5174 | #endif | 5205 | #endif |
@@ -5183,6 +5214,7 @@ void sched_show_task(struct task_struct *p) | |||
5183 | 5214 | ||
5184 | print_worker_info(KERN_INFO, p); | 5215 | print_worker_info(KERN_INFO, p); |
5185 | show_stack(p, NULL); | 5216 | show_stack(p, NULL); |
5217 | put_task_stack(p); | ||
5186 | } | 5218 | } |
5187 | 5219 | ||
5188 | void show_state_filter(unsigned long state_filter) | 5220 | void show_state_filter(unsigned long state_filter) |
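sched_show_task() above now pins the stack before dumping it and releases it afterwards, matching the put_task_stack() added to finish_task_switch() earlier: with separately managed stacks, the stack can be freed before the task_struct itself. A minimal sketch of the calling pattern, with dump_blocked_task() as a hypothetical caller:

	static void dump_blocked_task(struct task_struct *p)
	{
		if (!try_get_task_stack(p))	/* stack already freed: nothing to dump */
			return;

		show_stack(p, NULL);		/* stack pinned, cannot vanish here */

		put_task_stack(p);		/* may free it if we held the last reference */
	}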
@@ -5417,10 +5449,10 @@ void sched_setnuma(struct task_struct *p, int nid) | |||
5417 | 5449 | ||
5418 | p->numa_preferred_nid = nid; | 5450 | p->numa_preferred_nid = nid; |
5419 | 5451 | ||
5420 | if (running) | ||
5421 | p->sched_class->set_curr_task(rq); | ||
5422 | if (queued) | 5452 | if (queued) |
5423 | enqueue_task(rq, p, ENQUEUE_RESTORE); | 5453 | enqueue_task(rq, p, ENQUEUE_RESTORE); |
5454 | if (running) | ||
5455 | set_curr_task(rq, p); | ||
5424 | task_rq_unlock(rq, p, &rf); | 5456 | task_rq_unlock(rq, p, &rf); |
5425 | } | 5457 | } |
5426 | #endif /* CONFIG_NUMA_BALANCING */ | 5458 | #endif /* CONFIG_NUMA_BALANCING */ |
@@ -5717,6 +5749,8 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
5717 | } | 5749 | } |
5718 | } | 5750 | } |
5719 | #else /* !CONFIG_SCHED_DEBUG */ | 5751 | #else /* !CONFIG_SCHED_DEBUG */ |
5752 | |||
5753 | # define sched_debug_enabled 0 | ||
5720 | # define sched_domain_debug(sd, cpu) do { } while (0) | 5754 | # define sched_domain_debug(sd, cpu) do { } while (0) |
5721 | static inline bool sched_debug(void) | 5755 | static inline bool sched_debug(void) |
5722 | { | 5756 | { |
@@ -5735,6 +5769,7 @@ static int sd_degenerate(struct sched_domain *sd) | |||
5735 | SD_BALANCE_FORK | | 5769 | SD_BALANCE_FORK | |
5736 | SD_BALANCE_EXEC | | 5770 | SD_BALANCE_EXEC | |
5737 | SD_SHARE_CPUCAPACITY | | 5771 | SD_SHARE_CPUCAPACITY | |
5772 | SD_ASYM_CPUCAPACITY | | ||
5738 | SD_SHARE_PKG_RESOURCES | | 5773 | SD_SHARE_PKG_RESOURCES | |
5739 | SD_SHARE_POWERDOMAIN)) { | 5774 | SD_SHARE_POWERDOMAIN)) { |
5740 | if (sd->groups != sd->groups->next) | 5775 | if (sd->groups != sd->groups->next) |
@@ -5765,6 +5800,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) | |||
5765 | SD_BALANCE_NEWIDLE | | 5800 | SD_BALANCE_NEWIDLE | |
5766 | SD_BALANCE_FORK | | 5801 | SD_BALANCE_FORK | |
5767 | SD_BALANCE_EXEC | | 5802 | SD_BALANCE_EXEC | |
5803 | SD_ASYM_CPUCAPACITY | | ||
5768 | SD_SHARE_CPUCAPACITY | | 5804 | SD_SHARE_CPUCAPACITY | |
5769 | SD_SHARE_PKG_RESOURCES | | 5805 | SD_SHARE_PKG_RESOURCES | |
5770 | SD_PREFER_SIBLING | | 5806 | SD_PREFER_SIBLING | |
@@ -5909,10 +5945,8 @@ static void free_sched_groups(struct sched_group *sg, int free_sgc) | |||
5909 | } while (sg != first); | 5945 | } while (sg != first); |
5910 | } | 5946 | } |
5911 | 5947 | ||
5912 | static void free_sched_domain(struct rcu_head *rcu) | 5948 | static void destroy_sched_domain(struct sched_domain *sd) |
5913 | { | 5949 | { |
5914 | struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); | ||
5915 | |||
5916 | /* | 5950 | /* |
5917 | * If its an overlapping domain it has private groups, iterate and | 5951 | * If its an overlapping domain it has private groups, iterate and |
5918 | * nuke them all. | 5952 | * nuke them all. |
@@ -5923,18 +5957,26 @@ static void free_sched_domain(struct rcu_head *rcu) | |||
5923 | kfree(sd->groups->sgc); | 5957 | kfree(sd->groups->sgc); |
5924 | kfree(sd->groups); | 5958 | kfree(sd->groups); |
5925 | } | 5959 | } |
5960 | if (sd->shared && atomic_dec_and_test(&sd->shared->ref)) | ||
5961 | kfree(sd->shared); | ||
5926 | kfree(sd); | 5962 | kfree(sd); |
5927 | } | 5963 | } |
5928 | 5964 | ||
5929 | static void destroy_sched_domain(struct sched_domain *sd, int cpu) | 5965 | static void destroy_sched_domains_rcu(struct rcu_head *rcu) |
5930 | { | 5966 | { |
5931 | call_rcu(&sd->rcu, free_sched_domain); | 5967 | struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); |
5968 | |||
5969 | while (sd) { | ||
5970 | struct sched_domain *parent = sd->parent; | ||
5971 | destroy_sched_domain(sd); | ||
5972 | sd = parent; | ||
5973 | } | ||
5932 | } | 5974 | } |
5933 | 5975 | ||
5934 | static void destroy_sched_domains(struct sched_domain *sd, int cpu) | 5976 | static void destroy_sched_domains(struct sched_domain *sd) |
5935 | { | 5977 | { |
5936 | for (; sd; sd = sd->parent) | 5978 | if (sd) |
5937 | destroy_sched_domain(sd, cpu); | 5979 | call_rcu(&sd->rcu, destroy_sched_domains_rcu); |
5938 | } | 5980 | } |
5939 | 5981 | ||
5940 | /* | 5982 | /* |
@@ -5949,14 +5991,14 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu) | |||
5949 | DEFINE_PER_CPU(struct sched_domain *, sd_llc); | 5991 | DEFINE_PER_CPU(struct sched_domain *, sd_llc); |
5950 | DEFINE_PER_CPU(int, sd_llc_size); | 5992 | DEFINE_PER_CPU(int, sd_llc_size); |
5951 | DEFINE_PER_CPU(int, sd_llc_id); | 5993 | DEFINE_PER_CPU(int, sd_llc_id); |
5994 | DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); | ||
5952 | DEFINE_PER_CPU(struct sched_domain *, sd_numa); | 5995 | DEFINE_PER_CPU(struct sched_domain *, sd_numa); |
5953 | DEFINE_PER_CPU(struct sched_domain *, sd_busy); | ||
5954 | DEFINE_PER_CPU(struct sched_domain *, sd_asym); | 5996 | DEFINE_PER_CPU(struct sched_domain *, sd_asym); |
5955 | 5997 | ||
5956 | static void update_top_cache_domain(int cpu) | 5998 | static void update_top_cache_domain(int cpu) |
5957 | { | 5999 | { |
6000 | struct sched_domain_shared *sds = NULL; | ||
5958 | struct sched_domain *sd; | 6001 | struct sched_domain *sd; |
5959 | struct sched_domain *busy_sd = NULL; | ||
5960 | int id = cpu; | 6002 | int id = cpu; |
5961 | int size = 1; | 6003 | int size = 1; |
5962 | 6004 | ||
@@ -5964,13 +6006,13 @@ static void update_top_cache_domain(int cpu) | |||
5964 | if (sd) { | 6006 | if (sd) { |
5965 | id = cpumask_first(sched_domain_span(sd)); | 6007 | id = cpumask_first(sched_domain_span(sd)); |
5966 | size = cpumask_weight(sched_domain_span(sd)); | 6008 | size = cpumask_weight(sched_domain_span(sd)); |
5967 | busy_sd = sd->parent; /* sd_busy */ | 6009 | sds = sd->shared; |
5968 | } | 6010 | } |
5969 | rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd); | ||
5970 | 6011 | ||
5971 | rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); | 6012 | rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); |
5972 | per_cpu(sd_llc_size, cpu) = size; | 6013 | per_cpu(sd_llc_size, cpu) = size; |
5973 | per_cpu(sd_llc_id, cpu) = id; | 6014 | per_cpu(sd_llc_id, cpu) = id; |
6015 | rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds); | ||
5974 | 6016 | ||
5975 | sd = lowest_flag_domain(cpu, SD_NUMA); | 6017 | sd = lowest_flag_domain(cpu, SD_NUMA); |
5976 | rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); | 6018 | rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); |
@@ -6006,7 +6048,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | |||
6006 | */ | 6048 | */ |
6007 | if (parent->flags & SD_PREFER_SIBLING) | 6049 | if (parent->flags & SD_PREFER_SIBLING) |
6008 | tmp->flags |= SD_PREFER_SIBLING; | 6050 | tmp->flags |= SD_PREFER_SIBLING; |
6009 | destroy_sched_domain(parent, cpu); | 6051 | destroy_sched_domain(parent); |
6010 | } else | 6052 | } else |
6011 | tmp = tmp->parent; | 6053 | tmp = tmp->parent; |
6012 | } | 6054 | } |
@@ -6014,7 +6056,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | |||
6014 | if (sd && sd_degenerate(sd)) { | 6056 | if (sd && sd_degenerate(sd)) { |
6015 | tmp = sd; | 6057 | tmp = sd; |
6016 | sd = sd->parent; | 6058 | sd = sd->parent; |
6017 | destroy_sched_domain(tmp, cpu); | 6059 | destroy_sched_domain(tmp); |
6018 | if (sd) | 6060 | if (sd) |
6019 | sd->child = NULL; | 6061 | sd->child = NULL; |
6020 | } | 6062 | } |
@@ -6024,7 +6066,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | |||
6024 | rq_attach_root(rq, rd); | 6066 | rq_attach_root(rq, rd); |
6025 | tmp = rq->sd; | 6067 | tmp = rq->sd; |
6026 | rcu_assign_pointer(rq->sd, sd); | 6068 | rcu_assign_pointer(rq->sd, sd); |
6027 | destroy_sched_domains(tmp, cpu); | 6069 | destroy_sched_domains(tmp); |
6028 | 6070 | ||
6029 | update_top_cache_domain(cpu); | 6071 | update_top_cache_domain(cpu); |
6030 | } | 6072 | } |
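With the rework above, destroy_sched_domain() is a plain synchronous free and a single call_rcu() callback walks the whole parent chain, recovering each domain from its embedded rcu_head with container_of(). A self-contained user-space sketch of that recover-and-walk pattern; sched_domain_like and the printf stand in for the real structures and kfree():

	#include <stdio.h>
	#include <stddef.h>

	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	struct rcu_head { void *next; };

	struct sched_domain_like {
		int level;
		struct sched_domain_like *parent;
		struct rcu_head rcu;
	};

	/* Same shape as destroy_sched_domains_rcu(): recover the enclosing
	 * object from the embedded rcu_head, then walk and release the chain. */
	static void destroy_chain_rcu(struct rcu_head *rcu)
	{
		struct sched_domain_like *sd =
			container_of(rcu, struct sched_domain_like, rcu);

		while (sd) {
			struct sched_domain_like *parent = sd->parent;
			printf("destroying level %d\n", sd->level);	/* kfree(sd) in the kernel */
			sd = parent;
		}
	}

	int main(void)
	{
		struct sched_domain_like top = { .level = 1, .parent = NULL };
		struct sched_domain_like base = { .level = 0, .parent = &top };

		destroy_chain_rcu(&base.rcu);	/* the kernel defers this via call_rcu() */
		return 0;
	}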
@@ -6267,7 +6309,6 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) | |||
6267 | return; | 6309 | return; |
6268 | 6310 | ||
6269 | update_group_capacity(sd, cpu); | 6311 | update_group_capacity(sd, cpu); |
6270 | atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight); | ||
6271 | } | 6312 | } |
6272 | 6313 | ||
6273 | /* | 6314 | /* |
@@ -6355,6 +6396,9 @@ static void claim_allocations(int cpu, struct sched_domain *sd) | |||
6355 | WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); | 6396 | WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); |
6356 | *per_cpu_ptr(sdd->sd, cpu) = NULL; | 6397 | *per_cpu_ptr(sdd->sd, cpu) = NULL; |
6357 | 6398 | ||
6399 | if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref)) | ||
6400 | *per_cpu_ptr(sdd->sds, cpu) = NULL; | ||
6401 | |||
6358 | if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) | 6402 | if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) |
6359 | *per_cpu_ptr(sdd->sg, cpu) = NULL; | 6403 | *per_cpu_ptr(sdd->sg, cpu) = NULL; |
6360 | 6404 | ||
@@ -6374,26 +6418,37 @@ static int sched_domains_curr_level; | |||
6374 | /* | 6418 | /* |
6375 | * SD_flags allowed in topology descriptions. | 6419 | * SD_flags allowed in topology descriptions. |
6376 | * | 6420 | * |
6377 | * SD_SHARE_CPUCAPACITY - describes SMT topologies | 6421 | * These flags are purely descriptive of the topology and do not prescribe |
6378 | * SD_SHARE_PKG_RESOURCES - describes shared caches | 6422 | * behaviour. Behaviour is artificial and mapped in the below sd_init() |
6379 | * SD_NUMA - describes NUMA topologies | 6423 | * function: |
6380 | * SD_SHARE_POWERDOMAIN - describes shared power domain | 6424 | * |
6425 | * SD_SHARE_CPUCAPACITY - describes SMT topologies | ||
6426 | * SD_SHARE_PKG_RESOURCES - describes shared caches | ||
6427 | * SD_NUMA - describes NUMA topologies | ||
6428 | * SD_SHARE_POWERDOMAIN - describes shared power domain | ||
6429 | * SD_ASYM_CPUCAPACITY - describes mixed capacity topologies | ||
6381 | * | 6430 | * |
6382 | * Odd one out: | 6431 | * Odd one out, which beside describing the topology has a quirk also |
6383 | * SD_ASYM_PACKING - describes SMT quirks | 6432 | * prescribes the desired behaviour that goes along with it: |
6433 | * | ||
6434 | * SD_ASYM_PACKING - describes SMT quirks | ||
6384 | */ | 6435 | */ |
6385 | #define TOPOLOGY_SD_FLAGS \ | 6436 | #define TOPOLOGY_SD_FLAGS \ |
6386 | (SD_SHARE_CPUCAPACITY | \ | 6437 | (SD_SHARE_CPUCAPACITY | \ |
6387 | SD_SHARE_PKG_RESOURCES | \ | 6438 | SD_SHARE_PKG_RESOURCES | \ |
6388 | SD_NUMA | \ | 6439 | SD_NUMA | \ |
6389 | SD_ASYM_PACKING | \ | 6440 | SD_ASYM_PACKING | \ |
6441 | SD_ASYM_CPUCAPACITY | \ | ||
6390 | SD_SHARE_POWERDOMAIN) | 6442 | SD_SHARE_POWERDOMAIN) |
6391 | 6443 | ||
6392 | static struct sched_domain * | 6444 | static struct sched_domain * |
6393 | sd_init(struct sched_domain_topology_level *tl, int cpu) | 6445 | sd_init(struct sched_domain_topology_level *tl, |
6446 | const struct cpumask *cpu_map, | ||
6447 | struct sched_domain *child, int cpu) | ||
6394 | { | 6448 | { |
6395 | struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); | 6449 | struct sd_data *sdd = &tl->data; |
6396 | int sd_weight, sd_flags = 0; | 6450 | struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); |
6451 | int sd_id, sd_weight, sd_flags = 0; | ||
6397 | 6452 | ||
6398 | #ifdef CONFIG_NUMA | 6453 | #ifdef CONFIG_NUMA |
6399 | /* | 6454 | /* |
@@ -6442,15 +6497,26 @@ sd_init(struct sched_domain_topology_level *tl, int cpu) | |||
6442 | .smt_gain = 0, | 6497 | .smt_gain = 0, |
6443 | .max_newidle_lb_cost = 0, | 6498 | .max_newidle_lb_cost = 0, |
6444 | .next_decay_max_lb_cost = jiffies, | 6499 | .next_decay_max_lb_cost = jiffies, |
6500 | .child = child, | ||
6445 | #ifdef CONFIG_SCHED_DEBUG | 6501 | #ifdef CONFIG_SCHED_DEBUG |
6446 | .name = tl->name, | 6502 | .name = tl->name, |
6447 | #endif | 6503 | #endif |
6448 | }; | 6504 | }; |
6449 | 6505 | ||
6506 | cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); | ||
6507 | sd_id = cpumask_first(sched_domain_span(sd)); | ||
6508 | |||
6450 | /* | 6509 | /* |
6451 | * Convert topological properties into behaviour. | 6510 | * Convert topological properties into behaviour. |
6452 | */ | 6511 | */ |
6453 | 6512 | ||
6513 | if (sd->flags & SD_ASYM_CPUCAPACITY) { | ||
6514 | struct sched_domain *t = sd; | ||
6515 | |||
6516 | for_each_lower_domain(t) | ||
6517 | t->flags |= SD_BALANCE_WAKE; | ||
6518 | } | ||
6519 | |||
6454 | if (sd->flags & SD_SHARE_CPUCAPACITY) { | 6520 | if (sd->flags & SD_SHARE_CPUCAPACITY) { |
6455 | sd->flags |= SD_PREFER_SIBLING; | 6521 | sd->flags |= SD_PREFER_SIBLING; |
6456 | sd->imbalance_pct = 110; | 6522 | sd->imbalance_pct = 110; |
@@ -6482,7 +6548,17 @@ sd_init(struct sched_domain_topology_level *tl, int cpu) | |||
6482 | sd->idle_idx = 1; | 6548 | sd->idle_idx = 1; |
6483 | } | 6549 | } |
6484 | 6550 | ||
6485 | sd->private = &tl->data; | 6551 | /* |
6552 | * For all levels sharing cache; connect a sched_domain_shared | ||
6553 | * instance. | ||
6554 | */ | ||
6555 | if (sd->flags & SD_SHARE_PKG_RESOURCES) { | ||
6556 | sd->shared = *per_cpu_ptr(sdd->sds, sd_id); | ||
6557 | atomic_inc(&sd->shared->ref); | ||
6558 | atomic_set(&sd->shared->nr_busy_cpus, sd_weight); | ||
6559 | } | ||
6560 | |||
6561 | sd->private = sdd; | ||
6486 | 6562 | ||
6487 | return sd; | 6563 | return sd; |
6488 | } | 6564 | } |
@@ -6509,6 +6585,9 @@ static struct sched_domain_topology_level *sched_domain_topology = | |||
6509 | 6585 | ||
6510 | void set_sched_topology(struct sched_domain_topology_level *tl) | 6586 | void set_sched_topology(struct sched_domain_topology_level *tl) |
6511 | { | 6587 | { |
6588 | if (WARN_ON_ONCE(sched_smp_initialized)) | ||
6589 | return; | ||
6590 | |||
6512 | sched_domain_topology = tl; | 6591 | sched_domain_topology = tl; |
6513 | } | 6592 | } |
6514 | 6593 | ||
@@ -6789,6 +6868,10 @@ static int __sdt_alloc(const struct cpumask *cpu_map) | |||
6789 | if (!sdd->sd) | 6868 | if (!sdd->sd) |
6790 | return -ENOMEM; | 6869 | return -ENOMEM; |
6791 | 6870 | ||
6871 | sdd->sds = alloc_percpu(struct sched_domain_shared *); | ||
6872 | if (!sdd->sds) | ||
6873 | return -ENOMEM; | ||
6874 | |||
6792 | sdd->sg = alloc_percpu(struct sched_group *); | 6875 | sdd->sg = alloc_percpu(struct sched_group *); |
6793 | if (!sdd->sg) | 6876 | if (!sdd->sg) |
6794 | return -ENOMEM; | 6877 | return -ENOMEM; |
@@ -6799,6 +6882,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map) | |||
6799 | 6882 | ||
6800 | for_each_cpu(j, cpu_map) { | 6883 | for_each_cpu(j, cpu_map) { |
6801 | struct sched_domain *sd; | 6884 | struct sched_domain *sd; |
6885 | struct sched_domain_shared *sds; | ||
6802 | struct sched_group *sg; | 6886 | struct sched_group *sg; |
6803 | struct sched_group_capacity *sgc; | 6887 | struct sched_group_capacity *sgc; |
6804 | 6888 | ||
@@ -6809,6 +6893,13 @@ static int __sdt_alloc(const struct cpumask *cpu_map) | |||
6809 | 6893 | ||
6810 | *per_cpu_ptr(sdd->sd, j) = sd; | 6894 | *per_cpu_ptr(sdd->sd, j) = sd; |
6811 | 6895 | ||
6896 | sds = kzalloc_node(sizeof(struct sched_domain_shared), | ||
6897 | GFP_KERNEL, cpu_to_node(j)); | ||
6898 | if (!sds) | ||
6899 | return -ENOMEM; | ||
6900 | |||
6901 | *per_cpu_ptr(sdd->sds, j) = sds; | ||
6902 | |||
6812 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), | 6903 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), |
6813 | GFP_KERNEL, cpu_to_node(j)); | 6904 | GFP_KERNEL, cpu_to_node(j)); |
6814 | if (!sg) | 6905 | if (!sg) |
@@ -6848,6 +6939,8 @@ static void __sdt_free(const struct cpumask *cpu_map) | |||
6848 | kfree(*per_cpu_ptr(sdd->sd, j)); | 6939 | kfree(*per_cpu_ptr(sdd->sd, j)); |
6849 | } | 6940 | } |
6850 | 6941 | ||
6942 | if (sdd->sds) | ||
6943 | kfree(*per_cpu_ptr(sdd->sds, j)); | ||
6851 | if (sdd->sg) | 6944 | if (sdd->sg) |
6852 | kfree(*per_cpu_ptr(sdd->sg, j)); | 6945 | kfree(*per_cpu_ptr(sdd->sg, j)); |
6853 | if (sdd->sgc) | 6946 | if (sdd->sgc) |
@@ -6855,6 +6948,8 @@ static void __sdt_free(const struct cpumask *cpu_map) | |||
6855 | } | 6948 | } |
6856 | free_percpu(sdd->sd); | 6949 | free_percpu(sdd->sd); |
6857 | sdd->sd = NULL; | 6950 | sdd->sd = NULL; |
6951 | free_percpu(sdd->sds); | ||
6952 | sdd->sds = NULL; | ||
6858 | free_percpu(sdd->sg); | 6953 | free_percpu(sdd->sg); |
6859 | sdd->sg = NULL; | 6954 | sdd->sg = NULL; |
6860 | free_percpu(sdd->sgc); | 6955 | free_percpu(sdd->sgc); |
@@ -6866,16 +6961,12 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, | |||
6866 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | 6961 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, |
6867 | struct sched_domain *child, int cpu) | 6962 | struct sched_domain *child, int cpu) |
6868 | { | 6963 | { |
6869 | struct sched_domain *sd = sd_init(tl, cpu); | 6964 | struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu); |
6870 | if (!sd) | ||
6871 | return child; | ||
6872 | 6965 | ||
6873 | cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); | ||
6874 | if (child) { | 6966 | if (child) { |
6875 | sd->level = child->level + 1; | 6967 | sd->level = child->level + 1; |
6876 | sched_domain_level_max = max(sched_domain_level_max, sd->level); | 6968 | sched_domain_level_max = max(sched_domain_level_max, sd->level); |
6877 | child->parent = sd; | 6969 | child->parent = sd; |
6878 | sd->child = child; | ||
6879 | 6970 | ||
6880 | if (!cpumask_subset(sched_domain_span(child), | 6971 | if (!cpumask_subset(sched_domain_span(child), |
6881 | sched_domain_span(sd))) { | 6972 | sched_domain_span(sd))) { |
@@ -6906,6 +6997,7 @@ static int build_sched_domains(const struct cpumask *cpu_map, | |||
6906 | enum s_alloc alloc_state; | 6997 | enum s_alloc alloc_state; |
6907 | struct sched_domain *sd; | 6998 | struct sched_domain *sd; |
6908 | struct s_data d; | 6999 | struct s_data d; |
7000 | struct rq *rq = NULL; | ||
6909 | int i, ret = -ENOMEM; | 7001 | int i, ret = -ENOMEM; |
6910 | 7002 | ||
6911 | alloc_state = __visit_domain_allocation_hell(&d, cpu_map); | 7003 | alloc_state = __visit_domain_allocation_hell(&d, cpu_map); |
@@ -6956,11 +7048,22 @@ static int build_sched_domains(const struct cpumask *cpu_map, | |||
6956 | /* Attach the domains */ | 7048 | /* Attach the domains */ |
6957 | rcu_read_lock(); | 7049 | rcu_read_lock(); |
6958 | for_each_cpu(i, cpu_map) { | 7050 | for_each_cpu(i, cpu_map) { |
7051 | rq = cpu_rq(i); | ||
6959 | sd = *per_cpu_ptr(d.sd, i); | 7052 | sd = *per_cpu_ptr(d.sd, i); |
7053 | |||
7054 | /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */ | ||
7055 | if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity)) | ||
7056 | WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig); | ||
7057 | |||
6960 | cpu_attach_domain(sd, d.rd, i); | 7058 | cpu_attach_domain(sd, d.rd, i); |
6961 | } | 7059 | } |
6962 | rcu_read_unlock(); | 7060 | rcu_read_unlock(); |
6963 | 7061 | ||
7062 | if (rq && sched_debug_enabled) { | ||
7063 | pr_info("span: %*pbl (max cpu_capacity = %lu)\n", | ||
7064 | cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); | ||
7065 | } | ||
7066 | |||
6964 | ret = 0; | 7067 | ret = 0; |
6965 | error: | 7068 | error: |
6966 | __free_domain_allocs(&d, alloc_state, cpu_map); | 7069 | __free_domain_allocs(&d, alloc_state, cpu_map); |
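The max_cpu_capacity tracking above uses READ_ONCE()/WRITE_ONCE() so concurrent observers cannot see torn or compiler-cached values. A user-space sketch of the same lockless monotonic-maximum shape; the two macros here are crude volatile-cast stand-ins for the kernel's versions:

	#include <stdio.h>

	#define READ_ONCE(x)		(*(volatile __typeof__(x) *)&(x))
	#define WRITE_ONCE(x, val)	(*(volatile __typeof__(x) *)&(x) = (val))

	static unsigned long max_cpu_capacity;

	static void note_capacity(unsigned long cap)
	{
		/* Same shape as the hunk above: single load, single store. */
		if (cap > READ_ONCE(max_cpu_capacity))
			WRITE_ONCE(max_cpu_capacity, cap);
	}

	int main(void)
	{
		unsigned long caps[] = { 446, 1024, 871 };

		for (int i = 0; i < 3; i++)
			note_capacity(caps[i]);
		printf("max_cpu_capacity = %lu\n", max_cpu_capacity);
		return 0;
	}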
@@ -7319,6 +7422,22 @@ int sched_cpu_dying(unsigned int cpu) | |||
7319 | } | 7422 | } |
7320 | #endif | 7423 | #endif |
7321 | 7424 | ||
7425 | #ifdef CONFIG_SCHED_SMT | ||
7426 | DEFINE_STATIC_KEY_FALSE(sched_smt_present); | ||
7427 | |||
7428 | static void sched_init_smt(void) | ||
7429 | { | ||
7430 | /* | ||
7431 | * We've enumerated all CPUs and will assume that if any CPU | ||
7432 | * has SMT siblings, CPU0 will too. | ||
7433 | */ | ||
7434 | if (cpumask_weight(cpu_smt_mask(0)) > 1) | ||
7435 | static_branch_enable(&sched_smt_present); | ||
7436 | } | ||
7437 | #else | ||
7438 | static inline void sched_init_smt(void) { } | ||
7439 | #endif | ||
7440 | |||
7322 | void __init sched_init_smp(void) | 7441 | void __init sched_init_smp(void) |
7323 | { | 7442 | { |
7324 | cpumask_var_t non_isolated_cpus; | 7443 | cpumask_var_t non_isolated_cpus; |
@@ -7348,6 +7467,9 @@ void __init sched_init_smp(void) | |||
7348 | 7467 | ||
7349 | init_sched_rt_class(); | 7468 | init_sched_rt_class(); |
7350 | init_sched_dl_class(); | 7469 | init_sched_dl_class(); |
7470 | |||
7471 | sched_init_smt(); | ||
7472 | |||
7351 | sched_smp_initialized = true; | 7473 | sched_smp_initialized = true; |
7352 | } | 7474 | } |
7353 | 7475 | ||
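sched_init_smt() above enables a static key once all CPUs have been enumerated; code that only matters on SMT machines can then guard itself with a static branch that is patched to a plain jump at runtime, so non-SMT systems pay nothing for it. A hedged sketch of a consumer; scan_smt_siblings() is a made-up name, only sched_smt_present and static_branch_likely() come from this series:

	static int scan_smt_siblings(int cpu)
	{
		if (!static_branch_likely(&sched_smt_present))
			return -1;	/* no SMT anywhere: skip the whole scan */

		/* ... walk cpu_smt_mask(cpu) looking for an idle hardware thread ... */
		return -1;
	}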
@@ -7385,12 +7507,29 @@ static struct kmem_cache *task_group_cache __read_mostly; | |||
7385 | #endif | 7507 | #endif |
7386 | 7508 | ||
7387 | DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); | 7509 | DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); |
7510 | DECLARE_PER_CPU(cpumask_var_t, select_idle_mask); | ||
7511 | |||
7512 | #define WAIT_TABLE_BITS 8 | ||
7513 | #define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS) | ||
7514 | static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned; | ||
7515 | |||
7516 | wait_queue_head_t *bit_waitqueue(void *word, int bit) | ||
7517 | { | ||
7518 | const int shift = BITS_PER_LONG == 32 ? 5 : 6; | ||
7519 | unsigned long val = (unsigned long)word << shift | bit; | ||
7520 | |||
7521 | return bit_wait_table + hash_long(val, WAIT_TABLE_BITS); | ||
7522 | } | ||
7523 | EXPORT_SYMBOL(bit_waitqueue); | ||
7388 | 7524 | ||
7389 | void __init sched_init(void) | 7525 | void __init sched_init(void) |
7390 | { | 7526 | { |
7391 | int i, j; | 7527 | int i, j; |
7392 | unsigned long alloc_size = 0, ptr; | 7528 | unsigned long alloc_size = 0, ptr; |
7393 | 7529 | ||
7530 | for (i = 0; i < WAIT_TABLE_SIZE; i++) | ||
7531 | init_waitqueue_head(bit_wait_table + i); | ||
7532 | |||
7394 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7533 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7395 | alloc_size += 2 * nr_cpu_ids * sizeof(void **); | 7534 | alloc_size += 2 * nr_cpu_ids * sizeof(void **); |
7396 | #endif | 7535 | #endif |
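The bit_waitqueue() moved into this file above mixes the word address with the bit number and hashes the result into a small fixed table, so all waiters and wakers for one (word, bit) pair meet on the same wait queue head. A runnable sketch of the slot calculation; hash_to_bits() is a crude stand-in for the kernel's hash_long():

	#include <stdio.h>

	#define WAIT_TABLE_BITS	8
	#define WAIT_TABLE_SIZE	(1 << WAIT_TABLE_BITS)

	/* Stand-in for hash_long(): a 64-bit multiplicative hash. */
	static unsigned int hash_to_bits(unsigned long long val, unsigned int bits)
	{
		return (unsigned int)((val * 0x61C8864680B583EBULL) >> (64 - bits));
	}

	/* Same shape as the new bit_waitqueue(): fold the word address and
	 * the bit number together, then hash into the fixed table. */
	static unsigned int bit_wait_slot(void *word, int bit)
	{
		const int shift = sizeof(long) == 4 ? 5 : 6;
		unsigned long val = (unsigned long)word << shift | bit;

		return hash_to_bits(val, WAIT_TABLE_BITS);
	}

	int main(void)
	{
		long flags = 0;

		printf("bit 0 -> slot %u of %d\n", bit_wait_slot(&flags, 0), WAIT_TABLE_SIZE);
		printf("bit 1 -> slot %u of %d\n", bit_wait_slot(&flags, 1), WAIT_TABLE_SIZE);
		return 0;
	}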
@@ -7421,6 +7560,8 @@ void __init sched_init(void) | |||
7421 | for_each_possible_cpu(i) { | 7560 | for_each_possible_cpu(i) { |
7422 | per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node( | 7561 | per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node( |
7423 | cpumask_size(), GFP_KERNEL, cpu_to_node(i)); | 7562 | cpumask_size(), GFP_KERNEL, cpu_to_node(i)); |
7563 | per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node( | ||
7564 | cpumask_size(), GFP_KERNEL, cpu_to_node(i)); | ||
7424 | } | 7565 | } |
7425 | #endif /* CONFIG_CPUMASK_OFFSTACK */ | 7566 | #endif /* CONFIG_CPUMASK_OFFSTACK */ |
7426 | 7567 | ||
@@ -7523,10 +7664,6 @@ void __init sched_init(void) | |||
7523 | 7664 | ||
7524 | set_load_weight(&init_task); | 7665 | set_load_weight(&init_task); |
7525 | 7666 | ||
7526 | #ifdef CONFIG_PREEMPT_NOTIFIERS | ||
7527 | INIT_HLIST_HEAD(&init_task.preempt_notifiers); | ||
7528 | #endif | ||
7529 | |||
7530 | /* | 7667 | /* |
7531 | * The boot idle thread does lazy MMU switching as well: | 7668 | * The boot idle thread does lazy MMU switching as well: |
7532 | */ | 7669 | */ |
@@ -7534,11 +7671,6 @@ void __init sched_init(void) | |||
7534 | enter_lazy_tlb(&init_mm, current); | 7671 | enter_lazy_tlb(&init_mm, current); |
7535 | 7672 | ||
7536 | /* | 7673 | /* |
7537 | * During early bootup we pretend to be a normal task: | ||
7538 | */ | ||
7539 | current->sched_class = &fair_sched_class; | ||
7540 | |||
7541 | /* | ||
7542 | * Make us the idle thread. Technically, schedule() should not be | 7674 | * Make us the idle thread. Technically, schedule() should not be |
7543 | * called from this thread, however somewhere below it might be, | 7675 | * called from this thread, however somewhere below it might be, |
7544 | * but because we are the idle thread, we just pick up running again | 7676 | * but because we are the idle thread, we just pick up running again |
@@ -7592,6 +7724,7 @@ EXPORT_SYMBOL(__might_sleep); | |||
7592 | void ___might_sleep(const char *file, int line, int preempt_offset) | 7724 | void ___might_sleep(const char *file, int line, int preempt_offset) |
7593 | { | 7725 | { |
7594 | static unsigned long prev_jiffy; /* ratelimiting */ | 7726 | static unsigned long prev_jiffy; /* ratelimiting */ |
7727 | unsigned long preempt_disable_ip; | ||
7595 | 7728 | ||
7596 | rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ | 7729 | rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ |
7597 | if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && | 7730 | if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && |
@@ -7602,6 +7735,9 @@ void ___might_sleep(const char *file, int line, int preempt_offset) | |||
7602 | return; | 7735 | return; |
7603 | prev_jiffy = jiffies; | 7736 | prev_jiffy = jiffies; |
7604 | 7737 | ||
7738 | /* Save this before calling printk(), since that will clobber it */ | ||
7739 | preempt_disable_ip = get_preempt_disable_ip(current); | ||
7740 | |||
7605 | printk(KERN_ERR | 7741 | printk(KERN_ERR |
7606 | "BUG: sleeping function called from invalid context at %s:%d\n", | 7742 | "BUG: sleeping function called from invalid context at %s:%d\n", |
7607 | file, line); | 7743 | file, line); |
@@ -7616,14 +7752,14 @@ void ___might_sleep(const char *file, int line, int preempt_offset) | |||
7616 | debug_show_held_locks(current); | 7752 | debug_show_held_locks(current); |
7617 | if (irqs_disabled()) | 7753 | if (irqs_disabled()) |
7618 | print_irqtrace_events(current); | 7754 | print_irqtrace_events(current); |
7619 | #ifdef CONFIG_DEBUG_PREEMPT | 7755 | if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) |
7620 | if (!preempt_count_equals(preempt_offset)) { | 7756 | && !preempt_count_equals(preempt_offset)) { |
7621 | pr_err("Preemption disabled at:"); | 7757 | pr_err("Preemption disabled at:"); |
7622 | print_ip_sym(current->preempt_disable_ip); | 7758 | print_ip_sym(preempt_disable_ip); |
7623 | pr_cont("\n"); | 7759 | pr_cont("\n"); |
7624 | } | 7760 | } |
7625 | #endif | ||
7626 | dump_stack(); | 7761 | dump_stack(); |
7762 | add_taint(TAINT_WARN, LOCKDEP_STILL_OK); | ||
7627 | } | 7763 | } |
7628 | EXPORT_SYMBOL(___might_sleep); | 7764 | EXPORT_SYMBOL(___might_sleep); |
7629 | #endif | 7765 | #endif |
@@ -7644,12 +7780,10 @@ void normalize_rt_tasks(void) | |||
7644 | if (p->flags & PF_KTHREAD) | 7780 | if (p->flags & PF_KTHREAD) |
7645 | continue; | 7781 | continue; |
7646 | 7782 | ||
7647 | p->se.exec_start = 0; | 7783 | p->se.exec_start = 0; |
7648 | #ifdef CONFIG_SCHEDSTATS | 7784 | schedstat_set(p->se.statistics.wait_start, 0); |
7649 | p->se.statistics.wait_start = 0; | 7785 | schedstat_set(p->se.statistics.sleep_start, 0); |
7650 | p->se.statistics.sleep_start = 0; | 7786 | schedstat_set(p->se.statistics.block_start, 0); |
7651 | p->se.statistics.block_start = 0; | ||
7652 | #endif | ||
7653 | 7787 | ||
7654 | if (!dl_task(p) && !rt_task(p)) { | 7788 | if (!dl_task(p) && !rt_task(p)) { |
7655 | /* | 7789 | /* |
@@ -7710,7 +7844,7 @@ struct task_struct *curr_task(int cpu) | |||
7710 | * | 7844 | * |
7711 | * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! | 7845 | * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! |
7712 | */ | 7846 | */ |
7713 | void set_curr_task(int cpu, struct task_struct *p) | 7847 | void ia64_set_curr_task(int cpu, struct task_struct *p) |
7714 | { | 7848 | { |
7715 | cpu_curr(cpu) = p; | 7849 | cpu_curr(cpu) = p; |
7716 | } | 7850 | } |
@@ -7841,10 +7975,10 @@ void sched_move_task(struct task_struct *tsk) | |||
7841 | 7975 | ||
7842 | sched_change_group(tsk, TASK_MOVE_GROUP); | 7976 | sched_change_group(tsk, TASK_MOVE_GROUP); |
7843 | 7977 | ||
7844 | if (unlikely(running)) | ||
7845 | tsk->sched_class->set_curr_task(rq); | ||
7846 | if (queued) | 7978 | if (queued) |
7847 | enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE); | 7979 | enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE); |
7980 | if (unlikely(running)) | ||
7981 | set_curr_task(rq, tsk); | ||
7848 | 7982 | ||
7849 | task_rq_unlock(rq, tsk, &rf); | 7983 | task_rq_unlock(rq, tsk, &rf); |
7850 | } | 7984 | } |
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index d4184498c9f5..e73119013c53 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c | |||
@@ -31,56 +31,81 @@ static inline int right_child(int i) | |||
31 | return (i << 1) + 2; | 31 | return (i << 1) + 2; |
32 | } | 32 | } |
33 | 33 | ||
34 | static void cpudl_exchange(struct cpudl *cp, int a, int b) | 34 | static void cpudl_heapify_down(struct cpudl *cp, int idx) |
35 | { | 35 | { |
36 | int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu; | 36 | int l, r, largest; |
37 | 37 | ||
38 | swap(cp->elements[a].cpu, cp->elements[b].cpu); | 38 | int orig_cpu = cp->elements[idx].cpu; |
39 | swap(cp->elements[a].dl , cp->elements[b].dl ); | 39 | u64 orig_dl = cp->elements[idx].dl; |
40 | 40 | ||
41 | swap(cp->elements[cpu_a].idx, cp->elements[cpu_b].idx); | 41 | if (left_child(idx) >= cp->size) |
42 | } | 42 | return; |
43 | |||
44 | static void cpudl_heapify(struct cpudl *cp, int idx) | ||
45 | { | ||
46 | int l, r, largest; | ||
47 | 43 | ||
48 | /* adapted from lib/prio_heap.c */ | 44 | /* adapted from lib/prio_heap.c */ |
49 | while(1) { | 45 | while(1) { |
46 | u64 largest_dl; | ||
50 | l = left_child(idx); | 47 | l = left_child(idx); |
51 | r = right_child(idx); | 48 | r = right_child(idx); |
52 | largest = idx; | 49 | largest = idx; |
50 | largest_dl = orig_dl; | ||
53 | 51 | ||
54 | if ((l < cp->size) && dl_time_before(cp->elements[idx].dl, | 52 | if ((l < cp->size) && dl_time_before(orig_dl, |
55 | cp->elements[l].dl)) | 53 | cp->elements[l].dl)) { |
56 | largest = l; | 54 | largest = l; |
57 | if ((r < cp->size) && dl_time_before(cp->elements[largest].dl, | 55 | largest_dl = cp->elements[l].dl; |
58 | cp->elements[r].dl)) | 56 | } |
57 | if ((r < cp->size) && dl_time_before(largest_dl, | ||
58 | cp->elements[r].dl)) | ||
59 | largest = r; | 59 | largest = r; |
60 | |||
60 | if (largest == idx) | 61 | if (largest == idx) |
61 | break; | 62 | break; |
62 | 63 | ||
63 | /* Push idx down the heap one level and bump one up */ | 64 | /* pull largest child onto idx */ |
64 | cpudl_exchange(cp, largest, idx); | 65 | cp->elements[idx].cpu = cp->elements[largest].cpu; |
66 | cp->elements[idx].dl = cp->elements[largest].dl; | ||
67 | cp->elements[cp->elements[idx].cpu].idx = idx; | ||
65 | idx = largest; | 68 | idx = largest; |
66 | } | 69 | } |
70 | /* actual push down of saved original values orig_* */ | ||
71 | cp->elements[idx].cpu = orig_cpu; | ||
72 | cp->elements[idx].dl = orig_dl; | ||
73 | cp->elements[cp->elements[idx].cpu].idx = idx; | ||
67 | } | 74 | } |
68 | 75 | ||
69 | static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl) | 76 | static void cpudl_heapify_up(struct cpudl *cp, int idx) |
70 | { | 77 | { |
71 | WARN_ON(idx == IDX_INVALID || !cpu_present(idx)); | 78 | int p; |
72 | 79 | ||
73 | if (dl_time_before(new_dl, cp->elements[idx].dl)) { | 80 | int orig_cpu = cp->elements[idx].cpu; |
74 | cp->elements[idx].dl = new_dl; | 81 | u64 orig_dl = cp->elements[idx].dl; |
75 | cpudl_heapify(cp, idx); | 82 | |
76 | } else { | 83 | if (idx == 0) |
77 | cp->elements[idx].dl = new_dl; | 84 | return; |
78 | while (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl, | 85 | |
79 | cp->elements[idx].dl)) { | 86 | do { |
80 | cpudl_exchange(cp, idx, parent(idx)); | 87 | p = parent(idx); |
81 | idx = parent(idx); | 88 | if (dl_time_before(orig_dl, cp->elements[p].dl)) |
82 | } | 89 | break; |
83 | } | 90 | /* pull parent onto idx */ |
91 | cp->elements[idx].cpu = cp->elements[p].cpu; | ||
92 | cp->elements[idx].dl = cp->elements[p].dl; | ||
93 | cp->elements[cp->elements[idx].cpu].idx = idx; | ||
94 | idx = p; | ||
95 | } while (idx != 0); | ||
96 | /* actual push up of saved original values orig_* */ | ||
97 | cp->elements[idx].cpu = orig_cpu; | ||
98 | cp->elements[idx].dl = orig_dl; | ||
99 | cp->elements[cp->elements[idx].cpu].idx = idx; | ||
100 | } | ||
101 | |||
102 | static void cpudl_heapify(struct cpudl *cp, int idx) | ||
103 | { | ||
104 | if (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl, | ||
105 | cp->elements[idx].dl)) | ||
106 | cpudl_heapify_up(cp, idx); | ||
107 | else | ||
108 | cpudl_heapify_down(cp, idx); | ||
84 | } | 109 | } |
85 | 110 | ||
86 | static inline int cpudl_maximum(struct cpudl *cp) | 111 | static inline int cpudl_maximum(struct cpudl *cp) |
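The heapify rework above replaces the swap-per-level cpudl_exchange() with sift routines that hold the displaced element aside, pull the winning child (or parent) up, and write the held element back exactly once, roughly halving the writes per level. A self-contained integer max-heap version of the same sift-down; it mirrors the structure of the new cpudl_heapify_down() without the per-CPU index bookkeeping:

	#include <stdio.h>

	static void heapify_down(int *heap, int size, int idx)
	{
		int orig = heap[idx];	/* element being pushed down */

		while (1) {
			int l = 2 * idx + 1, r = 2 * idx + 2, largest = idx;
			int largest_val = orig;

			if (l < size && heap[l] > largest_val) {
				largest = l;
				largest_val = heap[l];
			}
			if (r < size && heap[r] > largest_val)
				largest = r;
			if (largest == idx)
				break;

			heap[idx] = heap[largest];	/* pull child up, no swap */
			idx = largest;
		}
		heap[idx] = orig;			/* single final write */
	}

	int main(void)
	{
		int heap[] = { 1, 9, 8, 4, 5, 7, 6 };	/* root violates the heap property */
		int n = sizeof(heap) / sizeof(heap[0]);

		heapify_down(heap, n, 0);
		for (int i = 0; i < n; i++)
			printf("%d ", heap[i]);		/* prints a valid max-heap: 9 5 8 4 1 7 6 */
		printf("\n");
		return 0;
	}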
@@ -120,16 +145,15 @@ out: | |||
120 | } | 145 | } |
121 | 146 | ||
122 | /* | 147 | /* |
123 | * cpudl_set - update the cpudl max-heap | 148 | * cpudl_clear - remove a cpu from the cpudl max-heap |
124 | * @cp: the cpudl max-heap context | 149 | * @cp: the cpudl max-heap context |
125 | * @cpu: the target cpu | 150 | * @cpu: the target cpu |
126 | * @dl: the new earliest deadline for this cpu | ||
127 | * | 151 | * |
128 | * Notes: assumes cpu_rq(cpu)->lock is locked | 152 | * Notes: assumes cpu_rq(cpu)->lock is locked |
129 | * | 153 | * |
130 | * Returns: (void) | 154 | * Returns: (void) |
131 | */ | 155 | */ |
132 | void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) | 156 | void cpudl_clear(struct cpudl *cp, int cpu) |
133 | { | 157 | { |
134 | int old_idx, new_cpu; | 158 | int old_idx, new_cpu; |
135 | unsigned long flags; | 159 | unsigned long flags; |
@@ -137,47 +161,60 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) | |||
137 | WARN_ON(!cpu_present(cpu)); | 161 | WARN_ON(!cpu_present(cpu)); |
138 | 162 | ||
139 | raw_spin_lock_irqsave(&cp->lock, flags); | 163 | raw_spin_lock_irqsave(&cp->lock, flags); |
164 | |||
140 | old_idx = cp->elements[cpu].idx; | 165 | old_idx = cp->elements[cpu].idx; |
141 | if (!is_valid) { | 166 | if (old_idx == IDX_INVALID) { |
142 | /* remove item */ | 167 | /* |
143 | if (old_idx == IDX_INVALID) { | 168 | * Nothing to remove if old_idx was invalid. |
144 | /* | 169 | * This could happen if a rq_offline_dl is |
145 | * Nothing to remove if old_idx was invalid. | 170 | * called for a CPU without -dl tasks running. |
146 | * This could happen if a rq_offline_dl is | 171 | */ |
147 | * called for a CPU without -dl tasks running. | 172 | } else { |
148 | */ | ||
149 | goto out; | ||
150 | } | ||
151 | new_cpu = cp->elements[cp->size - 1].cpu; | 173 | new_cpu = cp->elements[cp->size - 1].cpu; |
152 | cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl; | 174 | cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl; |
153 | cp->elements[old_idx].cpu = new_cpu; | 175 | cp->elements[old_idx].cpu = new_cpu; |
154 | cp->size--; | 176 | cp->size--; |
155 | cp->elements[new_cpu].idx = old_idx; | 177 | cp->elements[new_cpu].idx = old_idx; |
156 | cp->elements[cpu].idx = IDX_INVALID; | 178 | cp->elements[cpu].idx = IDX_INVALID; |
157 | while (old_idx > 0 && dl_time_before( | 179 | cpudl_heapify(cp, old_idx); |
158 | cp->elements[parent(old_idx)].dl, | ||
159 | cp->elements[old_idx].dl)) { | ||
160 | cpudl_exchange(cp, old_idx, parent(old_idx)); | ||
161 | old_idx = parent(old_idx); | ||
162 | } | ||
163 | cpumask_set_cpu(cpu, cp->free_cpus); | ||
164 | cpudl_heapify(cp, old_idx); | ||
165 | 180 | ||
166 | goto out; | 181 | cpumask_set_cpu(cpu, cp->free_cpus); |
167 | } | 182 | } |
183 | raw_spin_unlock_irqrestore(&cp->lock, flags); | ||
184 | } | ||
185 | |||
186 | /* | ||
187 | * cpudl_set - update the cpudl max-heap | ||
188 | * @cp: the cpudl max-heap context | ||
189 | * @cpu: the target cpu | ||
190 | * @dl: the new earliest deadline for this cpu | ||
191 | * | ||
192 | * Notes: assumes cpu_rq(cpu)->lock is locked | ||
193 | * | ||
194 | * Returns: (void) | ||
195 | */ | ||
196 | void cpudl_set(struct cpudl *cp, int cpu, u64 dl) | ||
197 | { | ||
198 | int old_idx; | ||
199 | unsigned long flags; | ||
168 | 200 | ||
201 | WARN_ON(!cpu_present(cpu)); | ||
202 | |||
203 | raw_spin_lock_irqsave(&cp->lock, flags); | ||
204 | |||
205 | old_idx = cp->elements[cpu].idx; | ||
169 | if (old_idx == IDX_INVALID) { | 206 | if (old_idx == IDX_INVALID) { |
170 | cp->size++; | 207 | int new_idx = cp->size++; |
171 | cp->elements[cp->size - 1].dl = dl; | 208 | cp->elements[new_idx].dl = dl; |
172 | cp->elements[cp->size - 1].cpu = cpu; | 209 | cp->elements[new_idx].cpu = cpu; |
173 | cp->elements[cpu].idx = cp->size - 1; | 210 | cp->elements[cpu].idx = new_idx; |
174 | cpudl_change_key(cp, cp->size - 1, dl); | 211 | cpudl_heapify_up(cp, new_idx); |
175 | cpumask_clear_cpu(cpu, cp->free_cpus); | 212 | cpumask_clear_cpu(cpu, cp->free_cpus); |
176 | } else { | 213 | } else { |
177 | cpudl_change_key(cp, old_idx, dl); | 214 | cp->elements[old_idx].dl = dl; |
215 | cpudl_heapify(cp, old_idx); | ||
178 | } | 216 | } |
179 | 217 | ||
180 | out: | ||
181 | raw_spin_unlock_irqrestore(&cp->lock, flags); | 218 | raw_spin_unlock_irqrestore(&cp->lock, flags); |
182 | } | 219 | } |
183 | 220 | ||
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index fcbdf83fed7e..f7da8c55bba0 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h | |||
@@ -23,7 +23,8 @@ struct cpudl { | |||
23 | #ifdef CONFIG_SMP | 23 | #ifdef CONFIG_SMP |
24 | int cpudl_find(struct cpudl *cp, struct task_struct *p, | 24 | int cpudl_find(struct cpudl *cp, struct task_struct *p, |
25 | struct cpumask *later_mask); | 25 | struct cpumask *later_mask); |
26 | void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid); | 26 | void cpudl_set(struct cpudl *cp, int cpu, u64 dl); |
27 | void cpudl_clear(struct cpudl *cp, int cpu); | ||
27 | int cpudl_init(struct cpudl *cp); | 28 | int cpudl_init(struct cpudl *cp); |
28 | void cpudl_set_freecpu(struct cpudl *cp, int cpu); | 29 | void cpudl_set_freecpu(struct cpudl *cp, int cpu); |
29 | void cpudl_clear_freecpu(struct cpudl *cp, int cpu); | 30 | void cpudl_clear_freecpu(struct cpudl *cp, int cpu); |
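The header change above completes the split of the old cpudl_set(cp, cpu, dl, is_valid) into cpudl_set() and cpudl_clear(). A hedged sketch of how call sites in deadline.c change; the rq->rd->cpudl and rq->dl.earliest_dl.curr expressions are recalled from that file rather than shown in this hunk:

	/* before this series: one entry point, overloaded by a validity flag */
	cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1);
	cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0);	/* "invalid" meant removal */

	/* after: the intent is explicit at the call site */
	cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr);
	cpudl_clear(&rq->rd->cpudl, rq->cpu);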
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c index 1141954e73b4..dbc51442ecbc 100644 --- a/kernel/sched/cpufreq.c +++ b/kernel/sched/cpufreq.c | |||
@@ -33,7 +33,7 @@ DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); | |||
33 | */ | 33 | */ |
34 | void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data, | 34 | void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data, |
35 | void (*func)(struct update_util_data *data, u64 time, | 35 | void (*func)(struct update_util_data *data, u64 time, |
36 | unsigned long util, unsigned long max)) | 36 | unsigned int flags)) |
37 | { | 37 | { |
38 | if (WARN_ON(!data || !func)) | 38 | if (WARN_ON(!data || !func)) |
39 | return; | 39 | return; |
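With the signature change above, an update_util callback no longer receives util/max from the scheduler; it gets a timestamp plus SCHED_CPUFREQ_* flags and is expected to pull utilization itself (as sugov_get_util() does in the schedutil diff below). A hedged registration sketch; my_governor_update(), my_data and my_governor_start() are illustrative names:

	static struct update_util_data my_data;

	static void my_governor_update(struct update_util_data *data, u64 time,
				       unsigned int flags)
	{
		if (flags & SCHED_CPUFREQ_RT_DL)
			return;		/* RT/DL activity: the governor goes to max frequency */

		/* CFS activity: query utilization (cf. sugov_get_util()) and commit. */
	}

	static void my_governor_start(int cpu)
	{
		cpufreq_add_update_util_hook(cpu, &my_data, my_governor_update);
	}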
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index a84641b222c1..69e06898997d 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c | |||
@@ -12,7 +12,6 @@ | |||
12 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | 12 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt |
13 | 13 | ||
14 | #include <linux/cpufreq.h> | 14 | #include <linux/cpufreq.h> |
15 | #include <linux/module.h> | ||
16 | #include <linux/slab.h> | 15 | #include <linux/slab.h> |
17 | #include <trace/events/power.h> | 16 | #include <trace/events/power.h> |
18 | 17 | ||
@@ -48,11 +47,14 @@ struct sugov_cpu { | |||
48 | struct sugov_policy *sg_policy; | 47 | struct sugov_policy *sg_policy; |
49 | 48 | ||
50 | unsigned int cached_raw_freq; | 49 | unsigned int cached_raw_freq; |
50 | unsigned long iowait_boost; | ||
51 | unsigned long iowait_boost_max; | ||
52 | u64 last_update; | ||
51 | 53 | ||
52 | /* The fields below are only needed when sharing a policy. */ | 54 | /* The fields below are only needed when sharing a policy. */ |
53 | unsigned long util; | 55 | unsigned long util; |
54 | unsigned long max; | 56 | unsigned long max; |
55 | u64 last_update; | 57 | unsigned int flags; |
56 | }; | 58 | }; |
57 | 59 | ||
58 | static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu); | 60 | static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu); |
@@ -144,24 +146,75 @@ static unsigned int get_next_freq(struct sugov_cpu *sg_cpu, unsigned long util, | |||
144 | return cpufreq_driver_resolve_freq(policy, freq); | 146 | return cpufreq_driver_resolve_freq(policy, freq); |
145 | } | 147 | } |
146 | 148 | ||
149 | static void sugov_get_util(unsigned long *util, unsigned long *max) | ||
150 | { | ||
151 | struct rq *rq = this_rq(); | ||
152 | unsigned long cfs_max; | ||
153 | |||
154 | cfs_max = arch_scale_cpu_capacity(NULL, smp_processor_id()); | ||
155 | |||
156 | *util = min(rq->cfs.avg.util_avg, cfs_max); | ||
157 | *max = cfs_max; | ||
158 | } | ||
159 | |||
160 | static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, | ||
161 | unsigned int flags) | ||
162 | { | ||
163 | if (flags & SCHED_CPUFREQ_IOWAIT) { | ||
164 | sg_cpu->iowait_boost = sg_cpu->iowait_boost_max; | ||
165 | } else if (sg_cpu->iowait_boost) { | ||
166 | s64 delta_ns = time - sg_cpu->last_update; | ||
167 | |||
168 | /* Clear iowait_boost if the CPU appears to have been idle. */ | ||
169 | if (delta_ns > TICK_NSEC) | ||
170 | sg_cpu->iowait_boost = 0; | ||
171 | } | ||
172 | } | ||
173 | |||
174 | static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util, | ||
175 | unsigned long *max) | ||
176 | { | ||
177 | unsigned long boost_util = sg_cpu->iowait_boost; | ||
178 | unsigned long boost_max = sg_cpu->iowait_boost_max; | ||
179 | |||
180 | if (!boost_util) | ||
181 | return; | ||
182 | |||
183 | if (*util * boost_max < *max * boost_util) { | ||
184 | *util = boost_util; | ||
185 | *max = boost_max; | ||
186 | } | ||
187 | sg_cpu->iowait_boost >>= 1; | ||
188 | } | ||
189 | |||
147 | static void sugov_update_single(struct update_util_data *hook, u64 time, | 190 | static void sugov_update_single(struct update_util_data *hook, u64 time, |
148 | unsigned long util, unsigned long max) | 191 | unsigned int flags) |
149 | { | 192 | { |
150 | struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); | 193 | struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); |
151 | struct sugov_policy *sg_policy = sg_cpu->sg_policy; | 194 | struct sugov_policy *sg_policy = sg_cpu->sg_policy; |
152 | struct cpufreq_policy *policy = sg_policy->policy; | 195 | struct cpufreq_policy *policy = sg_policy->policy; |
196 | unsigned long util, max; | ||
153 | unsigned int next_f; | 197 | unsigned int next_f; |
154 | 198 | ||
199 | sugov_set_iowait_boost(sg_cpu, time, flags); | ||
200 | sg_cpu->last_update = time; | ||
201 | |||
155 | if (!sugov_should_update_freq(sg_policy, time)) | 202 | if (!sugov_should_update_freq(sg_policy, time)) |
156 | return; | 203 | return; |
157 | 204 | ||
158 | next_f = util == ULONG_MAX ? policy->cpuinfo.max_freq : | 205 | if (flags & SCHED_CPUFREQ_RT_DL) { |
159 | get_next_freq(sg_cpu, util, max); | 206 | next_f = policy->cpuinfo.max_freq; |
207 | } else { | ||
208 | sugov_get_util(&util, &max); | ||
209 | sugov_iowait_boost(sg_cpu, &util, &max); | ||
210 | next_f = get_next_freq(sg_cpu, util, max); | ||
211 | } | ||
160 | sugov_update_commit(sg_policy, time, next_f); | 212 | sugov_update_commit(sg_policy, time, next_f); |
161 | } | 213 | } |
162 | 214 | ||
163 | static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, | 215 | static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, |
164 | unsigned long util, unsigned long max) | 216 | unsigned long util, unsigned long max, |
217 | unsigned int flags) | ||
165 | { | 218 | { |
166 | struct sugov_policy *sg_policy = sg_cpu->sg_policy; | 219 | struct sugov_policy *sg_policy = sg_cpu->sg_policy; |
167 | struct cpufreq_policy *policy = sg_policy->policy; | 220 | struct cpufreq_policy *policy = sg_policy->policy; |
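The two helpers above implement the iowait boost: a wakeup flagged SCHED_CPUFREQ_IOWAIT jumps the boost to the policy maximum, each later update halves it, and the cross-multiplied compare picks whichever of util/max and boost/boost_max is larger. A toy model of that decay; the numbers are illustrative, not real capacities or frequencies:

	#include <stdio.h>

	int main(void)
	{
		unsigned long util = 200, max = 1024;		/* CFS utilization estimate */
		unsigned long boost = 1024, boost_max = 1024;	/* just woken from iowait */

		for (int tick = 0; boost; tick++) {
			unsigned long eff_util = util, eff_max = max;

			/* Same comparison as sugov_iowait_boost(). */
			if (eff_util * boost_max < eff_max * boost) {
				eff_util = boost;
				eff_max = boost_max;
			}
			printf("tick %d: effective util %lu/%lu\n",
			       tick, eff_util, eff_max);
			boost >>= 1;				/* decay by half per update */
		}
		return 0;
	}

After a few ticks the halved boost drops below the CFS estimate and the governor falls back to the plain utilization-based frequency.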
@@ -169,9 +222,11 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, | |||
169 | u64 last_freq_update_time = sg_policy->last_freq_update_time; | 222 | u64 last_freq_update_time = sg_policy->last_freq_update_time; |
170 | unsigned int j; | 223 | unsigned int j; |
171 | 224 | ||
172 | if (util == ULONG_MAX) | 225 | if (flags & SCHED_CPUFREQ_RT_DL) |
173 | return max_f; | 226 | return max_f; |
174 | 227 | ||
228 | sugov_iowait_boost(sg_cpu, &util, &max); | ||
229 | |||
175 | for_each_cpu(j, policy->cpus) { | 230 | for_each_cpu(j, policy->cpus) { |
176 | struct sugov_cpu *j_sg_cpu; | 231 | struct sugov_cpu *j_sg_cpu; |
177 | unsigned long j_util, j_max; | 232 | unsigned long j_util, j_max; |
@@ -186,41 +241,50 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, | |||
186 | * frequency update and the time elapsed between the last update | 241 | * frequency update and the time elapsed between the last update |
187 | * of the CPU utilization and the last frequency update is long | 242 | * of the CPU utilization and the last frequency update is long |
188 | * enough, don't take the CPU into account as it probably is | 243 | * enough, don't take the CPU into account as it probably is |
189 | * idle now. | 244 | * idle now (and clear iowait_boost for it). |
190 | */ | 245 | */ |
191 | delta_ns = last_freq_update_time - j_sg_cpu->last_update; | 246 | delta_ns = last_freq_update_time - j_sg_cpu->last_update; |
192 | if (delta_ns > TICK_NSEC) | 247 | if (delta_ns > TICK_NSEC) { |
248 | j_sg_cpu->iowait_boost = 0; | ||
193 | continue; | 249 | continue; |
194 | 250 | } | |
195 | j_util = j_sg_cpu->util; | 251 | if (j_sg_cpu->flags & SCHED_CPUFREQ_RT_DL) |
196 | if (j_util == ULONG_MAX) | ||
197 | return max_f; | 252 | return max_f; |
198 | 253 | ||
254 | j_util = j_sg_cpu->util; | ||
199 | j_max = j_sg_cpu->max; | 255 | j_max = j_sg_cpu->max; |
200 | if (j_util * max > j_max * util) { | 256 | if (j_util * max > j_max * util) { |
201 | util = j_util; | 257 | util = j_util; |
202 | max = j_max; | 258 | max = j_max; |
203 | } | 259 | } |
260 | |||
261 | sugov_iowait_boost(j_sg_cpu, &util, &max); | ||
204 | } | 262 | } |
205 | 263 | ||
206 | return get_next_freq(sg_cpu, util, max); | 264 | return get_next_freq(sg_cpu, util, max); |
207 | } | 265 | } |
208 | 266 | ||
209 | static void sugov_update_shared(struct update_util_data *hook, u64 time, | 267 | static void sugov_update_shared(struct update_util_data *hook, u64 time, |
210 | unsigned long util, unsigned long max) | 268 | unsigned int flags) |
211 | { | 269 | { |
212 | struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); | 270 | struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); |
213 | struct sugov_policy *sg_policy = sg_cpu->sg_policy; | 271 | struct sugov_policy *sg_policy = sg_cpu->sg_policy; |
272 | unsigned long util, max; | ||
214 | unsigned int next_f; | 273 | unsigned int next_f; |
215 | 274 | ||
275 | sugov_get_util(&util, &max); | ||
276 | |||
216 | raw_spin_lock(&sg_policy->update_lock); | 277 | raw_spin_lock(&sg_policy->update_lock); |
217 | 278 | ||
218 | sg_cpu->util = util; | 279 | sg_cpu->util = util; |
219 | sg_cpu->max = max; | 280 | sg_cpu->max = max; |
281 | sg_cpu->flags = flags; | ||
282 | |||
283 | sugov_set_iowait_boost(sg_cpu, time, flags); | ||
220 | sg_cpu->last_update = time; | 284 | sg_cpu->last_update = time; |
221 | 285 | ||
222 | if (sugov_should_update_freq(sg_policy, time)) { | 286 | if (sugov_should_update_freq(sg_policy, time)) { |
223 | next_f = sugov_next_freq_shared(sg_cpu, util, max); | 287 | next_f = sugov_next_freq_shared(sg_cpu, util, max, flags); |
224 | sugov_update_commit(sg_policy, time, next_f); | 288 | sugov_update_commit(sg_policy, time, next_f); |
225 | } | 289 | } |
226 | 290 | ||
@@ -444,10 +508,13 @@ static int sugov_start(struct cpufreq_policy *policy) | |||
444 | 508 | ||
445 | sg_cpu->sg_policy = sg_policy; | 509 | sg_cpu->sg_policy = sg_policy; |
446 | if (policy_is_shared(policy)) { | 510 | if (policy_is_shared(policy)) { |
447 | sg_cpu->util = ULONG_MAX; | 511 | sg_cpu->util = 0; |
448 | sg_cpu->max = 0; | 512 | sg_cpu->max = 0; |
513 | sg_cpu->flags = SCHED_CPUFREQ_RT; | ||
449 | sg_cpu->last_update = 0; | 514 | sg_cpu->last_update = 0; |
450 | sg_cpu->cached_raw_freq = 0; | 515 | sg_cpu->cached_raw_freq = 0; |
516 | sg_cpu->iowait_boost = 0; | ||
517 | sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; | ||
451 | cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, | 518 | cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, |
452 | sugov_update_shared); | 519 | sugov_update_shared); |
453 | } else { | 520 | } else { |
@@ -495,28 +562,15 @@ static struct cpufreq_governor schedutil_gov = { | |||
495 | .limits = sugov_limits, | 562 | .limits = sugov_limits, |
496 | }; | 563 | }; |
497 | 564 | ||
498 | static int __init sugov_module_init(void) | ||
499 | { | ||
500 | return cpufreq_register_governor(&schedutil_gov); | ||
501 | } | ||
502 | |||
503 | static void __exit sugov_module_exit(void) | ||
504 | { | ||
505 | cpufreq_unregister_governor(&schedutil_gov); | ||
506 | } | ||
507 | |||
508 | MODULE_AUTHOR("Rafael J. Wysocki <rafael.j.wysocki@intel.com>"); | ||
509 | MODULE_DESCRIPTION("Utilization-based CPU frequency selection"); | ||
510 | MODULE_LICENSE("GPL"); | ||
511 | |||
512 | #ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL | 565 | #ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL |
513 | struct cpufreq_governor *cpufreq_default_governor(void) | 566 | struct cpufreq_governor *cpufreq_default_governor(void) |
514 | { | 567 | { |
515 | return &schedutil_gov; | 568 | return &schedutil_gov; |
516 | } | 569 | } |
517 | |||
518 | fs_initcall(sugov_module_init); | ||
519 | #else | ||
520 | module_init(sugov_module_init); | ||
521 | #endif | 570 | #endif |
522 | module_exit(sugov_module_exit); | 571 | |
572 | static int __init sugov_register(void) | ||
573 | { | ||
574 | return cpufreq_register_governor(&schedutil_gov); | ||
575 | } | ||
576 | fs_initcall(sugov_register); | ||
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index a846cf89eb96..5ebee3164e64 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c | |||
@@ -23,10 +23,8 @@ | |||
23 | * task when irq is in progress while we read rq->clock. That is a worthy | 23 | * task when irq is in progress while we read rq->clock. That is a worthy |
24 | * compromise in place of having locks on each irq in account_system_time. | 24 | * compromise in place of having locks on each irq in account_system_time. |
25 | */ | 25 | */ |
26 | DEFINE_PER_CPU(u64, cpu_hardirq_time); | 26 | DEFINE_PER_CPU(struct irqtime, cpu_irqtime); |
27 | DEFINE_PER_CPU(u64, cpu_softirq_time); | ||
28 | 27 | ||
29 | static DEFINE_PER_CPU(u64, irq_start_time); | ||
30 | static int sched_clock_irqtime; | 28 | static int sched_clock_irqtime; |
31 | 29 | ||
32 | void enable_sched_clock_irqtime(void) | 30 | void enable_sched_clock_irqtime(void) |
@@ -39,16 +37,13 @@ void disable_sched_clock_irqtime(void) | |||
39 | sched_clock_irqtime = 0; | 37 | sched_clock_irqtime = 0; |
40 | } | 38 | } |
41 | 39 | ||
42 | #ifndef CONFIG_64BIT | ||
43 | DEFINE_PER_CPU(seqcount_t, irq_time_seq); | ||
44 | #endif /* CONFIG_64BIT */ | ||
45 | |||
46 | /* | 40 | /* |
47 | * Called before incrementing preempt_count on {soft,}irq_enter | 41 | * Called before incrementing preempt_count on {soft,}irq_enter |
48 | * and before decrementing preempt_count on {soft,}irq_exit. | 42 | * and before decrementing preempt_count on {soft,}irq_exit. |
49 | */ | 43 | */ |
50 | void irqtime_account_irq(struct task_struct *curr) | 44 | void irqtime_account_irq(struct task_struct *curr) |
51 | { | 45 | { |
46 | struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime); | ||
52 | s64 delta; | 47 | s64 delta; |
53 | int cpu; | 48 | int cpu; |
54 | 49 | ||
@@ -56,10 +51,10 @@ void irqtime_account_irq(struct task_struct *curr) | |||
56 | return; | 51 | return; |
57 | 52 | ||
58 | cpu = smp_processor_id(); | 53 | cpu = smp_processor_id(); |
59 | delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); | 54 | delta = sched_clock_cpu(cpu) - irqtime->irq_start_time; |
60 | __this_cpu_add(irq_start_time, delta); | 55 | irqtime->irq_start_time += delta; |
61 | 56 | ||
62 | irq_time_write_begin(); | 57 | u64_stats_update_begin(&irqtime->sync); |
63 | /* | 58 | /* |
64 | * We do not account for softirq time from ksoftirqd here. | 59 | * We do not account for softirq time from ksoftirqd here. |
65 | * We want to continue accounting softirq time to ksoftirqd thread | 60 | * We want to continue accounting softirq time to ksoftirqd thread |
@@ -67,42 +62,36 @@ void irqtime_account_irq(struct task_struct *curr) | |||
67 | * that do not consume any time, but still wants to run. | 62 | * that do not consume any time, but still wants to run. |
68 | */ | 63 | */ |
69 | if (hardirq_count()) | 64 | if (hardirq_count()) |
70 | __this_cpu_add(cpu_hardirq_time, delta); | 65 | irqtime->hardirq_time += delta; |
71 | else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) | 66 | else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) |
72 | __this_cpu_add(cpu_softirq_time, delta); | 67 | irqtime->softirq_time += delta; |
73 | 68 | ||
74 | irq_time_write_end(); | 69 | u64_stats_update_end(&irqtime->sync); |
75 | } | 70 | } |
76 | EXPORT_SYMBOL_GPL(irqtime_account_irq); | 71 | EXPORT_SYMBOL_GPL(irqtime_account_irq); |
77 | 72 | ||
78 | static cputime_t irqtime_account_hi_update(cputime_t maxtime) | 73 | static cputime_t irqtime_account_update(u64 irqtime, int idx, cputime_t maxtime) |
79 | { | 74 | { |
80 | u64 *cpustat = kcpustat_this_cpu->cpustat; | 75 | u64 *cpustat = kcpustat_this_cpu->cpustat; |
81 | unsigned long flags; | ||
82 | cputime_t irq_cputime; | 76 | cputime_t irq_cputime; |
83 | 77 | ||
84 | local_irq_save(flags); | 78 | irq_cputime = nsecs_to_cputime64(irqtime) - cpustat[idx]; |
85 | irq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_hardirq_time)) - | ||
86 | cpustat[CPUTIME_IRQ]; | ||
87 | irq_cputime = min(irq_cputime, maxtime); | 79 | irq_cputime = min(irq_cputime, maxtime); |
88 | cpustat[CPUTIME_IRQ] += irq_cputime; | 80 | cpustat[idx] += irq_cputime; |
89 | local_irq_restore(flags); | 81 | |
90 | return irq_cputime; | 82 | return irq_cputime; |
91 | } | 83 | } |
92 | 84 | ||
93 | static cputime_t irqtime_account_si_update(cputime_t maxtime) | 85 | static cputime_t irqtime_account_hi_update(cputime_t maxtime) |
94 | { | 86 | { |
95 | u64 *cpustat = kcpustat_this_cpu->cpustat; | 87 | return irqtime_account_update(__this_cpu_read(cpu_irqtime.hardirq_time), |
96 | unsigned long flags; | 88 | CPUTIME_IRQ, maxtime); |
97 | cputime_t softirq_cputime; | 89 | } |
98 | 90 | ||
99 | local_irq_save(flags); | 91 | static cputime_t irqtime_account_si_update(cputime_t maxtime) |
100 | softirq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_softirq_time)) - | 92 | { |
101 | cpustat[CPUTIME_SOFTIRQ]; | 93 | return irqtime_account_update(__this_cpu_read(cpu_irqtime.softirq_time), |
102 | softirq_cputime = min(softirq_cputime, maxtime); | 94 | CPUTIME_SOFTIRQ, maxtime); |
103 | cpustat[CPUTIME_SOFTIRQ] += softirq_cputime; | ||
104 | local_irq_restore(flags); | ||
105 | return softirq_cputime; | ||
106 | } | 95 | } |
107 | 96 | ||
108 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ | 97 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ |
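The two per-CPU u64 counters and the open-coded seqcount are folded into a single per-CPU struct irqtime protected by u64_stats_sync. The structure is declared in kernel/sched/sched.h, which is not part of the hunks shown; the sketch below gives the assumed layout plus an illustrative reader (on 64-bit the u64_stats primitives compile away to plain loads).

/* Assumed layout (kernel/sched/sched.h, not in this diff): */
struct irqtime {
	u64			hardirq_time;
	u64			softirq_time;
	u64			irq_start_time;
	struct u64_stats_sync	sync;
};

DECLARE_PER_CPU(struct irqtime, cpu_irqtime);

/* Illustrative reader using the u64_stats retry loop: */
static inline u64 irq_time_read(int cpu)
{
	struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
	unsigned int seq;
	u64 total;

	do {
		seq = u64_stats_fetch_begin(&irqtime->sync);
		total = irqtime->softirq_time + irqtime->hardirq_time;
	} while (u64_stats_fetch_retry(&irqtime->sync, seq));

	return total;
}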
@@ -295,6 +284,9 @@ static inline cputime_t account_other_time(cputime_t max) | |||
295 | { | 284 | { |
296 | cputime_t accounted; | 285 | cputime_t accounted; |
297 | 286 | ||
287 | /* Shall be converted to a lockdep-enabled lightweight check */ | ||
288 | WARN_ON_ONCE(!irqs_disabled()); | ||
289 | |||
298 | accounted = steal_account_process_time(max); | 290 | accounted = steal_account_process_time(max); |
299 | 291 | ||
300 | if (accounted < max) | 292 | if (accounted < max) |
@@ -306,6 +298,26 @@ static inline cputime_t account_other_time(cputime_t max) | |||
306 | return accounted; | 298 | return accounted; |
307 | } | 299 | } |
308 | 300 | ||
301 | #ifdef CONFIG_64BIT | ||
302 | static inline u64 read_sum_exec_runtime(struct task_struct *t) | ||
303 | { | ||
304 | return t->se.sum_exec_runtime; | ||
305 | } | ||
306 | #else | ||
307 | static u64 read_sum_exec_runtime(struct task_struct *t) | ||
308 | { | ||
309 | u64 ns; | ||
310 | struct rq_flags rf; | ||
311 | struct rq *rq; | ||
312 | |||
313 | rq = task_rq_lock(t, &rf); | ||
314 | ns = t->se.sum_exec_runtime; | ||
315 | task_rq_unlock(rq, t, &rf); | ||
316 | |||
317 | return ns; | ||
318 | } | ||
319 | #endif | ||
320 | |||
309 | /* | 321 | /* |
310 | * Accumulate raw cputime values of dead tasks (sig->[us]time) and live | 322 | * Accumulate raw cputime values of dead tasks (sig->[us]time) and live |
311 | * tasks (sum on group iteration) belonging to @tsk's group. | 323 | * tasks (sum on group iteration) belonging to @tsk's group. |
@@ -318,6 +330,17 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) | |||
318 | unsigned int seq, nextseq; | 330 | unsigned int seq, nextseq; |
319 | unsigned long flags; | 331 | unsigned long flags; |
320 | 332 | ||
333 | /* | ||
334 | * Update current task runtime to account pending time since last | ||
335 | * scheduler action or thread_group_cputime() call. This thread group | ||
336 | * might have other running tasks on different CPUs, but updating | ||
337 | * their runtime can affect syscall performance, so we skip accounting | ||
338 | * those pending times and rely only on values updated on tick or | ||
339 | * other scheduler action. | ||
340 | */ | ||
341 | if (same_thread_group(current, tsk)) | ||
342 | (void) task_sched_runtime(current); | ||
343 | |||
321 | rcu_read_lock(); | 344 | rcu_read_lock(); |
322 | /* Attempt a lockless read on the first round. */ | 345 | /* Attempt a lockless read on the first round. */ |
323 | nextseq = 0; | 346 | nextseq = 0; |
@@ -332,7 +355,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) | |||
332 | task_cputime(t, &utime, &stime); | 355 | task_cputime(t, &utime, &stime); |
333 | times->utime += utime; | 356 | times->utime += utime; |
334 | times->stime += stime; | 357 | times->stime += stime; |
335 | times->sum_exec_runtime += task_sched_runtime(t); | 358 | times->sum_exec_runtime += read_sum_exec_runtime(t); |
336 | } | 359 | } |
337 | /* If lockless access failed, take the lock. */ | 360 | /* If lockless access failed, take the lock. */ |
338 | nextseq = 1; | 361 | nextseq = 1; |
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 1ce8867283dc..37e2449186c4 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c | |||
@@ -243,10 +243,8 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq); | |||
243 | static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p) | 243 | static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p) |
244 | { | 244 | { |
245 | struct rq *later_rq = NULL; | 245 | struct rq *later_rq = NULL; |
246 | bool fallback = false; | ||
247 | 246 | ||
248 | later_rq = find_lock_later_rq(p, rq); | 247 | later_rq = find_lock_later_rq(p, rq); |
249 | |||
250 | if (!later_rq) { | 248 | if (!later_rq) { |
251 | int cpu; | 249 | int cpu; |
252 | 250 | ||
@@ -254,7 +252,6 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p | |||
254 | * If we cannot preempt any rq, fall back to pick any | 252 | * If we cannot preempt any rq, fall back to pick any |
255 | * online cpu. | 253 | * online cpu. |
256 | */ | 254 | */ |
257 | fallback = true; | ||
258 | cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p)); | 255 | cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p)); |
259 | if (cpu >= nr_cpu_ids) { | 256 | if (cpu >= nr_cpu_ids) { |
260 | /* | 257 | /* |
@@ -274,16 +271,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p | |||
274 | double_lock_balance(rq, later_rq); | 271 | double_lock_balance(rq, later_rq); |
275 | } | 272 | } |
276 | 273 | ||
277 | /* | ||
278 | * By now the task is replenished and enqueued; migrate it. | ||
279 | */ | ||
280 | deactivate_task(rq, p, 0); | ||
281 | set_task_cpu(p, later_rq->cpu); | 274 | set_task_cpu(p, later_rq->cpu); |
282 | activate_task(later_rq, p, 0); | ||
283 | |||
284 | if (!fallback) | ||
285 | resched_curr(later_rq); | ||
286 | |||
287 | double_unlock_balance(later_rq, rq); | 275 | double_unlock_balance(later_rq, rq); |
288 | 276 | ||
289 | return later_rq; | 277 | return later_rq; |
@@ -346,12 +334,12 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, | |||
346 | * one, and to (try to!) reconcile itself with its own scheduling | 334 | * one, and to (try to!) reconcile itself with its own scheduling |
347 | * parameters. | 335 | * parameters. |
348 | */ | 336 | */ |
349 | static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se, | 337 | static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se) |
350 | struct sched_dl_entity *pi_se) | ||
351 | { | 338 | { |
352 | struct dl_rq *dl_rq = dl_rq_of_se(dl_se); | 339 | struct dl_rq *dl_rq = dl_rq_of_se(dl_se); |
353 | struct rq *rq = rq_of_dl_rq(dl_rq); | 340 | struct rq *rq = rq_of_dl_rq(dl_rq); |
354 | 341 | ||
342 | WARN_ON(dl_se->dl_boosted); | ||
355 | WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline)); | 343 | WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline)); |
356 | 344 | ||
357 | /* | 345 | /* |
@@ -367,8 +355,8 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se, | |||
367 | * future; in fact, we must consider execution overheads (time | 355 | * future; in fact, we must consider execution overheads (time |
368 | * spent on hardirq context, etc.). | 356 | * spent on hardirq context, etc.). |
369 | */ | 357 | */ |
370 | dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; | 358 | dl_se->deadline = rq_clock(rq) + dl_se->dl_deadline; |
371 | dl_se->runtime = pi_se->dl_runtime; | 359 | dl_se->runtime = dl_se->dl_runtime; |
372 | } | 360 | } |
373 | 361 | ||
374 | /* | 362 | /* |
@@ -641,29 +629,31 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) | |||
641 | goto unlock; | 629 | goto unlock; |
642 | } | 630 | } |
643 | 631 | ||
644 | enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); | ||
645 | if (dl_task(rq->curr)) | ||
646 | check_preempt_curr_dl(rq, p, 0); | ||
647 | else | ||
648 | resched_curr(rq); | ||
649 | |||
650 | #ifdef CONFIG_SMP | 632 | #ifdef CONFIG_SMP |
651 | /* | ||
652 | * Perform balancing operations here; after the replenishments. We | ||
653 | * cannot drop rq->lock before this, otherwise the assertion in | ||
654 | * start_dl_timer() about not missing updates is not true. | ||
655 | * | ||
656 | * If we find that the rq the task was on is no longer available, we | ||
657 | * need to select a new rq. | ||
658 | * | ||
659 | * XXX figure out if select_task_rq_dl() deals with offline cpus. | ||
660 | */ | ||
661 | if (unlikely(!rq->online)) { | 633 | if (unlikely(!rq->online)) { |
634 | /* | ||
635 | * If the runqueue is no longer available, migrate the | ||
636 | * task elsewhere. This necessarily changes rq. | ||
637 | */ | ||
662 | lockdep_unpin_lock(&rq->lock, rf.cookie); | 638 | lockdep_unpin_lock(&rq->lock, rf.cookie); |
663 | rq = dl_task_offline_migration(rq, p); | 639 | rq = dl_task_offline_migration(rq, p); |
664 | rf.cookie = lockdep_pin_lock(&rq->lock); | 640 | rf.cookie = lockdep_pin_lock(&rq->lock); |
641 | |||
642 | /* | ||
643 | * Now that the task has been migrated to the new RQ and we | ||
644 | * have that locked, proceed as normal and enqueue the task | ||
645 | * there. | ||
646 | */ | ||
665 | } | 647 | } |
648 | #endif | ||
649 | |||
650 | enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); | ||
651 | if (dl_task(rq->curr)) | ||
652 | check_preempt_curr_dl(rq, p, 0); | ||
653 | else | ||
654 | resched_curr(rq); | ||
666 | 655 | ||
656 | #ifdef CONFIG_SMP | ||
667 | /* | 657 | /* |
668 | * Queueing this task back might have overloaded rq, check if we need | 658 | * Queueing this task back might have overloaded rq, check if we need |
669 | * to kick someone away. | 659 | * to kick someone away. |
@@ -735,9 +725,8 @@ static void update_curr_dl(struct rq *rq) | |||
735 | return; | 725 | return; |
736 | } | 726 | } |
737 | 727 | ||
738 | /* kick cpufreq (see the comment in linux/cpufreq.h). */ | 728 | /* kick cpufreq (see the comment in kernel/sched/sched.h). */ |
739 | if (cpu_of(rq) == smp_processor_id()) | 729 | cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_DL); |
740 | cpufreq_trigger_update(rq_clock(rq)); | ||
741 | 730 | ||
742 | schedstat_set(curr->se.statistics.exec_max, | 731 | schedstat_set(curr->se.statistics.exec_max, |
743 | max(curr->se.statistics.exec_max, delta_exec)); | 732 | max(curr->se.statistics.exec_max, delta_exec)); |
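update_curr_dl() now kicks cpufreq through cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_DL) instead of cpufreq_trigger_update(rq_clock(rq)). The wrapper referenced by the new comment lives in kernel/sched/sched.h and is not shown in this diff; a plausible shape, with the details treated as assumptions:

/* Assumed helpers (kernel/sched/sched.h, not in this diff): */
static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
{
	struct update_util_data *data;

	data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data));
	if (data)
		data->func(data, rq_clock(rq), flags);
}

static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags)
{
	/* Only poke the governor for the local CPU's runqueue. */
	if (cpu_of(rq) == smp_processor_id())
		cpufreq_update_util(rq, flags);
}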
@@ -798,7 +787,7 @@ static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) | |||
798 | if (dl_rq->earliest_dl.curr == 0 || | 787 | if (dl_rq->earliest_dl.curr == 0 || |
799 | dl_time_before(deadline, dl_rq->earliest_dl.curr)) { | 788 | dl_time_before(deadline, dl_rq->earliest_dl.curr)) { |
800 | dl_rq->earliest_dl.curr = deadline; | 789 | dl_rq->earliest_dl.curr = deadline; |
801 | cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1); | 790 | cpudl_set(&rq->rd->cpudl, rq->cpu, deadline); |
802 | } | 791 | } |
803 | } | 792 | } |
804 | 793 | ||
@@ -813,14 +802,14 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) | |||
813 | if (!dl_rq->dl_nr_running) { | 802 | if (!dl_rq->dl_nr_running) { |
814 | dl_rq->earliest_dl.curr = 0; | 803 | dl_rq->earliest_dl.curr = 0; |
815 | dl_rq->earliest_dl.next = 0; | 804 | dl_rq->earliest_dl.next = 0; |
816 | cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); | 805 | cpudl_clear(&rq->rd->cpudl, rq->cpu); |
817 | } else { | 806 | } else { |
818 | struct rb_node *leftmost = dl_rq->rb_leftmost; | 807 | struct rb_node *leftmost = dl_rq->rb_leftmost; |
819 | struct sched_dl_entity *entry; | 808 | struct sched_dl_entity *entry; |
820 | 809 | ||
821 | entry = rb_entry(leftmost, struct sched_dl_entity, rb_node); | 810 | entry = rb_entry(leftmost, struct sched_dl_entity, rb_node); |
822 | dl_rq->earliest_dl.curr = entry->deadline; | 811 | dl_rq->earliest_dl.curr = entry->deadline; |
823 | cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline, 1); | 812 | cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline); |
824 | } | 813 | } |
825 | } | 814 | } |
826 | 815 | ||
@@ -1671,7 +1660,7 @@ static void rq_online_dl(struct rq *rq) | |||
1671 | 1660 | ||
1672 | cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu); | 1661 | cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu); |
1673 | if (rq->dl.dl_nr_running > 0) | 1662 | if (rq->dl.dl_nr_running > 0) |
1674 | cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1); | 1663 | cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr); |
1675 | } | 1664 | } |
1676 | 1665 | ||
1677 | /* Assumes rq->lock is held */ | 1666 | /* Assumes rq->lock is held */ |
@@ -1680,7 +1669,7 @@ static void rq_offline_dl(struct rq *rq) | |||
1680 | if (rq->dl.overloaded) | 1669 | if (rq->dl.overloaded) |
1681 | dl_clear_overload(rq); | 1670 | dl_clear_overload(rq); |
1682 | 1671 | ||
1683 | cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); | 1672 | cpudl_clear(&rq->rd->cpudl, rq->cpu); |
1684 | cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu); | 1673 | cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu); |
1685 | } | 1674 | } |
1686 | 1675 | ||
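The old cpudl_set(cp, cpu, dl, is_valid) interface is split so that invalidation is its own call, which is what the converted call sites above rely on. The cpudeadline.c side is not in these hunks, so treat the prototypes below as assumed; dropping the overloaded is_valid argument makes each call site self-describing.

/* Assumed prototypes after the split (kernel/sched/cpudeadline.h): */
void cpudl_set(struct cpudl *cp, int cpu, u64 dl);	/* set/update a CPU's earliest deadline */
void cpudl_clear(struct cpudl *cp, int cpu);		/* replaces cpudl_set(..., 0, 0) */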
@@ -1723,10 +1712,20 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) | |||
1723 | */ | 1712 | */ |
1724 | static void switched_to_dl(struct rq *rq, struct task_struct *p) | 1713 | static void switched_to_dl(struct rq *rq, struct task_struct *p) |
1725 | { | 1714 | { |
1715 | |||
1716 | /* If p is not queued we will update its parameters at next wakeup. */ | ||
1717 | if (!task_on_rq_queued(p)) | ||
1718 | return; | ||
1719 | |||
1720 | /* | ||
1721 | * If p is boosted we already updated its params in | ||
1722 | * rt_mutex_setprio()->enqueue_task(..., ENQUEUE_REPLENISH), | ||
1723 | * p's deadline being now already after rq_clock(rq). | ||
1724 | */ | ||
1726 | if (dl_time_before(p->dl.deadline, rq_clock(rq))) | 1725 | if (dl_time_before(p->dl.deadline, rq_clock(rq))) |
1727 | setup_new_dl_entity(&p->dl, &p->dl); | 1726 | setup_new_dl_entity(&p->dl); |
1728 | 1727 | ||
1729 | if (task_on_rq_queued(p) && rq->curr != p) { | 1728 | if (rq->curr != p) { |
1730 | #ifdef CONFIG_SMP | 1729 | #ifdef CONFIG_SMP |
1731 | if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded) | 1730 | if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded) |
1732 | queue_push_tasks(rq); | 1731 | queue_push_tasks(rq); |
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 2a0a9995256d..fa178b62ea79 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c | |||
@@ -369,8 +369,12 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group | |||
369 | 369 | ||
370 | #define P(F) \ | 370 | #define P(F) \ |
371 | SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) | 371 | SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) |
372 | #define P_SCHEDSTAT(F) \ | ||
373 | SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F)) | ||
372 | #define PN(F) \ | 374 | #define PN(F) \ |
373 | SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) | 375 | SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) |
376 | #define PN_SCHEDSTAT(F) \ | ||
377 | SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F))) | ||
374 | 378 | ||
375 | if (!se) | 379 | if (!se) |
376 | return; | 380 | return; |
@@ -378,26 +382,27 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group | |||
378 | PN(se->exec_start); | 382 | PN(se->exec_start); |
379 | PN(se->vruntime); | 383 | PN(se->vruntime); |
380 | PN(se->sum_exec_runtime); | 384 | PN(se->sum_exec_runtime); |
381 | #ifdef CONFIG_SCHEDSTATS | ||
382 | if (schedstat_enabled()) { | 385 | if (schedstat_enabled()) { |
383 | PN(se->statistics.wait_start); | 386 | PN_SCHEDSTAT(se->statistics.wait_start); |
384 | PN(se->statistics.sleep_start); | 387 | PN_SCHEDSTAT(se->statistics.sleep_start); |
385 | PN(se->statistics.block_start); | 388 | PN_SCHEDSTAT(se->statistics.block_start); |
386 | PN(se->statistics.sleep_max); | 389 | PN_SCHEDSTAT(se->statistics.sleep_max); |
387 | PN(se->statistics.block_max); | 390 | PN_SCHEDSTAT(se->statistics.block_max); |
388 | PN(se->statistics.exec_max); | 391 | PN_SCHEDSTAT(se->statistics.exec_max); |
389 | PN(se->statistics.slice_max); | 392 | PN_SCHEDSTAT(se->statistics.slice_max); |
390 | PN(se->statistics.wait_max); | 393 | PN_SCHEDSTAT(se->statistics.wait_max); |
391 | PN(se->statistics.wait_sum); | 394 | PN_SCHEDSTAT(se->statistics.wait_sum); |
392 | P(se->statistics.wait_count); | 395 | P_SCHEDSTAT(se->statistics.wait_count); |
393 | } | 396 | } |
394 | #endif | ||
395 | P(se->load.weight); | 397 | P(se->load.weight); |
396 | #ifdef CONFIG_SMP | 398 | #ifdef CONFIG_SMP |
397 | P(se->avg.load_avg); | 399 | P(se->avg.load_avg); |
398 | P(se->avg.util_avg); | 400 | P(se->avg.util_avg); |
399 | #endif | 401 | #endif |
402 | |||
403 | #undef PN_SCHEDSTAT | ||
400 | #undef PN | 404 | #undef PN |
405 | #undef P_SCHEDSTAT | ||
401 | #undef P | 406 | #undef P |
402 | } | 407 | } |
403 | #endif | 408 | #endif |
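P_SCHEDSTAT()/PN_SCHEDSTAT() lean on schedstat_val() (and print_task() below on schedstat_val_or_zero()) so the statistics fields can be named unconditionally, without the CONFIG_SCHEDSTATS #ifdef this hunk removes. The macro definitions live in kernel/sched/stats.h and are not part of this diff; they presumably reduce to something like the sketch below.

/* Assumed definitions (kernel/sched/stats.h), shown for context only: */
#ifdef CONFIG_SCHEDSTATS
# define schedstat_val(var)		(var)
# define schedstat_val_or_zero(var)	((schedstat_enabled()) ? (var) : 0)
#else /* !CONFIG_SCHEDSTATS */
# define schedstat_val(var)		0
# define schedstat_val_or_zero(var)	0
#endif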
@@ -410,7 +415,8 @@ static char *task_group_path(struct task_group *tg) | |||
410 | if (autogroup_path(tg, group_path, PATH_MAX)) | 415 | if (autogroup_path(tg, group_path, PATH_MAX)) |
411 | return group_path; | 416 | return group_path; |
412 | 417 | ||
413 | return cgroup_path(tg->css.cgroup, group_path, PATH_MAX); | 418 | cgroup_path(tg->css.cgroup, group_path, PATH_MAX); |
419 | return group_path; | ||
414 | } | 420 | } |
415 | #endif | 421 | #endif |
416 | 422 | ||
@@ -429,9 +435,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) | |||
429 | p->prio); | 435 | p->prio); |
430 | 436 | ||
431 | SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", | 437 | SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", |
432 | SPLIT_NS(schedstat_val(p, se.statistics.wait_sum)), | 438 | SPLIT_NS(schedstat_val_or_zero(p->se.statistics.wait_sum)), |
433 | SPLIT_NS(p->se.sum_exec_runtime), | 439 | SPLIT_NS(p->se.sum_exec_runtime), |
434 | SPLIT_NS(schedstat_val(p, se.statistics.sum_sleep_runtime))); | 440 | SPLIT_NS(schedstat_val_or_zero(p->se.statistics.sum_sleep_runtime))); |
435 | 441 | ||
436 | #ifdef CONFIG_NUMA_BALANCING | 442 | #ifdef CONFIG_NUMA_BALANCING |
437 | SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); | 443 | SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); |
@@ -626,9 +632,7 @@ do { \ | |||
626 | #undef P64 | 632 | #undef P64 |
627 | #endif | 633 | #endif |
628 | 634 | ||
629 | #ifdef CONFIG_SCHEDSTATS | 635 | #define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, schedstat_val(rq->n)); |
630 | #define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); | ||
631 | |||
632 | if (schedstat_enabled()) { | 636 | if (schedstat_enabled()) { |
633 | P(yld_count); | 637 | P(yld_count); |
634 | P(sched_count); | 638 | P(sched_count); |
@@ -636,9 +640,8 @@ do { \ | |||
636 | P(ttwu_count); | 640 | P(ttwu_count); |
637 | P(ttwu_local); | 641 | P(ttwu_local); |
638 | } | 642 | } |
639 | |||
640 | #undef P | 643 | #undef P |
641 | #endif | 644 | |
642 | spin_lock_irqsave(&sched_debug_lock, flags); | 645 | spin_lock_irqsave(&sched_debug_lock, flags); |
643 | print_cfs_stats(m, cpu); | 646 | print_cfs_stats(m, cpu); |
644 | print_rt_stats(m, cpu); | 647 | print_rt_stats(m, cpu); |
@@ -868,10 +871,14 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
868 | SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) | 871 | SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) |
869 | #define P(F) \ | 872 | #define P(F) \ |
870 | SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) | 873 | SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) |
874 | #define P_SCHEDSTAT(F) \ | ||
875 | SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)schedstat_val(p->F)) | ||
871 | #define __PN(F) \ | 876 | #define __PN(F) \ |
872 | SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) | 877 | SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) |
873 | #define PN(F) \ | 878 | #define PN(F) \ |
874 | SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) | 879 | SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) |
880 | #define PN_SCHEDSTAT(F) \ | ||
881 | SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(p->F))) | ||
875 | 882 | ||
876 | PN(se.exec_start); | 883 | PN(se.exec_start); |
877 | PN(se.vruntime); | 884 | PN(se.vruntime); |
@@ -881,37 +888,36 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
881 | 888 | ||
882 | P(se.nr_migrations); | 889 | P(se.nr_migrations); |
883 | 890 | ||
884 | #ifdef CONFIG_SCHEDSTATS | ||
885 | if (schedstat_enabled()) { | 891 | if (schedstat_enabled()) { |
886 | u64 avg_atom, avg_per_cpu; | 892 | u64 avg_atom, avg_per_cpu; |
887 | 893 | ||
888 | PN(se.statistics.sum_sleep_runtime); | 894 | PN_SCHEDSTAT(se.statistics.sum_sleep_runtime); |
889 | PN(se.statistics.wait_start); | 895 | PN_SCHEDSTAT(se.statistics.wait_start); |
890 | PN(se.statistics.sleep_start); | 896 | PN_SCHEDSTAT(se.statistics.sleep_start); |
891 | PN(se.statistics.block_start); | 897 | PN_SCHEDSTAT(se.statistics.block_start); |
892 | PN(se.statistics.sleep_max); | 898 | PN_SCHEDSTAT(se.statistics.sleep_max); |
893 | PN(se.statistics.block_max); | 899 | PN_SCHEDSTAT(se.statistics.block_max); |
894 | PN(se.statistics.exec_max); | 900 | PN_SCHEDSTAT(se.statistics.exec_max); |
895 | PN(se.statistics.slice_max); | 901 | PN_SCHEDSTAT(se.statistics.slice_max); |
896 | PN(se.statistics.wait_max); | 902 | PN_SCHEDSTAT(se.statistics.wait_max); |
897 | PN(se.statistics.wait_sum); | 903 | PN_SCHEDSTAT(se.statistics.wait_sum); |
898 | P(se.statistics.wait_count); | 904 | P_SCHEDSTAT(se.statistics.wait_count); |
899 | PN(se.statistics.iowait_sum); | 905 | PN_SCHEDSTAT(se.statistics.iowait_sum); |
900 | P(se.statistics.iowait_count); | 906 | P_SCHEDSTAT(se.statistics.iowait_count); |
901 | P(se.statistics.nr_migrations_cold); | 907 | P_SCHEDSTAT(se.statistics.nr_migrations_cold); |
902 | P(se.statistics.nr_failed_migrations_affine); | 908 | P_SCHEDSTAT(se.statistics.nr_failed_migrations_affine); |
903 | P(se.statistics.nr_failed_migrations_running); | 909 | P_SCHEDSTAT(se.statistics.nr_failed_migrations_running); |
904 | P(se.statistics.nr_failed_migrations_hot); | 910 | P_SCHEDSTAT(se.statistics.nr_failed_migrations_hot); |
905 | P(se.statistics.nr_forced_migrations); | 911 | P_SCHEDSTAT(se.statistics.nr_forced_migrations); |
906 | P(se.statistics.nr_wakeups); | 912 | P_SCHEDSTAT(se.statistics.nr_wakeups); |
907 | P(se.statistics.nr_wakeups_sync); | 913 | P_SCHEDSTAT(se.statistics.nr_wakeups_sync); |
908 | P(se.statistics.nr_wakeups_migrate); | 914 | P_SCHEDSTAT(se.statistics.nr_wakeups_migrate); |
909 | P(se.statistics.nr_wakeups_local); | 915 | P_SCHEDSTAT(se.statistics.nr_wakeups_local); |
910 | P(se.statistics.nr_wakeups_remote); | 916 | P_SCHEDSTAT(se.statistics.nr_wakeups_remote); |
911 | P(se.statistics.nr_wakeups_affine); | 917 | P_SCHEDSTAT(se.statistics.nr_wakeups_affine); |
912 | P(se.statistics.nr_wakeups_affine_attempts); | 918 | P_SCHEDSTAT(se.statistics.nr_wakeups_affine_attempts); |
913 | P(se.statistics.nr_wakeups_passive); | 919 | P_SCHEDSTAT(se.statistics.nr_wakeups_passive); |
914 | P(se.statistics.nr_wakeups_idle); | 920 | P_SCHEDSTAT(se.statistics.nr_wakeups_idle); |
915 | 921 | ||
916 | avg_atom = p->se.sum_exec_runtime; | 922 | avg_atom = p->se.sum_exec_runtime; |
917 | if (nr_switches) | 923 | if (nr_switches) |
@@ -930,7 +936,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
930 | __PN(avg_atom); | 936 | __PN(avg_atom); |
931 | __PN(avg_per_cpu); | 937 | __PN(avg_per_cpu); |
932 | } | 938 | } |
933 | #endif | 939 | |
934 | __P(nr_switches); | 940 | __P(nr_switches); |
935 | SEQ_printf(m, "%-45s:%21Ld\n", | 941 | SEQ_printf(m, "%-45s:%21Ld\n", |
936 | "nr_voluntary_switches", (long long)p->nvcsw); | 942 | "nr_voluntary_switches", (long long)p->nvcsw); |
@@ -947,8 +953,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
947 | #endif | 953 | #endif |
948 | P(policy); | 954 | P(policy); |
949 | P(prio); | 955 | P(prio); |
956 | #undef PN_SCHEDSTAT | ||
950 | #undef PN | 957 | #undef PN |
951 | #undef __PN | 958 | #undef __PN |
959 | #undef P_SCHEDSTAT | ||
952 | #undef P | 960 | #undef P |
953 | #undef __P | 961 | #undef __P |
954 | 962 | ||
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 039de34f1521..c242944f5cbd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -114,6 +114,12 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; | |||
114 | unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; | 114 | unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; |
115 | #endif | 115 | #endif |
116 | 116 | ||
117 | /* | ||
118 | * The margin used when comparing utilization with CPU capacity: | ||
119 | * util * margin < capacity * 1024 | ||
120 | */ | ||
121 | unsigned int capacity_margin = 1280; /* ~20% */ | ||
122 | |||
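capacity_margin is a fixed-point factor (1280/1024 = 1.25), so a utilization value is treated as fitting a CPU only while it stays below roughly 80% of that CPU's capacity; for a capacity of 1024 the cutoff is at about 819. An illustrative check follows; the helper name is made up for the example and is not introduced by this patch.

/* Illustration only: "fits" while util stays below ~80% of capacity. */
static inline bool util_fits_capacity(unsigned long util, unsigned long capacity)
{
	return util * capacity_margin < capacity * 1024;
}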
117 | static inline void update_load_add(struct load_weight *lw, unsigned long inc) | 123 | static inline void update_load_add(struct load_weight *lw, unsigned long inc) |
118 | { | 124 | { |
119 | lw->weight += inc; | 125 | lw->weight += inc; |
@@ -256,9 +262,7 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) | |||
256 | 262 | ||
257 | static inline struct task_struct *task_of(struct sched_entity *se) | 263 | static inline struct task_struct *task_of(struct sched_entity *se) |
258 | { | 264 | { |
259 | #ifdef CONFIG_SCHED_DEBUG | 265 | SCHED_WARN_ON(!entity_is_task(se)); |
260 | WARN_ON_ONCE(!entity_is_task(se)); | ||
261 | #endif | ||
262 | return container_of(se, struct task_struct, se); | 266 | return container_of(se, struct task_struct, se); |
263 | } | 267 | } |
264 | 268 | ||
@@ -456,17 +460,23 @@ static inline int entity_before(struct sched_entity *a, | |||
456 | 460 | ||
457 | static void update_min_vruntime(struct cfs_rq *cfs_rq) | 461 | static void update_min_vruntime(struct cfs_rq *cfs_rq) |
458 | { | 462 | { |
463 | struct sched_entity *curr = cfs_rq->curr; | ||
464 | |||
459 | u64 vruntime = cfs_rq->min_vruntime; | 465 | u64 vruntime = cfs_rq->min_vruntime; |
460 | 466 | ||
461 | if (cfs_rq->curr) | 467 | if (curr) { |
462 | vruntime = cfs_rq->curr->vruntime; | 468 | if (curr->on_rq) |
469 | vruntime = curr->vruntime; | ||
470 | else | ||
471 | curr = NULL; | ||
472 | } | ||
463 | 473 | ||
464 | if (cfs_rq->rb_leftmost) { | 474 | if (cfs_rq->rb_leftmost) { |
465 | struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost, | 475 | struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost, |
466 | struct sched_entity, | 476 | struct sched_entity, |
467 | run_node); | 477 | run_node); |
468 | 478 | ||
469 | if (!cfs_rq->curr) | 479 | if (!curr) |
470 | vruntime = se->vruntime; | 480 | vruntime = se->vruntime; |
471 | else | 481 | else |
472 | vruntime = min_vruntime(vruntime, se->vruntime); | 482 | vruntime = min_vruntime(vruntime, se->vruntime); |
@@ -656,7 +666,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
656 | } | 666 | } |
657 | 667 | ||
658 | #ifdef CONFIG_SMP | 668 | #ifdef CONFIG_SMP |
659 | static int select_idle_sibling(struct task_struct *p, int cpu); | 669 | static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); |
660 | static unsigned long task_h_load(struct task_struct *p); | 670 | static unsigned long task_h_load(struct task_struct *p); |
661 | 671 | ||
662 | /* | 672 | /* |
@@ -680,7 +690,14 @@ void init_entity_runnable_average(struct sched_entity *se) | |||
680 | * will definitely be updated (after enqueue). | 690 | * will definitely be updated (after enqueue). |
681 | */ | 691 | */ |
682 | sa->period_contrib = 1023; | 692 | sa->period_contrib = 1023; |
683 | sa->load_avg = scale_load_down(se->load.weight); | 693 | /* |
694 | * Tasks are initialized with full load to be seen as heavy tasks until | ||
695 | * they get a chance to stabilize to their real load level. | ||
696 | * Group entities are initialized with zero load to reflect the fact that | ||
697 | * nothing has been attached to the task group yet. | ||
698 | */ | ||
699 | if (entity_is_task(se)) | ||
700 | sa->load_avg = scale_load_down(se->load.weight); | ||
684 | sa->load_sum = sa->load_avg * LOAD_AVG_MAX; | 701 | sa->load_sum = sa->load_avg * LOAD_AVG_MAX; |
685 | /* | 702 | /* |
686 | * At this point, util_avg won't be used in select_task_rq_fair anyway | 703 | * At this point, util_avg won't be used in select_task_rq_fair anyway |
@@ -726,7 +743,6 @@ void post_init_entity_util_avg(struct sched_entity *se) | |||
726 | struct sched_avg *sa = &se->avg; | 743 | struct sched_avg *sa = &se->avg; |
727 | long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; | 744 | long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; |
728 | u64 now = cfs_rq_clock_task(cfs_rq); | 745 | u64 now = cfs_rq_clock_task(cfs_rq); |
729 | int tg_update; | ||
730 | 746 | ||
731 | if (cap > 0) { | 747 | if (cap > 0) { |
732 | if (cfs_rq->avg.util_avg != 0) { | 748 | if (cfs_rq->avg.util_avg != 0) { |
@@ -759,10 +775,9 @@ void post_init_entity_util_avg(struct sched_entity *se) | |||
759 | } | 775 | } |
760 | } | 776 | } |
761 | 777 | ||
762 | tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); | 778 | update_cfs_rq_load_avg(now, cfs_rq, false); |
763 | attach_entity_load_avg(cfs_rq, se); | 779 | attach_entity_load_avg(cfs_rq, se); |
764 | if (tg_update) | 780 | update_tg_load_avg(cfs_rq, false); |
765 | update_tg_load_avg(cfs_rq, false); | ||
766 | } | 781 | } |
767 | 782 | ||
768 | #else /* !CONFIG_SMP */ | 783 | #else /* !CONFIG_SMP */ |
@@ -799,7 +814,7 @@ static void update_curr(struct cfs_rq *cfs_rq) | |||
799 | max(delta_exec, curr->statistics.exec_max)); | 814 | max(delta_exec, curr->statistics.exec_max)); |
800 | 815 | ||
801 | curr->sum_exec_runtime += delta_exec; | 816 | curr->sum_exec_runtime += delta_exec; |
802 | schedstat_add(cfs_rq, exec_clock, delta_exec); | 817 | schedstat_add(cfs_rq->exec_clock, delta_exec); |
803 | 818 | ||
804 | curr->vruntime += calc_delta_fair(delta_exec, curr); | 819 | curr->vruntime += calc_delta_fair(delta_exec, curr); |
805 | update_min_vruntime(cfs_rq); | 820 | update_min_vruntime(cfs_rq); |
@@ -820,26 +835,34 @@ static void update_curr_fair(struct rq *rq) | |||
820 | update_curr(cfs_rq_of(&rq->curr->se)); | 835 | update_curr(cfs_rq_of(&rq->curr->se)); |
821 | } | 836 | } |
822 | 837 | ||
823 | #ifdef CONFIG_SCHEDSTATS | ||
824 | static inline void | 838 | static inline void |
825 | update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) | 839 | update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) |
826 | { | 840 | { |
827 | u64 wait_start = rq_clock(rq_of(cfs_rq)); | 841 | u64 wait_start, prev_wait_start; |
842 | |||
843 | if (!schedstat_enabled()) | ||
844 | return; | ||
845 | |||
846 | wait_start = rq_clock(rq_of(cfs_rq)); | ||
847 | prev_wait_start = schedstat_val(se->statistics.wait_start); | ||
828 | 848 | ||
829 | if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) && | 849 | if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) && |
830 | likely(wait_start > se->statistics.wait_start)) | 850 | likely(wait_start > prev_wait_start)) |
831 | wait_start -= se->statistics.wait_start; | 851 | wait_start -= prev_wait_start; |
832 | 852 | ||
833 | se->statistics.wait_start = wait_start; | 853 | schedstat_set(se->statistics.wait_start, wait_start); |
834 | } | 854 | } |
835 | 855 | ||
836 | static void | 856 | static inline void |
837 | update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) | 857 | update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) |
838 | { | 858 | { |
839 | struct task_struct *p; | 859 | struct task_struct *p; |
840 | u64 delta; | 860 | u64 delta; |
841 | 861 | ||
842 | delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start; | 862 | if (!schedstat_enabled()) |
863 | return; | ||
864 | |||
865 | delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start); | ||
843 | 866 | ||
844 | if (entity_is_task(se)) { | 867 | if (entity_is_task(se)) { |
845 | p = task_of(se); | 868 | p = task_of(se); |
@@ -849,35 +872,114 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
849 | * time stamp can be adjusted to accumulate wait time | 872 | * time stamp can be adjusted to accumulate wait time |
850 | * prior to migration. | 873 | * prior to migration. |
851 | */ | 874 | */ |
852 | se->statistics.wait_start = delta; | 875 | schedstat_set(se->statistics.wait_start, delta); |
853 | return; | 876 | return; |
854 | } | 877 | } |
855 | trace_sched_stat_wait(p, delta); | 878 | trace_sched_stat_wait(p, delta); |
856 | } | 879 | } |
857 | 880 | ||
858 | se->statistics.wait_max = max(se->statistics.wait_max, delta); | 881 | schedstat_set(se->statistics.wait_max, |
859 | se->statistics.wait_count++; | 882 | max(schedstat_val(se->statistics.wait_max), delta)); |
860 | se->statistics.wait_sum += delta; | 883 | schedstat_inc(se->statistics.wait_count); |
861 | se->statistics.wait_start = 0; | 884 | schedstat_add(se->statistics.wait_sum, delta); |
885 | schedstat_set(se->statistics.wait_start, 0); | ||
886 | } | ||
887 | |||
888 | static inline void | ||
889 | update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
890 | { | ||
891 | struct task_struct *tsk = NULL; | ||
892 | u64 sleep_start, block_start; | ||
893 | |||
894 | if (!schedstat_enabled()) | ||
895 | return; | ||
896 | |||
897 | sleep_start = schedstat_val(se->statistics.sleep_start); | ||
898 | block_start = schedstat_val(se->statistics.block_start); | ||
899 | |||
900 | if (entity_is_task(se)) | ||
901 | tsk = task_of(se); | ||
902 | |||
903 | if (sleep_start) { | ||
904 | u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start; | ||
905 | |||
906 | if ((s64)delta < 0) | ||
907 | delta = 0; | ||
908 | |||
909 | if (unlikely(delta > schedstat_val(se->statistics.sleep_max))) | ||
910 | schedstat_set(se->statistics.sleep_max, delta); | ||
911 | |||
912 | schedstat_set(se->statistics.sleep_start, 0); | ||
913 | schedstat_add(se->statistics.sum_sleep_runtime, delta); | ||
914 | |||
915 | if (tsk) { | ||
916 | account_scheduler_latency(tsk, delta >> 10, 1); | ||
917 | trace_sched_stat_sleep(tsk, delta); | ||
918 | } | ||
919 | } | ||
920 | if (block_start) { | ||
921 | u64 delta = rq_clock(rq_of(cfs_rq)) - block_start; | ||
922 | |||
923 | if ((s64)delta < 0) | ||
924 | delta = 0; | ||
925 | |||
926 | if (unlikely(delta > schedstat_val(se->statistics.block_max))) | ||
927 | schedstat_set(se->statistics.block_max, delta); | ||
928 | |||
929 | schedstat_set(se->statistics.block_start, 0); | ||
930 | schedstat_add(se->statistics.sum_sleep_runtime, delta); | ||
931 | |||
932 | if (tsk) { | ||
933 | if (tsk->in_iowait) { | ||
934 | schedstat_add(se->statistics.iowait_sum, delta); | ||
935 | schedstat_inc(se->statistics.iowait_count); | ||
936 | trace_sched_stat_iowait(tsk, delta); | ||
937 | } | ||
938 | |||
939 | trace_sched_stat_blocked(tsk, delta); | ||
940 | |||
941 | /* | ||
942 | * Blocking time is in units of nanosecs, so shift by | ||
943 | * 20 to get a milliseconds-range estimation of the | ||
944 | * amount of time that the task spent sleeping: | ||
945 | */ | ||
946 | if (unlikely(prof_on == SLEEP_PROFILING)) { | ||
947 | profile_hits(SLEEP_PROFILING, | ||
948 | (void *)get_wchan(tsk), | ||
949 | delta >> 20); | ||
950 | } | ||
951 | account_scheduler_latency(tsk, delta >> 10, 0); | ||
952 | } | ||
953 | } | ||
862 | } | 954 | } |
863 | 955 | ||
864 | /* | 956 | /* |
865 | * Task is being enqueued - update stats: | 957 | * Task is being enqueued - update stats: |
866 | */ | 958 | */ |
867 | static inline void | 959 | static inline void |
868 | update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | 960 | update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) |
869 | { | 961 | { |
962 | if (!schedstat_enabled()) | ||
963 | return; | ||
964 | |||
870 | /* | 965 | /* |
871 | * Are we enqueueing a waiting task? (for current tasks | 966 | * Are we enqueueing a waiting task? (for current tasks |
872 | * a dequeue/enqueue event is a NOP) | 967 | * a dequeue/enqueue event is a NOP) |
873 | */ | 968 | */ |
874 | if (se != cfs_rq->curr) | 969 | if (se != cfs_rq->curr) |
875 | update_stats_wait_start(cfs_rq, se); | 970 | update_stats_wait_start(cfs_rq, se); |
971 | |||
972 | if (flags & ENQUEUE_WAKEUP) | ||
973 | update_stats_enqueue_sleeper(cfs_rq, se); | ||
876 | } | 974 | } |
877 | 975 | ||
878 | static inline void | 976 | static inline void |
879 | update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | 977 | update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) |
880 | { | 978 | { |
979 | |||
980 | if (!schedstat_enabled()) | ||
981 | return; | ||
982 | |||
881 | /* | 983 | /* |
882 | * Mark the end of the wait period if dequeueing a | 984 | * Mark the end of the wait period if dequeueing a |
883 | * waiting task: | 985 | * waiting task: |
@@ -885,40 +987,18 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
885 | if (se != cfs_rq->curr) | 987 | if (se != cfs_rq->curr) |
886 | update_stats_wait_end(cfs_rq, se); | 988 | update_stats_wait_end(cfs_rq, se); |
887 | 989 | ||
888 | if (flags & DEQUEUE_SLEEP) { | 990 | if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) { |
889 | if (entity_is_task(se)) { | 991 | struct task_struct *tsk = task_of(se); |
890 | struct task_struct *tsk = task_of(se); | ||
891 | 992 | ||
892 | if (tsk->state & TASK_INTERRUPTIBLE) | 993 | if (tsk->state & TASK_INTERRUPTIBLE) |
893 | se->statistics.sleep_start = rq_clock(rq_of(cfs_rq)); | 994 | schedstat_set(se->statistics.sleep_start, |
894 | if (tsk->state & TASK_UNINTERRUPTIBLE) | 995 | rq_clock(rq_of(cfs_rq))); |
895 | se->statistics.block_start = rq_clock(rq_of(cfs_rq)); | 996 | if (tsk->state & TASK_UNINTERRUPTIBLE) |
896 | } | 997 | schedstat_set(se->statistics.block_start, |
998 | rq_clock(rq_of(cfs_rq))); | ||
897 | } | 999 | } |
898 | |||
899 | } | ||
900 | #else | ||
901 | static inline void | ||
902 | update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
903 | { | ||
904 | } | 1000 | } |
905 | 1001 | ||
906 | static inline void | ||
907 | update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
908 | { | ||
909 | } | ||
910 | |||
911 | static inline void | ||
912 | update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
913 | { | ||
914 | } | ||
915 | |||
916 | static inline void | ||
917 | update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | ||
918 | { | ||
919 | } | ||
920 | #endif | ||
921 | |||
922 | /* | 1002 | /* |
923 | * We are picking a new current task - update its stats: | 1003 | * We are picking a new current task - update its stats: |
924 | */ | 1004 | */ |
@@ -1513,8 +1593,16 @@ balance: | |||
1513 | * One idle CPU per node is evaluated for a task numa move. | 1593 | * One idle CPU per node is evaluated for a task numa move. |
1514 | * Call select_idle_sibling to maybe find a better one. | 1594 | * Call select_idle_sibling to maybe find a better one. |
1515 | */ | 1595 | */ |
1516 | if (!cur) | 1596 | if (!cur) { |
1517 | env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); | 1597 | /* |
1598 | * select_idle_siblings() uses a per-cpu cpumask that | ||
1599 | * can also be used from IRQ context, so disable IRQs here. | ||
1600 | */ | ||
1601 | local_irq_disable(); | ||
1602 | env->dst_cpu = select_idle_sibling(env->p, env->src_cpu, | ||
1603 | env->dst_cpu); | ||
1604 | local_irq_enable(); | ||
1605 | } | ||
1518 | 1606 | ||
1519 | assign: | 1607 | assign: |
1520 | task_numa_assign(env, cur, imp); | 1608 | task_numa_assign(env, cur, imp); |
@@ -2292,7 +2380,7 @@ void task_numa_work(struct callback_head *work) | |||
2292 | unsigned long nr_pte_updates = 0; | 2380 | unsigned long nr_pte_updates = 0; |
2293 | long pages, virtpages; | 2381 | long pages, virtpages; |
2294 | 2382 | ||
2295 | WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); | 2383 | SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work)); |
2296 | 2384 | ||
2297 | work->next = work; /* protect against double add */ | 2385 | work->next = work; /* protect against double add */ |
2298 | /* | 2386 | /* |
@@ -2803,9 +2891,21 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, | |||
2803 | } | 2891 | } |
2804 | 2892 | ||
2805 | #ifdef CONFIG_FAIR_GROUP_SCHED | 2893 | #ifdef CONFIG_FAIR_GROUP_SCHED |
2806 | /* | 2894 | /** |
2807 | * Updating tg's load_avg is necessary before update_cfs_share (which is done) | 2895 | * update_tg_load_avg - update the tg's load avg |
2808 | * and effective_load (which is not done because it is too costly). | 2896 | * @cfs_rq: the cfs_rq whose avg changed |
2897 | * @force: update regardless of how small the difference | ||
2898 | * | ||
2899 | * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load. | ||
2900 | * However, because tg->load_avg is a global value there are performance | ||
2901 | * considerations. | ||
2902 | * | ||
2903 | * In order to avoid having to look at the other cfs_rq's, we use a | ||
2904 | * differential update where we store the last value we propagated. This in | ||
2905 | * turn allows skipping updates if the differential is 'small'. | ||
2906 | * | ||
2907 | * Updating tg's load_avg is necessary before update_cfs_share() (which is | ||
2908 | * done) and effective_load() (which is not done because it is too costly). | ||
2809 | */ | 2909 | */ |
2810 | static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) | 2910 | static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) |
2811 | { | 2911 | { |
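The kernel-doc above describes a differential update, but the function body is outside this hunk. The scheme it describes amounts to the sketch below, where tg_load_avg_contrib caches the last value folded into the shared tg->load_avg; the exact 1/64 threshold is an assumption for illustration.

/* Sketch of the differential update described above (body not in this hunk): */
static inline void update_tg_load_avg_sketch(struct cfs_rq *cfs_rq, int force)
{
	long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;

	/* Only touch the shared tg->load_avg when the change is meaningful. */
	if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
		atomic_long_add(delta, &cfs_rq->tg->load_avg);
		cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
	}
}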
@@ -2875,12 +2975,7 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} | |||
2875 | 2975 | ||
2876 | static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) | 2976 | static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) |
2877 | { | 2977 | { |
2878 | struct rq *rq = rq_of(cfs_rq); | 2978 | if (&this_rq()->cfs == cfs_rq) { |
2879 | int cpu = cpu_of(rq); | ||
2880 | |||
2881 | if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) { | ||
2882 | unsigned long max = rq->cpu_capacity_orig; | ||
2883 | |||
2884 | /* | 2979 | /* |
2885 | * There are a few boundary cases this might miss but it should | 2980 | * There are a few boundary cases this might miss but it should |
2886 | * get called often enough that that should (hopefully) not be | 2981 | * get called often enough that that should (hopefully) not be |
@@ -2897,8 +2992,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) | |||
2897 | * | 2992 | * |
2898 | * See cpu_util(). | 2993 | * See cpu_util(). |
2899 | */ | 2994 | */ |
2900 | cpufreq_update_util(rq_clock(rq), | 2995 | cpufreq_update_util(rq_of(cfs_rq), 0); |
2901 | min(cfs_rq->avg.util_avg, max), max); | ||
2902 | } | 2996 | } |
2903 | } | 2997 | } |
2904 | 2998 | ||
@@ -2931,10 +3025,10 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) | |||
2931 | * | 3025 | * |
2932 | * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. | 3026 | * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. |
2933 | * | 3027 | * |
2934 | * Returns true if the load decayed or we removed utilization. It is expected | 3028 | * Returns true if the load decayed or we removed load. |
2935 | * that one calls update_tg_load_avg() on this condition, but after you've | 3029 | * |
2936 | * modified the cfs_rq avg (attach/detach), such that we propagate the new | 3030 | * Since both these conditions indicate a changed cfs_rq->avg.load we should |
2937 | * avg up. | 3031 | * call update_tg_load_avg() when this function returns true. |
2938 | */ | 3032 | */ |
2939 | static inline int | 3033 | static inline int |
2940 | update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) | 3034 | update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) |
@@ -3159,10 +3253,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) | |||
3159 | 3253 | ||
3160 | static inline void update_load_avg(struct sched_entity *se, int not_used) | 3254 | static inline void update_load_avg(struct sched_entity *se, int not_used) |
3161 | { | 3255 | { |
3162 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 3256 | cpufreq_update_util(rq_of(cfs_rq_of(se)), 0); |
3163 | struct rq *rq = rq_of(cfs_rq); | ||
3164 | |||
3165 | cpufreq_trigger_update(rq_clock(rq)); | ||
3166 | } | 3257 | } |
3167 | 3258 | ||
3168 | static inline void | 3259 | static inline void |
@@ -3183,68 +3274,6 @@ static inline int idle_balance(struct rq *rq) | |||
3183 | 3274 | ||
3184 | #endif /* CONFIG_SMP */ | 3275 | #endif /* CONFIG_SMP */ |
3185 | 3276 | ||
3186 | static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
3187 | { | ||
3188 | #ifdef CONFIG_SCHEDSTATS | ||
3189 | struct task_struct *tsk = NULL; | ||
3190 | |||
3191 | if (entity_is_task(se)) | ||
3192 | tsk = task_of(se); | ||
3193 | |||
3194 | if (se->statistics.sleep_start) { | ||
3195 | u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start; | ||
3196 | |||
3197 | if ((s64)delta < 0) | ||
3198 | delta = 0; | ||
3199 | |||
3200 | if (unlikely(delta > se->statistics.sleep_max)) | ||
3201 | se->statistics.sleep_max = delta; | ||
3202 | |||
3203 | se->statistics.sleep_start = 0; | ||
3204 | se->statistics.sum_sleep_runtime += delta; | ||
3205 | |||
3206 | if (tsk) { | ||
3207 | account_scheduler_latency(tsk, delta >> 10, 1); | ||
3208 | trace_sched_stat_sleep(tsk, delta); | ||
3209 | } | ||
3210 | } | ||
3211 | if (se->statistics.block_start) { | ||
3212 | u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start; | ||
3213 | |||
3214 | if ((s64)delta < 0) | ||
3215 | delta = 0; | ||
3216 | |||
3217 | if (unlikely(delta > se->statistics.block_max)) | ||
3218 | se->statistics.block_max = delta; | ||
3219 | |||
3220 | se->statistics.block_start = 0; | ||
3221 | se->statistics.sum_sleep_runtime += delta; | ||
3222 | |||
3223 | if (tsk) { | ||
3224 | if (tsk->in_iowait) { | ||
3225 | se->statistics.iowait_sum += delta; | ||
3226 | se->statistics.iowait_count++; | ||
3227 | trace_sched_stat_iowait(tsk, delta); | ||
3228 | } | ||
3229 | |||
3230 | trace_sched_stat_blocked(tsk, delta); | ||
3231 | |||
3232 | /* | ||
3233 | * Blocking time is in units of nanosecs, so shift by | ||
3234 | * 20 to get a milliseconds-range estimation of the | ||
3235 | * amount of time that the task spent sleeping: | ||
3236 | */ | ||
3237 | if (unlikely(prof_on == SLEEP_PROFILING)) { | ||
3238 | profile_hits(SLEEP_PROFILING, | ||
3239 | (void *)get_wchan(tsk), | ||
3240 | delta >> 20); | ||
3241 | } | ||
3242 | account_scheduler_latency(tsk, delta >> 10, 0); | ||
3243 | } | ||
3244 | } | ||
3245 | #endif | ||
3246 | } | ||
3247 | |||
3248 | static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) | 3277 | static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) |
3249 | { | 3278 | { |
3250 | #ifdef CONFIG_SCHED_DEBUG | 3279 | #ifdef CONFIG_SCHED_DEBUG |
@@ -3254,7 +3283,7 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
3254 | d = -d; | 3283 | d = -d; |
3255 | 3284 | ||
3256 | if (d > 3*sysctl_sched_latency) | 3285 | if (d > 3*sysctl_sched_latency) |
3257 | schedstat_inc(cfs_rq, nr_spread_over); | 3286 | schedstat_inc(cfs_rq->nr_spread_over); |
3258 | #endif | 3287 | #endif |
3259 | } | 3288 | } |
3260 | 3289 | ||
@@ -3371,17 +3400,12 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
3371 | account_entity_enqueue(cfs_rq, se); | 3400 | account_entity_enqueue(cfs_rq, se); |
3372 | update_cfs_shares(cfs_rq); | 3401 | update_cfs_shares(cfs_rq); |
3373 | 3402 | ||
3374 | if (flags & ENQUEUE_WAKEUP) { | 3403 | if (flags & ENQUEUE_WAKEUP) |
3375 | place_entity(cfs_rq, se, 0); | 3404 | place_entity(cfs_rq, se, 0); |
3376 | if (schedstat_enabled()) | ||
3377 | enqueue_sleeper(cfs_rq, se); | ||
3378 | } | ||
3379 | 3405 | ||
3380 | check_schedstat_required(); | 3406 | check_schedstat_required(); |
3381 | if (schedstat_enabled()) { | 3407 | update_stats_enqueue(cfs_rq, se, flags); |
3382 | update_stats_enqueue(cfs_rq, se); | 3408 | check_spread(cfs_rq, se); |
3383 | check_spread(cfs_rq, se); | ||
3384 | } | ||
3385 | if (!curr) | 3409 | if (!curr) |
3386 | __enqueue_entity(cfs_rq, se); | 3410 | __enqueue_entity(cfs_rq, se); |
3387 | se->on_rq = 1; | 3411 | se->on_rq = 1; |
@@ -3448,8 +3472,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
3448 | update_curr(cfs_rq); | 3472 | update_curr(cfs_rq); |
3449 | dequeue_entity_load_avg(cfs_rq, se); | 3473 | dequeue_entity_load_avg(cfs_rq, se); |
3450 | 3474 | ||
3451 | if (schedstat_enabled()) | 3475 | update_stats_dequeue(cfs_rq, se, flags); |
3452 | update_stats_dequeue(cfs_rq, se, flags); | ||
3453 | 3476 | ||
3454 | clear_buddies(cfs_rq, se); | 3477 | clear_buddies(cfs_rq, se); |
3455 | 3478 | ||
@@ -3459,9 +3482,10 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
3459 | account_entity_dequeue(cfs_rq, se); | 3482 | account_entity_dequeue(cfs_rq, se); |
3460 | 3483 | ||
3461 | /* | 3484 | /* |
3462 | * Normalize the entity after updating the min_vruntime because the | 3485 | * Normalize after update_curr(); which will also have moved |
3463 | * update can refer to the ->curr item and we need to reflect this | 3486 | * min_vruntime if @se is the one holding it back. But before doing |
3464 | * movement in our normalized position. | 3487 | * update_min_vruntime() again, which will discount @se's position and |
3488 | * can move min_vruntime forward still more. | ||
3465 | */ | 3489 | */ |
3466 | if (!(flags & DEQUEUE_SLEEP)) | 3490 | if (!(flags & DEQUEUE_SLEEP)) |
3467 | se->vruntime -= cfs_rq->min_vruntime; | 3491 | se->vruntime -= cfs_rq->min_vruntime; |
@@ -3469,8 +3493,16 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
3469 | /* return excess runtime on last dequeue */ | 3493 | /* return excess runtime on last dequeue */ |
3470 | return_cfs_rq_runtime(cfs_rq); | 3494 | return_cfs_rq_runtime(cfs_rq); |
3471 | 3495 | ||
3472 | update_min_vruntime(cfs_rq); | ||
3473 | update_cfs_shares(cfs_rq); | 3496 | update_cfs_shares(cfs_rq); |
3497 | |||
3498 | /* | ||
3499 | * Now advance min_vruntime if @se was the entity holding it back, | ||
3500 | * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be | ||
3501 | * put back on, and if we advance min_vruntime, we'll be placed back | ||
3502 | * further than we started -- ie. we'll be penalized. | ||
3503 | */ | ||
3504 | if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE) | ||
3505 | update_min_vruntime(cfs_rq); | ||
3474 | } | 3506 | } |
3475 | 3507 | ||
3476 | /* | 3508 | /* |
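
The replacement tail of dequeue_entity() defers update_min_vruntime() and keys it on a masked flag comparison. As a pure bit test, (flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE holds exactly when DEQUEUE_SAVE is set and DEQUEUE_MOVE is clear. A small standalone demo of that masked-equality idiom, with illustrative flag values rather than the kernel's:

    #include <stdio.h>

    #define DEQUEUE_SLEEP 0x01    /* illustrative values, not the kernel's */
    #define DEQUEUE_SAVE  0x02
    #define DEQUEUE_MOVE  0x04

    /* True exactly when SAVE is set and MOVE is clear. */
    static int save_without_move(int flags)
    {
        return (flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE;
    }

    int main(void)
    {
        int cases[] = { 0, DEQUEUE_SLEEP, DEQUEUE_SAVE,
                        DEQUEUE_SAVE | DEQUEUE_MOVE };

        for (unsigned int i = 0; i < sizeof(cases) / sizeof(cases[0]); i++)
            printf("flags=%#04x -> save-without-move=%d\n",
                   cases[i], save_without_move(cases[i]));
        return 0;
    }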
@@ -3523,25 +3555,25 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
3523 | * a CPU. So account for the time it spent waiting on the | 3555 | * a CPU. So account for the time it spent waiting on the |
3524 | * runqueue. | 3556 | * runqueue. |
3525 | */ | 3557 | */ |
3526 | if (schedstat_enabled()) | 3558 | update_stats_wait_end(cfs_rq, se); |
3527 | update_stats_wait_end(cfs_rq, se); | ||
3528 | __dequeue_entity(cfs_rq, se); | 3559 | __dequeue_entity(cfs_rq, se); |
3529 | update_load_avg(se, 1); | 3560 | update_load_avg(se, 1); |
3530 | } | 3561 | } |
3531 | 3562 | ||
3532 | update_stats_curr_start(cfs_rq, se); | 3563 | update_stats_curr_start(cfs_rq, se); |
3533 | cfs_rq->curr = se; | 3564 | cfs_rq->curr = se; |
3534 | #ifdef CONFIG_SCHEDSTATS | 3565 | |
3535 | /* | 3566 | /* |
3536 | * Track our maximum slice length, if the CPU's load is at | 3567 | * Track our maximum slice length, if the CPU's load is at |
3537 | * least twice that of our own weight (i.e. don't track it | 3568 | * least twice that of our own weight (i.e. don't track it |
3538 | * when there are only lesser-weight tasks around): | 3569 | * when there are only lesser-weight tasks around): |
3539 | */ | 3570 | */ |
3540 | if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { | 3571 | if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { |
3541 | se->statistics.slice_max = max(se->statistics.slice_max, | 3572 | schedstat_set(se->statistics.slice_max, |
3542 | se->sum_exec_runtime - se->prev_sum_exec_runtime); | 3573 | max((u64)schedstat_val(se->statistics.slice_max), |
3574 | se->sum_exec_runtime - se->prev_sum_exec_runtime)); | ||
3543 | } | 3575 | } |
3544 | #endif | 3576 | |
3545 | se->prev_sum_exec_runtime = se->sum_exec_runtime; | 3577 | se->prev_sum_exec_runtime = se->sum_exec_runtime; |
3546 | } | 3578 | } |
3547 | 3579 | ||
@@ -3620,13 +3652,10 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) | |||
3620 | /* throttle cfs_rqs exceeding runtime */ | 3652 | /* throttle cfs_rqs exceeding runtime */ |
3621 | check_cfs_rq_runtime(cfs_rq); | 3653 | check_cfs_rq_runtime(cfs_rq); |
3622 | 3654 | ||
3623 | if (schedstat_enabled()) { | 3655 | check_spread(cfs_rq, prev); |
3624 | check_spread(cfs_rq, prev); | ||
3625 | if (prev->on_rq) | ||
3626 | update_stats_wait_start(cfs_rq, prev); | ||
3627 | } | ||
3628 | 3656 | ||
3629 | if (prev->on_rq) { | 3657 | if (prev->on_rq) { |
3658 | update_stats_wait_start(cfs_rq, prev); | ||
3630 | /* Put 'current' back into the tree. */ | 3659 | /* Put 'current' back into the tree. */ |
3631 | __enqueue_entity(cfs_rq, prev); | 3660 | __enqueue_entity(cfs_rq, prev); |
3632 | /* in !on_rq case, update occurred at dequeue */ | 3661 | /* in !on_rq case, update occurred at dequeue */ |
@@ -4456,9 +4485,9 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) | |||
4456 | struct sched_entity *se = &p->se; | 4485 | struct sched_entity *se = &p->se; |
4457 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 4486 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
4458 | 4487 | ||
4459 | WARN_ON(task_rq(p) != rq); | 4488 | SCHED_WARN_ON(task_rq(p) != rq); |
4460 | 4489 | ||
4461 | if (cfs_rq->nr_running > 1) { | 4490 | if (rq->cfs.h_nr_running > 1) { |
4462 | u64 slice = sched_slice(cfs_rq, se); | 4491 | u64 slice = sched_slice(cfs_rq, se); |
4463 | u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; | 4492 | u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; |
4464 | s64 delta = slice - ran; | 4493 | s64 delta = slice - ran; |
@@ -4509,6 +4538,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
4509 | struct cfs_rq *cfs_rq; | 4538 | struct cfs_rq *cfs_rq; |
4510 | struct sched_entity *se = &p->se; | 4539 | struct sched_entity *se = &p->se; |
4511 | 4540 | ||
4541 | /* | ||
4542 | * If in_iowait is set, the code below may not trigger any cpufreq | ||
4543 | * utilization updates, so do it here explicitly with the IOWAIT flag | ||
4544 | * passed. | ||
4545 | */ | ||
4546 | if (p->in_iowait) | ||
4547 | cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT); | ||
4548 | |||
4512 | for_each_sched_entity(se) { | 4549 | for_each_sched_entity(se) { |
4513 | if (se->on_rq) | 4550 | if (se->on_rq) |
4514 | break; | 4551 | break; |
@@ -4605,6 +4642,11 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
4605 | } | 4642 | } |
4606 | 4643 | ||
4607 | #ifdef CONFIG_SMP | 4644 | #ifdef CONFIG_SMP |
4645 | |||
4646 | /* Working cpumask for: load_balance, load_balance_newidle. */ | ||
4647 | DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); | ||
4648 | DEFINE_PER_CPU(cpumask_var_t, select_idle_mask); | ||
4649 | |||
4608 | #ifdef CONFIG_NO_HZ_COMMON | 4650 | #ifdef CONFIG_NO_HZ_COMMON |
4609 | /* | 4651 | /* |
4610 | * per rq 'load' array crap; XXX kill this. | 4652 | * per rq 'load' array crap; XXX kill this. |
@@ -5006,9 +5048,9 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) | |||
5006 | * wl = S * s'_i; see (2) | 5048 | * wl = S * s'_i; see (2) |
5007 | */ | 5049 | */ |
5008 | if (W > 0 && w < W) | 5050 | if (W > 0 && w < W) |
5009 | wl = (w * (long)tg->shares) / W; | 5051 | wl = (w * (long)scale_load_down(tg->shares)) / W; |
5010 | else | 5052 | else |
5011 | wl = tg->shares; | 5053 | wl = scale_load_down(tg->shares); |
5012 | 5054 | ||
5013 | /* | 5055 | /* |
5014 | * Per the above, wl is the new se->load.weight value; since | 5056 | * Per the above, wl is the new se->load.weight value; since |
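
Both here and in calculate_imbalance() further down, raw tg->shares and NICE_0_LOAD values are now passed through scale_load_down() before the division. On 64-bit kernels the stored load values carry extra fixed-point resolution (an additional SCHED_FIXEDPOINT_SHIFT, 10 bits in kernels of this era; treat the exact constant as an assumption), and scale_load_down() strips that off again so the arithmetic operates on the same scale as the per-CPU weights. A minimal userspace sketch of the convention:

    #include <stdio.h>

    #define SCHED_FIXEDPOINT_SHIFT 10   /* assumed; matches 64-bit kernels */
    #define scale_load(w)       ((unsigned long)(w) << SCHED_FIXEDPOINT_SHIFT)
    #define scale_load_down(w)  ((unsigned long)(w) >> SCHED_FIXEDPOINT_SHIFT)

    int main(void)
    {
        unsigned long nice_0_weight = 1024;                 /* base weight   */
        unsigned long shares = scale_load(nice_0_weight);   /* as stored     */
        unsigned long w = 512, W = 2048;                    /* entity, group */

        /* wl = (w * shares) / W, computed on the unscaled value */
        unsigned long wl = (w * scale_load_down(shares)) / W;

        printf("stored shares=%lu, unscaled=%lu, wl=%lu\n",
               shares, scale_load_down(shares), wl);
        return 0;
    }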
@@ -5091,18 +5133,18 @@ static int wake_wide(struct task_struct *p) | |||
5091 | return 1; | 5133 | return 1; |
5092 | } | 5134 | } |
5093 | 5135 | ||
5094 | static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) | 5136 | static int wake_affine(struct sched_domain *sd, struct task_struct *p, |
5137 | int prev_cpu, int sync) | ||
5095 | { | 5138 | { |
5096 | s64 this_load, load; | 5139 | s64 this_load, load; |
5097 | s64 this_eff_load, prev_eff_load; | 5140 | s64 this_eff_load, prev_eff_load; |
5098 | int idx, this_cpu, prev_cpu; | 5141 | int idx, this_cpu; |
5099 | struct task_group *tg; | 5142 | struct task_group *tg; |
5100 | unsigned long weight; | 5143 | unsigned long weight; |
5101 | int balanced; | 5144 | int balanced; |
5102 | 5145 | ||
5103 | idx = sd->wake_idx; | 5146 | idx = sd->wake_idx; |
5104 | this_cpu = smp_processor_id(); | 5147 | this_cpu = smp_processor_id(); |
5105 | prev_cpu = task_cpu(p); | ||
5106 | load = source_load(prev_cpu, idx); | 5148 | load = source_load(prev_cpu, idx); |
5107 | this_load = target_load(this_cpu, idx); | 5149 | this_load = target_load(this_cpu, idx); |
5108 | 5150 | ||
@@ -5146,13 +5188,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) | |||
5146 | 5188 | ||
5147 | balanced = this_eff_load <= prev_eff_load; | 5189 | balanced = this_eff_load <= prev_eff_load; |
5148 | 5190 | ||
5149 | schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts); | 5191 | schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts); |
5150 | 5192 | ||
5151 | if (!balanced) | 5193 | if (!balanced) |
5152 | return 0; | 5194 | return 0; |
5153 | 5195 | ||
5154 | schedstat_inc(sd, ttwu_move_affine); | 5196 | schedstat_inc(sd->ttwu_move_affine); |
5155 | schedstat_inc(p, se.statistics.nr_wakeups_affine); | 5197 | schedstat_inc(p->se.statistics.nr_wakeups_affine); |
5156 | 5198 | ||
5157 | return 1; | 5199 | return 1; |
5158 | } | 5200 | } |
@@ -5228,6 +5270,10 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) | |||
5228 | int shallowest_idle_cpu = -1; | 5270 | int shallowest_idle_cpu = -1; |
5229 | int i; | 5271 | int i; |
5230 | 5272 | ||
5273 | /* Check if we have any choice: */ | ||
5274 | if (group->group_weight == 1) | ||
5275 | return cpumask_first(sched_group_cpus(group)); | ||
5276 | |||
5231 | /* Traverse only the allowed CPUs */ | 5277 | /* Traverse only the allowed CPUs */ |
5232 | for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { | 5278 | for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { |
5233 | if (idle_cpu(i)) { | 5279 | if (idle_cpu(i)) { |
@@ -5265,64 +5311,242 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) | |||
5265 | } | 5311 | } |
5266 | 5312 | ||
5267 | /* | 5313 | /* |
5268 | * Try and locate an idle CPU in the sched_domain. | 5314 | * Implement a for_each_cpu() variant that starts the scan at a given cpu |
5315 | * (@start), and wraps around. | ||
5316 | * | ||
5317 | * This is used to scan for idle CPUs; such that not all CPUs looking for an | ||
5318 | * idle CPU find the same CPU. The down-side is that tasks tend to cycle | ||
5319 | * through the LLC domain. | ||
5320 | * | ||
5321 | * Especially tbench is found sensitive to this. | ||
5322 | */ | ||
5323 | |||
5324 | static int cpumask_next_wrap(int n, const struct cpumask *mask, int start, int *wrapped) | ||
5325 | { | ||
5326 | int next; | ||
5327 | |||
5328 | again: | ||
5329 | next = find_next_bit(cpumask_bits(mask), nr_cpumask_bits, n+1); | ||
5330 | |||
5331 | if (*wrapped) { | ||
5332 | if (next >= start) | ||
5333 | return nr_cpumask_bits; | ||
5334 | } else { | ||
5335 | if (next >= nr_cpumask_bits) { | ||
5336 | *wrapped = 1; | ||
5337 | n = -1; | ||
5338 | goto again; | ||
5339 | } | ||
5340 | } | ||
5341 | |||
5342 | return next; | ||
5343 | } | ||
5344 | |||
5345 | #define for_each_cpu_wrap(cpu, mask, start, wrap) \ | ||
5346 | for ((wrap) = 0, (cpu) = (start)-1; \ | ||
5347 | (cpu) = cpumask_next_wrap((cpu), (mask), (start), &(wrap)), \ | ||
5348 | (cpu) < nr_cpumask_bits; ) | ||
5349 | |||
5350 | #ifdef CONFIG_SCHED_SMT | ||
5351 | |||
5352 | static inline void set_idle_cores(int cpu, int val) | ||
5353 | { | ||
5354 | struct sched_domain_shared *sds; | ||
5355 | |||
5356 | sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); | ||
5357 | if (sds) | ||
5358 | WRITE_ONCE(sds->has_idle_cores, val); | ||
5359 | } | ||
5360 | |||
5361 | static inline bool test_idle_cores(int cpu, bool def) | ||
5362 | { | ||
5363 | struct sched_domain_shared *sds; | ||
5364 | |||
5365 | sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); | ||
5366 | if (sds) | ||
5367 | return READ_ONCE(sds->has_idle_cores); | ||
5368 | |||
5369 | return def; | ||
5370 | } | ||
5371 | |||
5372 | /* | ||
5373 | * Scans the local SMT mask to see if the entire core is idle, and records this | ||
5374 | * information in sd_llc_shared->has_idle_cores. | ||
5375 | * | ||
5376 | * Since SMT siblings share all cache levels, inspecting this limited remote | ||
5377 | * state should be fairly cheap. | ||
5378 | */ | ||
5379 | void __update_idle_core(struct rq *rq) | ||
5380 | { | ||
5381 | int core = cpu_of(rq); | ||
5382 | int cpu; | ||
5383 | |||
5384 | rcu_read_lock(); | ||
5385 | if (test_idle_cores(core, true)) | ||
5386 | goto unlock; | ||
5387 | |||
5388 | for_each_cpu(cpu, cpu_smt_mask(core)) { | ||
5389 | if (cpu == core) | ||
5390 | continue; | ||
5391 | |||
5392 | if (!idle_cpu(cpu)) | ||
5393 | goto unlock; | ||
5394 | } | ||
5395 | |||
5396 | set_idle_cores(core, 1); | ||
5397 | unlock: | ||
5398 | rcu_read_unlock(); | ||
5399 | } | ||
5400 | |||
5401 | /* | ||
5402 | * Scan the entire LLC domain for idle cores; this dynamically switches off if | ||
5403 | * there are no idle cores left in the system; tracked through | ||
5404 | * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above. | ||
5405 | */ | ||
5406 | static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target) | ||
5407 | { | ||
5408 | struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); | ||
5409 | int core, cpu, wrap; | ||
5410 | |||
5411 | if (!static_branch_likely(&sched_smt_present)) | ||
5412 | return -1; | ||
5413 | |||
5414 | if (!test_idle_cores(target, false)) | ||
5415 | return -1; | ||
5416 | |||
5417 | cpumask_and(cpus, sched_domain_span(sd), tsk_cpus_allowed(p)); | ||
5418 | |||
5419 | for_each_cpu_wrap(core, cpus, target, wrap) { | ||
5420 | bool idle = true; | ||
5421 | |||
5422 | for_each_cpu(cpu, cpu_smt_mask(core)) { | ||
5423 | cpumask_clear_cpu(cpu, cpus); | ||
5424 | if (!idle_cpu(cpu)) | ||
5425 | idle = false; | ||
5426 | } | ||
5427 | |||
5428 | if (idle) | ||
5429 | return core; | ||
5430 | } | ||
5431 | |||
5432 | /* | ||
5433 | * Failed to find an idle core; stop looking for one. | ||
5434 | */ | ||
5435 | set_idle_cores(target, 0); | ||
5436 | |||
5437 | return -1; | ||
5438 | } | ||
5439 | |||
5440 | /* | ||
5441 | * Scan the local SMT mask for idle CPUs. | ||
5442 | */ | ||
5443 | static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) | ||
5444 | { | ||
5445 | int cpu; | ||
5446 | |||
5447 | if (!static_branch_likely(&sched_smt_present)) | ||
5448 | return -1; | ||
5449 | |||
5450 | for_each_cpu(cpu, cpu_smt_mask(target)) { | ||
5451 | if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) | ||
5452 | continue; | ||
5453 | if (idle_cpu(cpu)) | ||
5454 | return cpu; | ||
5455 | } | ||
5456 | |||
5457 | return -1; | ||
5458 | } | ||
5459 | |||
5460 | #else /* CONFIG_SCHED_SMT */ | ||
5461 | |||
5462 | static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target) | ||
5463 | { | ||
5464 | return -1; | ||
5465 | } | ||
5466 | |||
5467 | static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) | ||
5468 | { | ||
5469 | return -1; | ||
5470 | } | ||
5471 | |||
5472 | #endif /* CONFIG_SCHED_SMT */ | ||
5473 | |||
5474 | /* | ||
5475 | * Scan the LLC domain for idle CPUs; this is dynamically regulated by | ||
5476 | * comparing the average scan cost (tracked in sd->avg_scan_cost) against the | ||
5477 | * average idle time for this rq (as found in rq->avg_idle). | ||
5478 | */ | ||
5479 | static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target) | ||
5480 | { | ||
5481 | struct sched_domain *this_sd; | ||
5482 | u64 avg_cost, avg_idle = this_rq()->avg_idle; | ||
5483 | u64 time, cost; | ||
5484 | s64 delta; | ||
5485 | int cpu, wrap; | ||
5486 | |||
5487 | this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); | ||
5488 | if (!this_sd) | ||
5489 | return -1; | ||
5490 | |||
5491 | avg_cost = this_sd->avg_scan_cost; | ||
5492 | |||
5493 | /* | ||
5494 | * Due to large variance we need a large fuzz factor; hackbench in | ||
5495 | * particular is sensitive here. | ||
5496 | */ | ||
5497 | if ((avg_idle / 512) < avg_cost) | ||
5498 | return -1; | ||
5499 | |||
5500 | time = local_clock(); | ||
5501 | |||
5502 | for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) { | ||
5503 | if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) | ||
5504 | continue; | ||
5505 | if (idle_cpu(cpu)) | ||
5506 | break; | ||
5507 | } | ||
5508 | |||
5509 | time = local_clock() - time; | ||
5510 | cost = this_sd->avg_scan_cost; | ||
5511 | delta = (s64)(time - cost) / 8; | ||
5512 | this_sd->avg_scan_cost += delta; | ||
5513 | |||
5514 | return cpu; | ||
5515 | } | ||
5516 | |||
5517 | /* | ||
5518 | * Try and locate an idle core/thread in the LLC cache domain. | ||
5269 | */ | 5519 | */ |
5270 | static int select_idle_sibling(struct task_struct *p, int target) | 5520 | static int select_idle_sibling(struct task_struct *p, int prev, int target) |
5271 | { | 5521 | { |
5272 | struct sched_domain *sd; | 5522 | struct sched_domain *sd; |
5273 | struct sched_group *sg; | 5523 | int i; |
5274 | int i = task_cpu(p); | ||
5275 | 5524 | ||
5276 | if (idle_cpu(target)) | 5525 | if (idle_cpu(target)) |
5277 | return target; | 5526 | return target; |
5278 | 5527 | ||
5279 | /* | 5528 | /* |
5280 | * If the prevous cpu is cache affine and idle, don't be stupid. | 5529 | * If the previous cpu is cache affine and idle, don't be stupid. |
5281 | */ | 5530 | */ |
5282 | if (i != target && cpus_share_cache(i, target) && idle_cpu(i)) | 5531 | if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) |
5283 | return i; | 5532 | return prev; |
5284 | 5533 | ||
5285 | /* | ||
5286 | * Otherwise, iterate the domains and find an eligible idle cpu. | ||
5287 | * | ||
5288 | * A completely idle sched group at higher domains is more | ||
5289 | * desirable than an idle group at a lower level, because lower | ||
5290 | * domains have smaller groups and usually share hardware | ||
5291 | * resources which causes tasks to contend on them, e.g. x86 | ||
5292 | * hyperthread siblings in the lowest domain (SMT) can contend | ||
5293 | * on the shared cpu pipeline. | ||
5294 | * | ||
5295 | * However, while we prefer idle groups at higher domains | ||
5296 | * finding an idle cpu at the lowest domain is still better than | ||
5297 | * returning 'target', which we've already established, isn't | ||
5298 | * idle. | ||
5299 | */ | ||
5300 | sd = rcu_dereference(per_cpu(sd_llc, target)); | 5534 | sd = rcu_dereference(per_cpu(sd_llc, target)); |
5301 | for_each_lower_domain(sd) { | 5535 | if (!sd) |
5302 | sg = sd->groups; | 5536 | return target; |
5303 | do { | 5537 | |
5304 | if (!cpumask_intersects(sched_group_cpus(sg), | 5538 | i = select_idle_core(p, sd, target); |
5305 | tsk_cpus_allowed(p))) | 5539 | if ((unsigned)i < nr_cpumask_bits) |
5306 | goto next; | 5540 | return i; |
5307 | 5541 | ||
5308 | /* Ensure the entire group is idle */ | 5542 | i = select_idle_cpu(p, sd, target); |
5309 | for_each_cpu(i, sched_group_cpus(sg)) { | 5543 | if ((unsigned)i < nr_cpumask_bits) |
5310 | if (i == target || !idle_cpu(i)) | 5544 | return i; |
5311 | goto next; | 5545 | |
5312 | } | 5546 | i = select_idle_smt(p, sd, target); |
5547 | if ((unsigned)i < nr_cpumask_bits) | ||
5548 | return i; | ||
5313 | 5549 | ||
5314 | /* | ||
5315 | * It doesn't matter which cpu we pick, the | ||
5316 | * whole group is idle. | ||
5317 | */ | ||
5318 | target = cpumask_first_and(sched_group_cpus(sg), | ||
5319 | tsk_cpus_allowed(p)); | ||
5320 | goto done; | ||
5321 | next: | ||
5322 | sg = sg->next; | ||
5323 | } while (sg != sd->groups); | ||
5324 | } | ||
5325 | done: | ||
5326 | return target; | 5550 | return target; |
5327 | } | 5551 | } |
5328 | 5552 | ||
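
The new select_idle_sibling() path rests on two small mechanisms introduced in this hunk: for_each_cpu_wrap() starts each scan at @target and wraps around, so concurrent wakers spread out instead of all landing on the lowest-numbered idle CPU, and select_idle_cpu() keeps an exponentially weighted average of the scan cost (avg += (sample - avg) / 8) and skips the scan entirely when avg_idle / 512 falls below it. A self-contained userspace sketch of both ideas over a plain array rather than a cpumask (names and numbers here are illustrative):

    #include <stdio.h>

    #define NCPU 8

    /* Scan idle[] starting at 'start', wrapping around once. */
    static int scan_wrap(const int *idle, int start)
    {
        for (int off = 0; off < NCPU; off++) {
            int cpu = (start + off) % NCPU;
            if (idle[cpu])
                return cpu;
        }
        return -1;
    }

    /* EWMA with 1/8 weight, as in avg_scan_cost += (time - avg) / 8. */
    static long long ewma8(long long avg, long long sample)
    {
        return avg + (sample - avg) / 8;
    }

    int main(void)
    {
        int idle[NCPU] = { 1, 0, 0, 1, 0, 0, 0, 1 };

        /* Two wakers starting at different CPUs pick different idle CPUs. */
        printf("start=1 -> cpu %d, start=5 -> cpu %d\n",
               scan_wrap(idle, 1), scan_wrap(idle, 5));

        long long avg = 0;
        for (int i = 0; i < 5; i++)
            avg = ewma8(avg, 800);      /* drifts toward the 800ns sample */
        printf("avg_scan_cost after 5 samples: %lld\n", avg);
        return 0;
    }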
@@ -5360,6 +5584,32 @@ static int cpu_util(int cpu) | |||
5360 | return (util >= capacity) ? capacity : util; | 5584 | return (util >= capacity) ? capacity : util; |
5361 | } | 5585 | } |
5362 | 5586 | ||
5587 | static inline int task_util(struct task_struct *p) | ||
5588 | { | ||
5589 | return p->se.avg.util_avg; | ||
5590 | } | ||
5591 | |||
5592 | /* | ||
5593 | * Disable WAKE_AFFINE in the case where task @p doesn't fit in the | ||
5594 | * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu. | ||
5595 | * | ||
5596 | * In that case WAKE_AFFINE doesn't make sense and we'll let | ||
5597 | * BALANCE_WAKE sort things out. | ||
5598 | */ | ||
5599 | static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) | ||
5600 | { | ||
5601 | long min_cap, max_cap; | ||
5602 | |||
5603 | min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu)); | ||
5604 | max_cap = cpu_rq(cpu)->rd->max_cpu_capacity; | ||
5605 | |||
5606 | /* Minimum capacity is close to max, no need to abort wake_affine */ | ||
5607 | if (max_cap - min_cap < max_cap >> 3) | ||
5608 | return 0; | ||
5609 | |||
5610 | return min_cap * 1024 < task_util(p) * capacity_margin; | ||
5611 | } | ||
5612 | |||
5363 | /* | 5613 | /* |
5364 | * select_task_rq_fair: Select target runqueue for the waking task in domains | 5614 | * select_task_rq_fair: Select target runqueue for the waking task in domains |
5365 | * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, | 5615 | * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, |
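
wake_cap() above vetoes the affine fast path when the task's utilization would not fit, with margin, into the smaller of the waking and previous CPUs' original capacities; it short-circuits when the capacities are within max/8 of each other. The capacity_margin constant is defined elsewhere in fair.c (1280, roughly a 1.25x margin, in kernels of this vintage; treat that value as an assumption). A worked standalone example of the comparison:

    #include <stdio.h>

    static int wake_cap(long task_util, long min_cap, long max_cap,
                        long capacity_margin)
    {
        /* Capacities nearly symmetric: never veto wake_affine. */
        if (max_cap - min_cap < max_cap >> 3)
            return 0;

        /* Veto when the task would not fit in min_cap with the margin. */
        return min_cap * 1024 < task_util * capacity_margin;
    }

    int main(void)
    {
        long margin = 1280;             /* assumed ~1.25x margin */

        /* big.LITTLE-ish capacities: little=430, big=1024 */
        printf("util=200: veto=%d\n", wake_cap(200, 430, 1024, margin));
        printf("util=400: veto=%d\n", wake_cap(400, 430, 1024, margin));
        /* symmetric system: never vetoes, regardless of utilization */
        printf("symmetric: veto=%d\n", wake_cap(900, 1024, 1024, margin));
        return 0;
    }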
@@ -5383,7 +5633,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f | |||
5383 | 5633 | ||
5384 | if (sd_flag & SD_BALANCE_WAKE) { | 5634 | if (sd_flag & SD_BALANCE_WAKE) { |
5385 | record_wakee(p); | 5635 | record_wakee(p); |
5386 | want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); | 5636 | want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) |
5637 | && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); | ||
5387 | } | 5638 | } |
5388 | 5639 | ||
5389 | rcu_read_lock(); | 5640 | rcu_read_lock(); |
@@ -5409,13 +5660,13 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f | |||
5409 | 5660 | ||
5410 | if (affine_sd) { | 5661 | if (affine_sd) { |
5411 | sd = NULL; /* Prefer wake_affine over balance flags */ | 5662 | sd = NULL; /* Prefer wake_affine over balance flags */ |
5412 | if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) | 5663 | if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync)) |
5413 | new_cpu = cpu; | 5664 | new_cpu = cpu; |
5414 | } | 5665 | } |
5415 | 5666 | ||
5416 | if (!sd) { | 5667 | if (!sd) { |
5417 | if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ | 5668 | if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ |
5418 | new_cpu = select_idle_sibling(p, new_cpu); | 5669 | new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); |
5419 | 5670 | ||
5420 | } else while (sd) { | 5671 | } else while (sd) { |
5421 | struct sched_group *group; | 5672 | struct sched_group *group; |
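
Putting the pieces of this and the preceding hunks together, wakeup placement now reads roughly as below. This is a deliberately stubbed, compile-and-run model of the decision order in select_task_rq_fair() (wake_wide, wake_cap, wake_affine and select_idle_sibling are replaced by trivial stand-ins), not a drop-in implementation:

    #include <stdbool.h>
    #include <stdio.h>

    /* Stubbed policy predicates, stand-ins for the kernel helpers. */
    static bool wake_wide_stub(void)   { return false; }
    static bool wake_cap_stub(void)    { return false; }
    static bool wake_affine_stub(void) { return true;  }

    static int select_idle_sibling_stub(int prev, int target)
    {
        /* Pretend @target itself is idle; fall back to @prev otherwise. */
        return target >= 0 ? target : prev;
    }

    /* Simplified wakeup CPU selection: mirrors the flow, not the details. */
    static int pick_wake_cpu(int waker_cpu, int prev_cpu)
    {
        bool want_affine = !wake_wide_stub() && !wake_cap_stub();
        int target = prev_cpu;

        if (want_affine && waker_cpu != prev_cpu && wake_affine_stub())
            target = waker_cpu;

        return select_idle_sibling_stub(prev_cpu, target);
    }

    int main(void)
    {
        printf("waker=2 prev=5 -> cpu %d\n", pick_wake_cpu(2, 5));
        return 0;
    }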
@@ -5939,7 +6190,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
5939 | * | 6190 | * |
5940 | * The adjacency matrix of the resulting graph is given by: | 6191 | * The adjacency matrix of the resulting graph is given by: |
5941 | * | 6192 | * |
5942 | * log_2 n | 6193 | * log_2 n |
5943 | * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6) | 6194 | * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6) |
5944 | * k = 0 | 6195 | * k = 0 |
5945 | * | 6196 | * |
@@ -5985,7 +6236,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
5985 | * | 6236 | * |
5986 | * [XXX write more on how we solve this.. _after_ merging pjt's patches that | 6237 | * [XXX write more on how we solve this.. _after_ merging pjt's patches that |
5987 | * rewrite all of this once again.] | 6238 | * rewrite all of this once again.] |
5988 | */ | 6239 | */ |
5989 | 6240 | ||
5990 | static unsigned long __read_mostly max_load_balance_interval = HZ/10; | 6241 | static unsigned long __read_mostly max_load_balance_interval = HZ/10; |
5991 | 6242 | ||
@@ -6133,7 +6384,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
6133 | if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { | 6384 | if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { |
6134 | int cpu; | 6385 | int cpu; |
6135 | 6386 | ||
6136 | schedstat_inc(p, se.statistics.nr_failed_migrations_affine); | 6387 | schedstat_inc(p->se.statistics.nr_failed_migrations_affine); |
6137 | 6388 | ||
6138 | env->flags |= LBF_SOME_PINNED; | 6389 | env->flags |= LBF_SOME_PINNED; |
6139 | 6390 | ||
@@ -6164,7 +6415,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
6164 | env->flags &= ~LBF_ALL_PINNED; | 6415 | env->flags &= ~LBF_ALL_PINNED; |
6165 | 6416 | ||
6166 | if (task_running(env->src_rq, p)) { | 6417 | if (task_running(env->src_rq, p)) { |
6167 | schedstat_inc(p, se.statistics.nr_failed_migrations_running); | 6418 | schedstat_inc(p->se.statistics.nr_failed_migrations_running); |
6168 | return 0; | 6419 | return 0; |
6169 | } | 6420 | } |
6170 | 6421 | ||
@@ -6181,13 +6432,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
6181 | if (tsk_cache_hot <= 0 || | 6432 | if (tsk_cache_hot <= 0 || |
6182 | env->sd->nr_balance_failed > env->sd->cache_nice_tries) { | 6433 | env->sd->nr_balance_failed > env->sd->cache_nice_tries) { |
6183 | if (tsk_cache_hot == 1) { | 6434 | if (tsk_cache_hot == 1) { |
6184 | schedstat_inc(env->sd, lb_hot_gained[env->idle]); | 6435 | schedstat_inc(env->sd->lb_hot_gained[env->idle]); |
6185 | schedstat_inc(p, se.statistics.nr_forced_migrations); | 6436 | schedstat_inc(p->se.statistics.nr_forced_migrations); |
6186 | } | 6437 | } |
6187 | return 1; | 6438 | return 1; |
6188 | } | 6439 | } |
6189 | 6440 | ||
6190 | schedstat_inc(p, se.statistics.nr_failed_migrations_hot); | 6441 | schedstat_inc(p->se.statistics.nr_failed_migrations_hot); |
6191 | return 0; | 6442 | return 0; |
6192 | } | 6443 | } |
6193 | 6444 | ||
@@ -6227,7 +6478,7 @@ static struct task_struct *detach_one_task(struct lb_env *env) | |||
6227 | * so we can safely collect stats here rather than | 6478 | * so we can safely collect stats here rather than |
6228 | * inside detach_tasks(). | 6479 | * inside detach_tasks(). |
6229 | */ | 6480 | */ |
6230 | schedstat_inc(env->sd, lb_gained[env->idle]); | 6481 | schedstat_inc(env->sd->lb_gained[env->idle]); |
6231 | return p; | 6482 | return p; |
6232 | } | 6483 | } |
6233 | return NULL; | 6484 | return NULL; |
@@ -6319,7 +6570,7 @@ next: | |||
6319 | * so we can safely collect detach_one_task() stats here rather | 6570 | * so we can safely collect detach_one_task() stats here rather |
6320 | * than inside detach_one_task(). | 6571 | * than inside detach_one_task(). |
6321 | */ | 6572 | */ |
6322 | schedstat_add(env->sd, lb_gained[env->idle], detached); | 6573 | schedstat_add(env->sd->lb_gained[env->idle], detached); |
6323 | 6574 | ||
6324 | return detached; | 6575 | return detached; |
6325 | } | 6576 | } |
@@ -6647,7 +6898,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) | |||
6647 | /* | 6898 | /* |
6648 | * !SD_OVERLAP domains can assume that child groups | 6899 | * !SD_OVERLAP domains can assume that child groups |
6649 | * span the current group. | 6900 | * span the current group. |
6650 | */ | 6901 | */ |
6651 | 6902 | ||
6652 | group = child->groups; | 6903 | group = child->groups; |
6653 | do { | 6904 | do { |
@@ -7147,7 +7398,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
7147 | load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE; | 7398 | load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE; |
7148 | if (load_above_capacity > busiest->group_capacity) { | 7399 | if (load_above_capacity > busiest->group_capacity) { |
7149 | load_above_capacity -= busiest->group_capacity; | 7400 | load_above_capacity -= busiest->group_capacity; |
7150 | load_above_capacity *= NICE_0_LOAD; | 7401 | load_above_capacity *= scale_load_down(NICE_0_LOAD); |
7151 | load_above_capacity /= busiest->group_capacity; | 7402 | load_above_capacity /= busiest->group_capacity; |
7152 | } else | 7403 | } else |
7153 | load_above_capacity = ~0UL; | 7404 | load_above_capacity = ~0UL; |
@@ -7354,9 +7605,6 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
7354 | */ | 7605 | */ |
7355 | #define MAX_PINNED_INTERVAL 512 | 7606 | #define MAX_PINNED_INTERVAL 512 |
7356 | 7607 | ||
7357 | /* Working cpumask for load_balance and load_balance_newidle. */ | ||
7358 | DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); | ||
7359 | |||
7360 | static int need_active_balance(struct lb_env *env) | 7608 | static int need_active_balance(struct lb_env *env) |
7361 | { | 7609 | { |
7362 | struct sched_domain *sd = env->sd; | 7610 | struct sched_domain *sd = env->sd; |
@@ -7460,7 +7708,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
7460 | 7708 | ||
7461 | cpumask_copy(cpus, cpu_active_mask); | 7709 | cpumask_copy(cpus, cpu_active_mask); |
7462 | 7710 | ||
7463 | schedstat_inc(sd, lb_count[idle]); | 7711 | schedstat_inc(sd->lb_count[idle]); |
7464 | 7712 | ||
7465 | redo: | 7713 | redo: |
7466 | if (!should_we_balance(&env)) { | 7714 | if (!should_we_balance(&env)) { |
@@ -7470,19 +7718,19 @@ redo: | |||
7470 | 7718 | ||
7471 | group = find_busiest_group(&env); | 7719 | group = find_busiest_group(&env); |
7472 | if (!group) { | 7720 | if (!group) { |
7473 | schedstat_inc(sd, lb_nobusyg[idle]); | 7721 | schedstat_inc(sd->lb_nobusyg[idle]); |
7474 | goto out_balanced; | 7722 | goto out_balanced; |
7475 | } | 7723 | } |
7476 | 7724 | ||
7477 | busiest = find_busiest_queue(&env, group); | 7725 | busiest = find_busiest_queue(&env, group); |
7478 | if (!busiest) { | 7726 | if (!busiest) { |
7479 | schedstat_inc(sd, lb_nobusyq[idle]); | 7727 | schedstat_inc(sd->lb_nobusyq[idle]); |
7480 | goto out_balanced; | 7728 | goto out_balanced; |
7481 | } | 7729 | } |
7482 | 7730 | ||
7483 | BUG_ON(busiest == env.dst_rq); | 7731 | BUG_ON(busiest == env.dst_rq); |
7484 | 7732 | ||
7485 | schedstat_add(sd, lb_imbalance[idle], env.imbalance); | 7733 | schedstat_add(sd->lb_imbalance[idle], env.imbalance); |
7486 | 7734 | ||
7487 | env.src_cpu = busiest->cpu; | 7735 | env.src_cpu = busiest->cpu; |
7488 | env.src_rq = busiest; | 7736 | env.src_rq = busiest; |
@@ -7589,7 +7837,7 @@ more_balance: | |||
7589 | } | 7837 | } |
7590 | 7838 | ||
7591 | if (!ld_moved) { | 7839 | if (!ld_moved) { |
7592 | schedstat_inc(sd, lb_failed[idle]); | 7840 | schedstat_inc(sd->lb_failed[idle]); |
7593 | /* | 7841 | /* |
7594 | * Increment the failure counter only on periodic balance. | 7842 | * Increment the failure counter only on periodic balance. |
7595 | * We do not want newidle balance, which can be very | 7843 | * We do not want newidle balance, which can be very |
@@ -7672,7 +7920,7 @@ out_all_pinned: | |||
7672 | * we can't migrate them. Let the imbalance flag set so parent level | 7920 | * we can't migrate them. Let the imbalance flag set so parent level |
7673 | * can try to migrate them. | 7921 | * can try to migrate them. |
7674 | */ | 7922 | */ |
7675 | schedstat_inc(sd, lb_balanced[idle]); | 7923 | schedstat_inc(sd->lb_balanced[idle]); |
7676 | 7924 | ||
7677 | sd->nr_balance_failed = 0; | 7925 | sd->nr_balance_failed = 0; |
7678 | 7926 | ||
@@ -7704,11 +7952,12 @@ get_sd_balance_interval(struct sched_domain *sd, int cpu_busy) | |||
7704 | } | 7952 | } |
7705 | 7953 | ||
7706 | static inline void | 7954 | static inline void |
7707 | update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance) | 7955 | update_next_balance(struct sched_domain *sd, unsigned long *next_balance) |
7708 | { | 7956 | { |
7709 | unsigned long interval, next; | 7957 | unsigned long interval, next; |
7710 | 7958 | ||
7711 | interval = get_sd_balance_interval(sd, cpu_busy); | 7959 | /* used by idle balance, so cpu_busy = 0 */ |
7960 | interval = get_sd_balance_interval(sd, 0); | ||
7712 | next = sd->last_balance + interval; | 7961 | next = sd->last_balance + interval; |
7713 | 7962 | ||
7714 | if (time_after(*next_balance, next)) | 7963 | if (time_after(*next_balance, next)) |
@@ -7738,7 +7987,7 @@ static int idle_balance(struct rq *this_rq) | |||
7738 | rcu_read_lock(); | 7987 | rcu_read_lock(); |
7739 | sd = rcu_dereference_check_sched_domain(this_rq->sd); | 7988 | sd = rcu_dereference_check_sched_domain(this_rq->sd); |
7740 | if (sd) | 7989 | if (sd) |
7741 | update_next_balance(sd, 0, &next_balance); | 7990 | update_next_balance(sd, &next_balance); |
7742 | rcu_read_unlock(); | 7991 | rcu_read_unlock(); |
7743 | 7992 | ||
7744 | goto out; | 7993 | goto out; |
@@ -7756,7 +8005,7 @@ static int idle_balance(struct rq *this_rq) | |||
7756 | continue; | 8005 | continue; |
7757 | 8006 | ||
7758 | if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { | 8007 | if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { |
7759 | update_next_balance(sd, 0, &next_balance); | 8008 | update_next_balance(sd, &next_balance); |
7760 | break; | 8009 | break; |
7761 | } | 8010 | } |
7762 | 8011 | ||
@@ -7774,7 +8023,7 @@ static int idle_balance(struct rq *this_rq) | |||
7774 | curr_cost += domain_cost; | 8023 | curr_cost += domain_cost; |
7775 | } | 8024 | } |
7776 | 8025 | ||
7777 | update_next_balance(sd, 0, &next_balance); | 8026 | update_next_balance(sd, &next_balance); |
7778 | 8027 | ||
7779 | /* | 8028 | /* |
7780 | * Stop searching for tasks to pull if there are | 8029 | * Stop searching for tasks to pull if there are |
@@ -7864,15 +8113,15 @@ static int active_load_balance_cpu_stop(void *data) | |||
7864 | .idle = CPU_IDLE, | 8113 | .idle = CPU_IDLE, |
7865 | }; | 8114 | }; |
7866 | 8115 | ||
7867 | schedstat_inc(sd, alb_count); | 8116 | schedstat_inc(sd->alb_count); |
7868 | 8117 | ||
7869 | p = detach_one_task(&env); | 8118 | p = detach_one_task(&env); |
7870 | if (p) { | 8119 | if (p) { |
7871 | schedstat_inc(sd, alb_pushed); | 8120 | schedstat_inc(sd->alb_pushed); |
7872 | /* Active balancing done, reset the failure counter. */ | 8121 | /* Active balancing done, reset the failure counter. */ |
7873 | sd->nr_balance_failed = 0; | 8122 | sd->nr_balance_failed = 0; |
7874 | } else { | 8123 | } else { |
7875 | schedstat_inc(sd, alb_failed); | 8124 | schedstat_inc(sd->alb_failed); |
7876 | } | 8125 | } |
7877 | } | 8126 | } |
7878 | rcu_read_unlock(); | 8127 | rcu_read_unlock(); |
@@ -7964,13 +8213,13 @@ static inline void set_cpu_sd_state_busy(void) | |||
7964 | int cpu = smp_processor_id(); | 8213 | int cpu = smp_processor_id(); |
7965 | 8214 | ||
7966 | rcu_read_lock(); | 8215 | rcu_read_lock(); |
7967 | sd = rcu_dereference(per_cpu(sd_busy, cpu)); | 8216 | sd = rcu_dereference(per_cpu(sd_llc, cpu)); |
7968 | 8217 | ||
7969 | if (!sd || !sd->nohz_idle) | 8218 | if (!sd || !sd->nohz_idle) |
7970 | goto unlock; | 8219 | goto unlock; |
7971 | sd->nohz_idle = 0; | 8220 | sd->nohz_idle = 0; |
7972 | 8221 | ||
7973 | atomic_inc(&sd->groups->sgc->nr_busy_cpus); | 8222 | atomic_inc(&sd->shared->nr_busy_cpus); |
7974 | unlock: | 8223 | unlock: |
7975 | rcu_read_unlock(); | 8224 | rcu_read_unlock(); |
7976 | } | 8225 | } |
@@ -7981,13 +8230,13 @@ void set_cpu_sd_state_idle(void) | |||
7981 | int cpu = smp_processor_id(); | 8230 | int cpu = smp_processor_id(); |
7982 | 8231 | ||
7983 | rcu_read_lock(); | 8232 | rcu_read_lock(); |
7984 | sd = rcu_dereference(per_cpu(sd_busy, cpu)); | 8233 | sd = rcu_dereference(per_cpu(sd_llc, cpu)); |
7985 | 8234 | ||
7986 | if (!sd || sd->nohz_idle) | 8235 | if (!sd || sd->nohz_idle) |
7987 | goto unlock; | 8236 | goto unlock; |
7988 | sd->nohz_idle = 1; | 8237 | sd->nohz_idle = 1; |
7989 | 8238 | ||
7990 | atomic_dec(&sd->groups->sgc->nr_busy_cpus); | 8239 | atomic_dec(&sd->shared->nr_busy_cpus); |
7991 | unlock: | 8240 | unlock: |
7992 | rcu_read_unlock(); | 8241 | rcu_read_unlock(); |
7993 | } | 8242 | } |
@@ -8214,8 +8463,8 @@ end: | |||
8214 | static inline bool nohz_kick_needed(struct rq *rq) | 8463 | static inline bool nohz_kick_needed(struct rq *rq) |
8215 | { | 8464 | { |
8216 | unsigned long now = jiffies; | 8465 | unsigned long now = jiffies; |
8466 | struct sched_domain_shared *sds; | ||
8217 | struct sched_domain *sd; | 8467 | struct sched_domain *sd; |
8218 | struct sched_group_capacity *sgc; | ||
8219 | int nr_busy, cpu = rq->cpu; | 8468 | int nr_busy, cpu = rq->cpu; |
8220 | bool kick = false; | 8469 | bool kick = false; |
8221 | 8470 | ||
@@ -8243,11 +8492,13 @@ static inline bool nohz_kick_needed(struct rq *rq) | |||
8243 | return true; | 8492 | return true; |
8244 | 8493 | ||
8245 | rcu_read_lock(); | 8494 | rcu_read_lock(); |
8246 | sd = rcu_dereference(per_cpu(sd_busy, cpu)); | 8495 | sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); |
8247 | if (sd) { | 8496 | if (sds) { |
8248 | sgc = sd->groups->sgc; | 8497 | /* |
8249 | nr_busy = atomic_read(&sgc->nr_busy_cpus); | 8498 | * XXX: write a coherent comment on why we do this. |
8250 | 8499 | * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com | |
8500 | */ | ||
8501 | nr_busy = atomic_read(&sds->nr_busy_cpus); | ||
8251 | if (nr_busy > 1) { | 8502 | if (nr_busy > 1) { |
8252 | kick = true; | 8503 | kick = true; |
8253 | goto unlock; | 8504 | goto unlock; |
@@ -8283,7 +8534,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } | |||
8283 | * run_rebalance_domains is triggered when needed from the scheduler tick. | 8534 | * run_rebalance_domains is triggered when needed from the scheduler tick. |
8284 | * Also triggered for nohz idle balancing (with nohz_balancing_kick set). | 8535 | * Also triggered for nohz idle balancing (with nohz_balancing_kick set). |
8285 | */ | 8536 | */ |
8286 | static void run_rebalance_domains(struct softirq_action *h) | 8537 | static __latent_entropy void run_rebalance_domains(struct softirq_action *h) |
8287 | { | 8538 | { |
8288 | struct rq *this_rq = this_rq(); | 8539 | struct rq *this_rq = this_rq(); |
8289 | enum cpu_idle_type idle = this_rq->idle_balance ? | 8540 | enum cpu_idle_type idle = this_rq->idle_balance ? |
@@ -8441,7 +8692,6 @@ static void detach_task_cfs_rq(struct task_struct *p) | |||
8441 | struct sched_entity *se = &p->se; | 8692 | struct sched_entity *se = &p->se; |
8442 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 8693 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
8443 | u64 now = cfs_rq_clock_task(cfs_rq); | 8694 | u64 now = cfs_rq_clock_task(cfs_rq); |
8444 | int tg_update; | ||
8445 | 8695 | ||
8446 | if (!vruntime_normalized(p)) { | 8696 | if (!vruntime_normalized(p)) { |
8447 | /* | 8697 | /* |
@@ -8453,10 +8703,9 @@ static void detach_task_cfs_rq(struct task_struct *p) | |||
8453 | } | 8703 | } |
8454 | 8704 | ||
8455 | /* Catch up with the cfs_rq and remove our load when we leave */ | 8705 | /* Catch up with the cfs_rq and remove our load when we leave */ |
8456 | tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); | 8706 | update_cfs_rq_load_avg(now, cfs_rq, false); |
8457 | detach_entity_load_avg(cfs_rq, se); | 8707 | detach_entity_load_avg(cfs_rq, se); |
8458 | if (tg_update) | 8708 | update_tg_load_avg(cfs_rq, false); |
8459 | update_tg_load_avg(cfs_rq, false); | ||
8460 | } | 8709 | } |
8461 | 8710 | ||
8462 | static void attach_task_cfs_rq(struct task_struct *p) | 8711 | static void attach_task_cfs_rq(struct task_struct *p) |
@@ -8464,7 +8713,6 @@ static void attach_task_cfs_rq(struct task_struct *p) | |||
8464 | struct sched_entity *se = &p->se; | 8713 | struct sched_entity *se = &p->se; |
8465 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 8714 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
8466 | u64 now = cfs_rq_clock_task(cfs_rq); | 8715 | u64 now = cfs_rq_clock_task(cfs_rq); |
8467 | int tg_update; | ||
8468 | 8716 | ||
8469 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8717 | #ifdef CONFIG_FAIR_GROUP_SCHED |
8470 | /* | 8718 | /* |
@@ -8475,10 +8723,9 @@ static void attach_task_cfs_rq(struct task_struct *p) | |||
8475 | #endif | 8723 | #endif |
8476 | 8724 | ||
8477 | /* Synchronize task with its cfs_rq */ | 8725 | /* Synchronize task with its cfs_rq */ |
8478 | tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); | 8726 | update_cfs_rq_load_avg(now, cfs_rq, false); |
8479 | attach_entity_load_avg(cfs_rq, se); | 8727 | attach_entity_load_avg(cfs_rq, se); |
8480 | if (tg_update) | 8728 | update_tg_load_avg(cfs_rq, false); |
8481 | update_tg_load_avg(cfs_rq, false); | ||
8482 | 8729 | ||
8483 | if (!vruntime_normalized(p)) | 8730 | if (!vruntime_normalized(p)) |
8484 | se->vruntime += cfs_rq->min_vruntime; | 8731 | se->vruntime += cfs_rq->min_vruntime; |
@@ -8592,7 +8839,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
8592 | { | 8839 | { |
8593 | struct sched_entity *se; | 8840 | struct sched_entity *se; |
8594 | struct cfs_rq *cfs_rq; | 8841 | struct cfs_rq *cfs_rq; |
8595 | struct rq *rq; | ||
8596 | int i; | 8842 | int i; |
8597 | 8843 | ||
8598 | tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); | 8844 | tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); |
@@ -8607,8 +8853,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
8607 | init_cfs_bandwidth(tg_cfs_bandwidth(tg)); | 8853 | init_cfs_bandwidth(tg_cfs_bandwidth(tg)); |
8608 | 8854 | ||
8609 | for_each_possible_cpu(i) { | 8855 | for_each_possible_cpu(i) { |
8610 | rq = cpu_rq(i); | ||
8611 | |||
8612 | cfs_rq = kzalloc_node(sizeof(struct cfs_rq), | 8856 | cfs_rq = kzalloc_node(sizeof(struct cfs_rq), |
8613 | GFP_KERNEL, cpu_to_node(i)); | 8857 | GFP_KERNEL, cpu_to_node(i)); |
8614 | if (!cfs_rq) | 8858 | if (!cfs_rq) |
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 9fb873cfc75c..1d8718d5300d 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c | |||
@@ -16,6 +16,9 @@ | |||
16 | 16 | ||
17 | #include "sched.h" | 17 | #include "sched.h" |
18 | 18 | ||
19 | /* Linker adds these: start and end of __cpuidle functions */ | ||
20 | extern char __cpuidle_text_start[], __cpuidle_text_end[]; | ||
21 | |||
19 | /** | 22 | /** |
20 | * sched_idle_set_state - Record idle state for the current CPU. | 23 | * sched_idle_set_state - Record idle state for the current CPU. |
21 | * @idle_state: State to record. | 24 | * @idle_state: State to record. |
@@ -53,7 +56,7 @@ static int __init cpu_idle_nopoll_setup(char *__unused) | |||
53 | __setup("hlt", cpu_idle_nopoll_setup); | 56 | __setup("hlt", cpu_idle_nopoll_setup); |
54 | #endif | 57 | #endif |
55 | 58 | ||
56 | static inline int cpu_idle_poll(void) | 59 | static noinline int __cpuidle cpu_idle_poll(void) |
57 | { | 60 | { |
58 | rcu_idle_enter(); | 61 | rcu_idle_enter(); |
59 | trace_cpu_idle_rcuidle(0, smp_processor_id()); | 62 | trace_cpu_idle_rcuidle(0, smp_processor_id()); |
@@ -84,7 +87,7 @@ void __weak arch_cpu_idle(void) | |||
84 | * | 87 | * |
85 | * To use when the cpuidle framework cannot be used. | 88 | * To use when the cpuidle framework cannot be used. |
86 | */ | 89 | */ |
87 | void default_idle_call(void) | 90 | void __cpuidle default_idle_call(void) |
88 | { | 91 | { |
89 | if (current_clr_polling_and_test()) { | 92 | if (current_clr_polling_and_test()) { |
90 | local_irq_enable(); | 93 | local_irq_enable(); |
@@ -271,6 +274,12 @@ static void cpu_idle_loop(void) | |||
271 | } | 274 | } |
272 | } | 275 | } |
273 | 276 | ||
277 | bool cpu_in_idle(unsigned long pc) | ||
278 | { | ||
279 | return pc >= (unsigned long)__cpuidle_text_start && | ||
280 | pc < (unsigned long)__cpuidle_text_end; | ||
281 | } | ||
282 | |||
274 | void cpu_startup_entry(enum cpuhp_state state) | 283 | void cpu_startup_entry(enum cpuhp_state state) |
275 | { | 284 | { |
276 | /* | 285 | /* |
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 2ce5458bbe1d..5405d3feb112 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c | |||
@@ -27,8 +27,8 @@ static struct task_struct * | |||
27 | pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) | 27 | pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) |
28 | { | 28 | { |
29 | put_prev_task(rq, prev); | 29 | put_prev_task(rq, prev); |
30 | 30 | update_idle_core(rq); | |
31 | schedstat_inc(rq, sched_goidle); | 31 | schedstat_inc(rq->sched_goidle); |
32 | return rq->idle; | 32 | return rq->idle; |
33 | } | 33 | } |
34 | 34 | ||
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index d5690b722691..2516b8df6dbb 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
@@ -957,9 +957,8 @@ static void update_curr_rt(struct rq *rq) | |||
957 | if (unlikely((s64)delta_exec <= 0)) | 957 | if (unlikely((s64)delta_exec <= 0)) |
958 | return; | 958 | return; |
959 | 959 | ||
960 | /* Kick cpufreq (see the comment in linux/cpufreq.h). */ | 960 | /* Kick cpufreq (see the comment in kernel/sched/sched.h). */ |
961 | if (cpu_of(rq) == smp_processor_id()) | 961 | cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT); |
962 | cpufreq_trigger_update(rq_clock(rq)); | ||
963 | 962 | ||
964 | schedstat_set(curr->se.statistics.exec_max, | 963 | schedstat_set(curr->se.statistics.exec_max, |
965 | max(curr->se.statistics.exec_max, delta_exec)); | 964 | max(curr->se.statistics.exec_max, delta_exec)); |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c64fc5114004..055f935d4421 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -2,6 +2,7 @@ | |||
2 | #include <linux/sched.h> | 2 | #include <linux/sched.h> |
3 | #include <linux/sched/sysctl.h> | 3 | #include <linux/sched/sysctl.h> |
4 | #include <linux/sched/rt.h> | 4 | #include <linux/sched/rt.h> |
5 | #include <linux/u64_stats_sync.h> | ||
5 | #include <linux/sched/deadline.h> | 6 | #include <linux/sched/deadline.h> |
6 | #include <linux/binfmts.h> | 7 | #include <linux/binfmts.h> |
7 | #include <linux/mutex.h> | 8 | #include <linux/mutex.h> |
@@ -15,6 +16,12 @@ | |||
15 | #include "cpudeadline.h" | 16 | #include "cpudeadline.h" |
16 | #include "cpuacct.h" | 17 | #include "cpuacct.h" |
17 | 18 | ||
19 | #ifdef CONFIG_SCHED_DEBUG | ||
20 | #define SCHED_WARN_ON(x) WARN_ONCE(x, #x) | ||
21 | #else | ||
22 | #define SCHED_WARN_ON(x) ((void)(x)) | ||
23 | #endif | ||
24 | |||
18 | struct rq; | 25 | struct rq; |
19 | struct cpuidle_state; | 26 | struct cpuidle_state; |
20 | 27 | ||
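
SCHED_WARN_ON() gives the scheduler a debug-only warning that stringifies its condition and, when CONFIG_SCHED_DEBUG is off, still evaluates it so side effects survive. A userspace sketch of the same macro pattern, with a once-only fprintf standing in for WARN_ONCE():

    #include <stdio.h>

    #define MY_DEBUG 1

    #if MY_DEBUG
    #define MY_WARN_ON(x)                                           \
        do {                                                        \
            static int warned__;                                    \
            if ((x) && !warned__) {                                 \
                warned__ = 1;                                       \
                fprintf(stderr, "warning: %s\n", #x);               \
            }                                                       \
        } while (0)
    #else
    #define MY_WARN_ON(x) ((void)(x))   /* still evaluates x */
    #endif

    int main(void)
    {
        int calls = 0;

        for (int i = 0; i < 3; i++)
            MY_WARN_ON(++calls > 0);    /* warns once, increments 3 times */

        printf("calls=%d\n", calls);
        return 0;
    }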
@@ -565,6 +572,8 @@ struct root_domain { | |||
565 | */ | 572 | */ |
566 | cpumask_var_t rto_mask; | 573 | cpumask_var_t rto_mask; |
567 | struct cpupri cpupri; | 574 | struct cpupri cpupri; |
575 | |||
576 | unsigned long max_cpu_capacity; | ||
568 | }; | 577 | }; |
569 | 578 | ||
570 | extern struct root_domain def_root_domain; | 579 | extern struct root_domain def_root_domain; |
@@ -597,7 +606,6 @@ struct rq { | |||
597 | #ifdef CONFIG_SMP | 606 | #ifdef CONFIG_SMP |
598 | unsigned long last_load_update_tick; | 607 | unsigned long last_load_update_tick; |
599 | #endif /* CONFIG_SMP */ | 608 | #endif /* CONFIG_SMP */ |
600 | u64 nohz_stamp; | ||
601 | unsigned long nohz_flags; | 609 | unsigned long nohz_flags; |
602 | #endif /* CONFIG_NO_HZ_COMMON */ | 610 | #endif /* CONFIG_NO_HZ_COMMON */ |
603 | #ifdef CONFIG_NO_HZ_FULL | 611 | #ifdef CONFIG_NO_HZ_FULL |
@@ -723,6 +731,23 @@ static inline int cpu_of(struct rq *rq) | |||
723 | #endif | 731 | #endif |
724 | } | 732 | } |
725 | 733 | ||
734 | |||
735 | #ifdef CONFIG_SCHED_SMT | ||
736 | |||
737 | extern struct static_key_false sched_smt_present; | ||
738 | |||
739 | extern void __update_idle_core(struct rq *rq); | ||
740 | |||
741 | static inline void update_idle_core(struct rq *rq) | ||
742 | { | ||
743 | if (static_branch_unlikely(&sched_smt_present)) | ||
744 | __update_idle_core(rq); | ||
745 | } | ||
746 | |||
747 | #else | ||
748 | static inline void update_idle_core(struct rq *rq) { } | ||
749 | #endif | ||
750 | |||
726 | DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); | 751 | DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); |
727 | 752 | ||
728 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) | 753 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) |
@@ -857,8 +882,8 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) | |||
857 | DECLARE_PER_CPU(struct sched_domain *, sd_llc); | 882 | DECLARE_PER_CPU(struct sched_domain *, sd_llc); |
858 | DECLARE_PER_CPU(int, sd_llc_size); | 883 | DECLARE_PER_CPU(int, sd_llc_size); |
859 | DECLARE_PER_CPU(int, sd_llc_id); | 884 | DECLARE_PER_CPU(int, sd_llc_id); |
885 | DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); | ||
860 | DECLARE_PER_CPU(struct sched_domain *, sd_numa); | 886 | DECLARE_PER_CPU(struct sched_domain *, sd_numa); |
861 | DECLARE_PER_CPU(struct sched_domain *, sd_busy); | ||
862 | DECLARE_PER_CPU(struct sched_domain *, sd_asym); | 887 | DECLARE_PER_CPU(struct sched_domain *, sd_asym); |
863 | 888 | ||
864 | struct sched_group_capacity { | 889 | struct sched_group_capacity { |
@@ -870,10 +895,6 @@ struct sched_group_capacity { | |||
870 | unsigned int capacity; | 895 | unsigned int capacity; |
871 | unsigned long next_update; | 896 | unsigned long next_update; |
872 | int imbalance; /* XXX unrelated to capacity but shared group state */ | 897 | int imbalance; /* XXX unrelated to capacity but shared group state */ |
873 | /* | ||
874 | * Number of busy cpus in this group. | ||
875 | */ | ||
876 | atomic_t nr_busy_cpus; | ||
877 | 898 | ||
878 | unsigned long cpumask[0]; /* iteration mask */ | 899 | unsigned long cpumask[0]; /* iteration mask */ |
879 | }; | 900 | }; |
@@ -1000,7 +1021,11 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | |||
1000 | * per-task data have been completed by this moment. | 1021 | * per-task data have been completed by this moment. |
1001 | */ | 1022 | */ |
1002 | smp_wmb(); | 1023 | smp_wmb(); |
1024 | #ifdef CONFIG_THREAD_INFO_IN_TASK | ||
1025 | p->cpu = cpu; | ||
1026 | #else | ||
1003 | task_thread_info(p)->cpu = cpu; | 1027 | task_thread_info(p)->cpu = cpu; |
1028 | #endif | ||
1004 | p->wake_cpu = cpu; | 1029 | p->wake_cpu = cpu; |
1005 | #endif | 1030 | #endif |
1006 | } | 1031 | } |
@@ -1260,6 +1285,11 @@ static inline void put_prev_task(struct rq *rq, struct task_struct *prev) | |||
1260 | prev->sched_class->put_prev_task(rq, prev); | 1285 | prev->sched_class->put_prev_task(rq, prev); |
1261 | } | 1286 | } |
1262 | 1287 | ||
1288 | static inline void set_curr_task(struct rq *rq, struct task_struct *curr) | ||
1289 | { | ||
1290 | curr->sched_class->set_curr_task(rq); | ||
1291 | } | ||
1292 | |||
1263 | #define sched_class_highest (&stop_sched_class) | 1293 | #define sched_class_highest (&stop_sched_class) |
1264 | #define for_each_class(class) \ | 1294 | #define for_each_class(class) \ |
1265 | for (class = sched_class_highest; class; class = class->next) | 1295 | for (class = sched_class_highest; class; class = class->next) |
@@ -1290,7 +1320,7 @@ static inline void idle_set_state(struct rq *rq, | |||
1290 | 1320 | ||
1291 | static inline struct cpuidle_state *idle_get_state(struct rq *rq) | 1321 | static inline struct cpuidle_state *idle_get_state(struct rq *rq) |
1292 | { | 1322 | { |
1293 | WARN_ON(!rcu_read_lock_held()); | 1323 | SCHED_WARN_ON(!rcu_read_lock_held()); |
1294 | return rq->idle_state; | 1324 | return rq->idle_state; |
1295 | } | 1325 | } |
1296 | #else | 1326 | #else |
@@ -1710,52 +1740,28 @@ static inline void nohz_balance_exit_idle(unsigned int cpu) { } | |||
1710 | #endif | 1740 | #endif |
1711 | 1741 | ||
1712 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | 1742 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING |
1743 | struct irqtime { | ||
1744 | u64 hardirq_time; | ||
1745 | u64 softirq_time; | ||
1746 | u64 irq_start_time; | ||
1747 | struct u64_stats_sync sync; | ||
1748 | }; | ||
1713 | 1749 | ||
1714 | DECLARE_PER_CPU(u64, cpu_hardirq_time); | 1750 | DECLARE_PER_CPU(struct irqtime, cpu_irqtime); |
1715 | DECLARE_PER_CPU(u64, cpu_softirq_time); | ||
1716 | |||
1717 | #ifndef CONFIG_64BIT | ||
1718 | DECLARE_PER_CPU(seqcount_t, irq_time_seq); | ||
1719 | |||
1720 | static inline void irq_time_write_begin(void) | ||
1721 | { | ||
1722 | __this_cpu_inc(irq_time_seq.sequence); | ||
1723 | smp_wmb(); | ||
1724 | } | ||
1725 | |||
1726 | static inline void irq_time_write_end(void) | ||
1727 | { | ||
1728 | smp_wmb(); | ||
1729 | __this_cpu_inc(irq_time_seq.sequence); | ||
1730 | } | ||
1731 | 1751 | ||
1732 | static inline u64 irq_time_read(int cpu) | 1752 | static inline u64 irq_time_read(int cpu) |
1733 | { | 1753 | { |
1734 | u64 irq_time; | 1754 | struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); |
1735 | unsigned seq; | 1755 | unsigned int seq; |
1756 | u64 total; | ||
1736 | 1757 | ||
1737 | do { | 1758 | do { |
1738 | seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); | 1759 | seq = __u64_stats_fetch_begin(&irqtime->sync); |
1739 | irq_time = per_cpu(cpu_softirq_time, cpu) + | 1760 | total = irqtime->softirq_time + irqtime->hardirq_time; |
1740 | per_cpu(cpu_hardirq_time, cpu); | 1761 | } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); |
1741 | } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); | ||
1742 | |||
1743 | return irq_time; | ||
1744 | } | ||
1745 | #else /* CONFIG_64BIT */ | ||
1746 | static inline void irq_time_write_begin(void) | ||
1747 | { | ||
1748 | } | ||
1749 | |||
1750 | static inline void irq_time_write_end(void) | ||
1751 | { | ||
1752 | } | ||
1753 | 1762 | ||
1754 | static inline u64 irq_time_read(int cpu) | 1763 | return total; |
1755 | { | ||
1756 | return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); | ||
1757 | } | 1764 | } |
1758 | #endif /* CONFIG_64BIT */ | ||
1759 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ | 1765 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ |
1760 | 1766 | ||
1761 | #ifdef CONFIG_CPU_FREQ | 1767 | #ifdef CONFIG_CPU_FREQ |
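
irq_time_read() now sums the per-CPU hardirq and softirq times under a u64_stats_sync retry loop, which is a no-op on 64-bit and a seqcount on 32-bit, replacing the hand-rolled irq_time_seq code it drops. A minimal, single-threaded userspace sketch of the seqcount read protocol (the kernel primitives additionally handle memory ordering and writer exclusion, which this ignores):

    #include <stdio.h>

    struct irqtime_sample {
        unsigned long long hardirq_time;
        unsigned long long softirq_time;
        unsigned int sequence;          /* odd while a writer is mid-update */
    };

    static void writer_update(struct irqtime_sample *s, unsigned long long delta)
    {
        s->sequence++;                  /* begin: sequence becomes odd  */
        s->hardirq_time += delta;
        s->sequence++;                  /* end:   sequence becomes even */
    }

    static unsigned long long reader_total(const struct irqtime_sample *s)
    {
        unsigned int seq;
        unsigned long long total;

        do {
            seq = s->sequence;
            total = s->hardirq_time + s->softirq_time;
        } while (seq & 1 || seq != s->sequence);    /* retry a torn read */

        return total;
    }

    int main(void)
    {
        struct irqtime_sample s = { 100, 50, 0 };

        writer_update(&s, 25);
        printf("irq time total: %llu\n", reader_total(&s));
        return 0;
    }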
@@ -1763,27 +1769,13 @@ DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); | |||
1763 | 1769 | ||
1764 | /** | 1770 | /** |
1765 | * cpufreq_update_util - Take a note about CPU utilization changes. | 1771 | * cpufreq_update_util - Take a note about CPU utilization changes. |
1766 | * @time: Current time. | 1772 | * @rq: Runqueue to carry out the update for. |
1767 | * @util: Current utilization. | 1773 | * @flags: Update reason flags. |
1768 | * @max: Utilization ceiling. | ||
1769 | * | 1774 | * |
1770 | * This function is called by the scheduler on every invocation of | 1775 | * This function is called by the scheduler on the CPU whose utilization is |
1771 | * update_load_avg() on the CPU whose utilization is being updated. | 1776 | * being updated. |
1772 | * | 1777 | * |
1773 | * It can only be called from RCU-sched read-side critical sections. | 1778 | * It can only be called from RCU-sched read-side critical sections. |
1774 | */ | ||
1775 | static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) | ||
1776 | { | ||
1777 | struct update_util_data *data; | ||
1778 | |||
1779 | data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); | ||
1780 | if (data) | ||
1781 | data->func(data, time, util, max); | ||
1782 | } | ||
1783 | |||
1784 | /** | ||
1785 | * cpufreq_trigger_update - Trigger CPU performance state evaluation if needed. | ||
1786 | * @time: Current time. | ||
1787 | * | 1779 | * |
1788 | * The way cpufreq is currently arranged requires it to evaluate the CPU | 1780 | * The way cpufreq is currently arranged requires it to evaluate the CPU |
1789 | * performance state (frequency/voltage) on a regular basis to prevent it from | 1781 | * performance state (frequency/voltage) on a regular basis to prevent it from |
@@ -1797,13 +1789,23 @@ static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned lo | |||
1797 | * but that really is a band-aid. Going forward it should be replaced with | 1789 | * but that really is a band-aid. Going forward it should be replaced with |
1798 | * solutions targeted more specifically at RT and DL tasks. | 1790 | * solutions targeted more specifically at RT and DL tasks. |
1799 | */ | 1791 | */ |
1800 | static inline void cpufreq_trigger_update(u64 time) | 1792 | static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) |
1793 | { | ||
1794 | struct update_util_data *data; | ||
1795 | |||
1796 | data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); | ||
1797 | if (data) | ||
1798 | data->func(data, rq_clock(rq), flags); | ||
1799 | } | ||
1800 | |||
1801 | static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) | ||
1801 | { | 1802 | { |
1802 | cpufreq_update_util(time, ULONG_MAX, 0); | 1803 | if (cpu_of(rq) == smp_processor_id()) |
1804 | cpufreq_update_util(rq, flags); | ||
1803 | } | 1805 | } |
1804 | #else | 1806 | #else |
1805 | static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) {} | 1807 | static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} |
1806 | static inline void cpufreq_trigger_update(u64 time) {} | 1808 | static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) {} |
1807 | #endif /* CONFIG_CPU_FREQ */ | 1809 | #endif /* CONFIG_CPU_FREQ */ |
1808 | 1810 | ||
1809 | #ifdef arch_scale_freq_capacity | 1811 | #ifdef arch_scale_freq_capacity |
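
The cpufreq hook now takes a runqueue and reason flags (e.g. SCHED_CPUFREQ_RT, SCHED_CPUFREQ_IOWAIT) instead of a precomputed util/max pair, and cpufreq_update_this_cpu() filters out remote runqueues; the registered callback is expected to look up utilization itself. A small userspace sketch of that callback shape, using a plain function pointer where the kernel uses a per-CPU, RCU-protected pointer (flag values here are illustrative):

    #include <stdio.h>

    #define MY_CPUFREQ_RT     (1U << 0) /* illustrative flag values */
    #define MY_CPUFREQ_IOWAIT (1U << 1)

    struct update_util_hook {
        void (*func)(struct update_util_hook *h, unsigned long long time,
                     unsigned int flags);
    };

    static struct update_util_hook *registered_hook;

    static void governor_cb(struct update_util_hook *h, unsigned long long time,
                            unsigned int flags)
    {
        /* A real governor would look up CPU utilization here. */
        printf("t=%llu flags=%#x%s\n", time, flags,
               (flags & MY_CPUFREQ_IOWAIT) ? " (iowait boost)" : "");
    }

    static void update_util(unsigned long long now, unsigned int flags)
    {
        if (registered_hook)
            registered_hook->func(registered_hook, now, flags);
    }

    int main(void)
    {
        static struct update_util_hook hook = { .func = governor_cb };

        registered_hook = &hook;
        update_util(1000, 0);
        update_util(2000, MY_CPUFREQ_IOWAIT);
        return 0;
    }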
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 78955cbea31c..34659a853505 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h | |||
@@ -29,11 +29,12 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) | |||
29 | if (rq) | 29 | if (rq) |
30 | rq->rq_sched_info.run_delay += delta; | 30 | rq->rq_sched_info.run_delay += delta; |
31 | } | 31 | } |
32 | # define schedstat_enabled() static_branch_unlikely(&sched_schedstats) | 32 | #define schedstat_enabled() static_branch_unlikely(&sched_schedstats) |
33 | # define schedstat_inc(rq, field) do { if (schedstat_enabled()) { (rq)->field++; } } while (0) | 33 | #define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0) |
34 | # define schedstat_add(rq, field, amt) do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0) | 34 | #define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0) |
35 | # define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) | 35 | #define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) |
36 | # define schedstat_val(rq, field) ((schedstat_enabled()) ? (rq)->field : 0) | 36 | #define schedstat_val(var) (var) |
37 | #define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0) | ||
37 | 38 | ||
38 | #else /* !CONFIG_SCHEDSTATS */ | 39 | #else /* !CONFIG_SCHEDSTATS */ |
39 | static inline void | 40 | static inline void |
@@ -45,12 +46,13 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) | |||
45 | static inline void | 46 | static inline void |
46 | rq_sched_info_depart(struct rq *rq, unsigned long long delta) | 47 | rq_sched_info_depart(struct rq *rq, unsigned long long delta) |
47 | {} | 48 | {} |
48 | # define schedstat_enabled() 0 | 49 | #define schedstat_enabled() 0 |
49 | # define schedstat_inc(rq, field) do { } while (0) | 50 | #define schedstat_inc(var) do { } while (0) |
50 | # define schedstat_add(rq, field, amt) do { } while (0) | 51 | #define schedstat_add(var, amt) do { } while (0) |
51 | # define schedstat_set(var, val) do { } while (0) | 52 | #define schedstat_set(var, val) do { } while (0) |
52 | # define schedstat_val(rq, field) 0 | 53 | #define schedstat_val(var) 0 |
53 | #endif | 54 | #define schedstat_val_or_zero(var) 0 |
55 | #endif /* CONFIG_SCHEDSTATS */ | ||
54 | 56 | ||
55 | #ifdef CONFIG_SCHED_INFO | 57 | #ifdef CONFIG_SCHED_INFO |
56 | static inline void sched_info_reset_dequeued(struct task_struct *t) | 58 | static inline void sched_info_reset_dequeued(struct task_struct *t) |
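With the stats.h change the schedstat helpers take the complete lvalue instead of an (rq, field) pair, so call sites spell out the exact expression they touch. A hypothetical scheduler-internal call site (assumes kernel/sched/sched.h and stats.h), shown before and after in the comments:

/* Sketch of a call site inside kernel/sched/, not an actual kernel function. */
static void demo_account(struct rq *rq, struct task_struct *p)
{
	schedstat_inc(rq->yld_count);		/* was: schedstat_inc(rq, yld_count); */
	schedstat_add(rq->rq_cpu_time, 1000);	/* was: schedstat_add(rq, rq_cpu_time, 1000); */
	schedstat_set(p->se.statistics.wait_start, rq_clock(rq));

	/* schedstat_val_or_zero() evaluates to 0 when schedstats are off,
	 * so it is safe to use in conditions: */
	if (schedstat_val_or_zero(p->se.statistics.wait_max) < 1000)
		schedstat_set(p->se.statistics.wait_max, 1000);
}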
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index f15d6b6a538a..9453efe9b25a 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c | |||
@@ -196,27 +196,48 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) | |||
196 | } | 196 | } |
197 | EXPORT_SYMBOL(prepare_to_wait_exclusive); | 197 | EXPORT_SYMBOL(prepare_to_wait_exclusive); |
198 | 198 | ||
199 | long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state) | 199 | void init_wait_entry(wait_queue_t *wait, int flags) |
200 | { | 200 | { |
201 | unsigned long flags; | 201 | wait->flags = flags; |
202 | |||
203 | if (signal_pending_state(state, current)) | ||
204 | return -ERESTARTSYS; | ||
205 | |||
206 | wait->private = current; | 202 | wait->private = current; |
207 | wait->func = autoremove_wake_function; | 203 | wait->func = autoremove_wake_function; |
204 | INIT_LIST_HEAD(&wait->task_list); | ||
205 | } | ||
206 | EXPORT_SYMBOL(init_wait_entry); | ||
207 | |||
208 | long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state) | ||
209 | { | ||
210 | unsigned long flags; | ||
211 | long ret = 0; | ||
208 | 212 | ||
209 | spin_lock_irqsave(&q->lock, flags); | 213 | spin_lock_irqsave(&q->lock, flags); |
210 | if (list_empty(&wait->task_list)) { | 214 | if (unlikely(signal_pending_state(state, current))) { |
211 | if (wait->flags & WQ_FLAG_EXCLUSIVE) | 215 | /* |
212 | __add_wait_queue_tail(q, wait); | 216 | * Exclusive waiter must not fail if it was selected by wakeup; |
213 | else | 217 | * it should "consume" the condition we were waiting for. |
214 | __add_wait_queue(q, wait); | 218 | * |
219 | * The caller will recheck the condition and return success if | ||
220 | * we were already woken up; we cannot miss the event because | ||
221 | * wakeup locks/unlocks the same q->lock. | ||
222 | * | ||
223 | * But we need to ensure that set-condition + wakeup after that | ||
224 | * can't see us, it should wake up another exclusive waiter if | ||
225 | * we fail. | ||
226 | */ | ||
227 | list_del_init(&wait->task_list); | ||
228 | ret = -ERESTARTSYS; | ||
229 | } else { | ||
230 | if (list_empty(&wait->task_list)) { | ||
231 | if (wait->flags & WQ_FLAG_EXCLUSIVE) | ||
232 | __add_wait_queue_tail(q, wait); | ||
233 | else | ||
234 | __add_wait_queue(q, wait); | ||
235 | } | ||
236 | set_current_state(state); | ||
215 | } | 237 | } |
216 | set_current_state(state); | ||
217 | spin_unlock_irqrestore(&q->lock, flags); | 238 | spin_unlock_irqrestore(&q->lock, flags); |
218 | 239 | ||
219 | return 0; | 240 | return ret; |
220 | } | 241 | } |
221 | EXPORT_SYMBOL(prepare_to_wait_event); | 242 | EXPORT_SYMBOL(prepare_to_wait_event); |
222 | 243 | ||
@@ -255,39 +276,6 @@ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) | |||
255 | } | 276 | } |
256 | EXPORT_SYMBOL(finish_wait); | 277 | EXPORT_SYMBOL(finish_wait); |
257 | 278 | ||
258 | /** | ||
259 | * abort_exclusive_wait - abort exclusive waiting in a queue | ||
260 | * @q: waitqueue waited on | ||
261 | * @wait: wait descriptor | ||
262 | * @mode: runstate of the waiter to be woken | ||
263 | * @key: key to identify a wait bit queue or %NULL | ||
264 | * | ||
265 | * Sets current thread back to running state and removes | ||
266 | * the wait descriptor from the given waitqueue if still | ||
267 | * queued. | ||
268 | * | ||
269 | * Wakes up the next waiter if the caller is concurrently | ||
270 | * woken up through the queue. | ||
271 | * | ||
272 | * This prevents waiter starvation where an exclusive waiter | ||
273 | * aborts and is woken up concurrently and no one wakes up | ||
274 | * the next waiter. | ||
275 | */ | ||
276 | void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, | ||
277 | unsigned int mode, void *key) | ||
278 | { | ||
279 | unsigned long flags; | ||
280 | |||
281 | __set_current_state(TASK_RUNNING); | ||
282 | spin_lock_irqsave(&q->lock, flags); | ||
283 | if (!list_empty(&wait->task_list)) | ||
284 | list_del_init(&wait->task_list); | ||
285 | else if (waitqueue_active(q)) | ||
286 | __wake_up_locked_key(q, mode, key); | ||
287 | spin_unlock_irqrestore(&q->lock, flags); | ||
288 | } | ||
289 | EXPORT_SYMBOL(abort_exclusive_wait); | ||
290 | |||
291 | int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) | 279 | int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) |
292 | { | 280 | { |
293 | int ret = default_wake_function(wait, mode, sync, key); | 281 | int ret = default_wake_function(wait, mode, sync, key); |
@@ -425,20 +413,29 @@ int __sched | |||
425 | __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, | 413 | __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, |
426 | wait_bit_action_f *action, unsigned mode) | 414 | wait_bit_action_f *action, unsigned mode) |
427 | { | 415 | { |
428 | do { | 416 | int ret = 0; |
429 | int ret; | ||
430 | 417 | ||
418 | for (;;) { | ||
431 | prepare_to_wait_exclusive(wq, &q->wait, mode); | 419 | prepare_to_wait_exclusive(wq, &q->wait, mode); |
432 | if (!test_bit(q->key.bit_nr, q->key.flags)) | 420 | if (test_bit(q->key.bit_nr, q->key.flags)) { |
433 | continue; | 421 | ret = action(&q->key, mode); |
434 | ret = action(&q->key, mode); | 422 | /* |
435 | if (!ret) | 423 | * See the comment in prepare_to_wait_event(). |
436 | continue; | 424 | * finish_wait() does not necessarily take wq->lock, |
437 | abort_exclusive_wait(wq, &q->wait, mode, &q->key); | 425 | * but test_and_set_bit() implies mb() which pairs with |
438 | return ret; | 426 | * smp_mb__after_atomic() before wake_up_page(). |
439 | } while (test_and_set_bit(q->key.bit_nr, q->key.flags)); | 427 | */ |
440 | finish_wait(wq, &q->wait); | 428 | if (ret) |
441 | return 0; | 429 | finish_wait(wq, &q->wait); |
430 | } | ||
431 | if (!test_and_set_bit(q->key.bit_nr, q->key.flags)) { | ||
432 | if (!ret) | ||
433 | finish_wait(wq, &q->wait); | ||
434 | return 0; | ||
435 | } else if (ret) { | ||
436 | return ret; | ||
437 | } | ||
438 | } | ||
442 | } | 439 | } |
443 | EXPORT_SYMBOL(__wait_on_bit_lock); | 440 | EXPORT_SYMBOL(__wait_on_bit_lock); |
444 | 441 | ||
@@ -483,16 +480,6 @@ void wake_up_bit(void *word, int bit) | |||
483 | } | 480 | } |
484 | EXPORT_SYMBOL(wake_up_bit); | 481 | EXPORT_SYMBOL(wake_up_bit); |
485 | 482 | ||
486 | wait_queue_head_t *bit_waitqueue(void *word, int bit) | ||
487 | { | ||
488 | const int shift = BITS_PER_LONG == 32 ? 5 : 6; | ||
489 | const struct zone *zone = page_zone(virt_to_page(word)); | ||
490 | unsigned long val = (unsigned long)word << shift | bit; | ||
491 | |||
492 | return &zone->wait_table[hash_long(val, zone->wait_table_bits)]; | ||
493 | } | ||
494 | EXPORT_SYMBOL(bit_waitqueue); | ||
495 | |||
496 | /* | 483 | /* |
497 | * Manipulate the atomic_t address to produce a better bit waitqueue table hash | 484 | * Manipulate the atomic_t address to produce a better bit waitqueue table hash |
498 | * index (we're keying off bit -1, but that would produce a horrible hash | 485 | * index (we're keying off bit -1, but that would produce a horrible hash |
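The reworked prepare_to_wait_event() checks for pending signals under q->lock and either dequeues the waiter or lets the caller consume an already-delivered wakeup, which is what makes abort_exclusive_wait() unnecessary. A sketch of the caller-side loop this serves, roughly what the ___wait_event() machinery expands to; demo_wq and demo_condition are hypothetical:

#include <linux/wait.h>
#include <linux/sched.h>

static DECLARE_WAIT_QUEUE_HEAD(demo_wq);
static bool demo_condition;

static long demo_wait(void)
{
	wait_queue_t wait;
	long ret = 0;

	init_wait_entry(&wait, WQ_FLAG_EXCLUSIVE);
	for (;;) {
		ret = prepare_to_wait_event(&demo_wq, &wait, TASK_INTERRUPTIBLE);
		/* Recheck the condition even when ret is -ERESTARTSYS: if a
		 * waker already picked this exclusive waiter, the event must
		 * be consumed rather than dropped. */
		if (demo_condition)
			break;
		if (ret)
			break;
		schedule();
	}
	finish_wait(&demo_wq, &wait);
	return demo_condition ? 0 : ret;
}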
diff --git a/kernel/signal.c b/kernel/signal.c index af21afc00d08..75761acc77cf 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -3044,6 +3044,11 @@ void kernel_sigaction(int sig, __sighandler_t action) | |||
3044 | } | 3044 | } |
3045 | EXPORT_SYMBOL(kernel_sigaction); | 3045 | EXPORT_SYMBOL(kernel_sigaction); |
3046 | 3046 | ||
3047 | void __weak sigaction_compat_abi(struct k_sigaction *act, | ||
3048 | struct k_sigaction *oact) | ||
3049 | { | ||
3050 | } | ||
3051 | |||
3047 | int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) | 3052 | int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) |
3048 | { | 3053 | { |
3049 | struct task_struct *p = current, *t; | 3054 | struct task_struct *p = current, *t; |
@@ -3059,6 +3064,8 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) | |||
3059 | if (oact) | 3064 | if (oact) |
3060 | *oact = *k; | 3065 | *oact = *k; |
3061 | 3066 | ||
3067 | sigaction_compat_abi(act, oact); | ||
3068 | |||
3062 | if (act) { | 3069 | if (act) { |
3063 | sigdelsetmask(&act->sa.sa_mask, | 3070 | sigdelsetmask(&act->sa.sa_mask, |
3064 | sigmask(SIGKILL) | sigmask(SIGSTOP)); | 3071 | sigmask(SIGKILL) | sigmask(SIGSTOP)); |
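The new __weak sigaction_compat_abi() hook is a no-op by default and gives an architecture a place to adjust a sigaction for compat callers (x86 uses it to tag sa_flags for 32-bit/x32 tasks). A hypothetical arch-side override might look like this:

#include <linux/signal.h>

/* Hypothetical architecture override; the generic __weak definition in
 * kernel/signal.c stays empty for everyone else. */
void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact)
{
	if (!act)
		return;

	/* e.g. record the caller's ABI in act->sa.sa_flags so the
	 * signal-delivery path can pick the matching frame layout */
}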
diff --git a/kernel/smp.c b/kernel/smp.c index 3aa642d39c03..bba3b201668d 100644 --- a/kernel/smp.c +++ b/kernel/smp.c | |||
@@ -14,6 +14,7 @@ | |||
14 | #include <linux/smp.h> | 14 | #include <linux/smp.h> |
15 | #include <linux/cpu.h> | 15 | #include <linux/cpu.h> |
16 | #include <linux/sched.h> | 16 | #include <linux/sched.h> |
17 | #include <linux/hypervisor.h> | ||
17 | 18 | ||
18 | #include "smpboot.h" | 19 | #include "smpboot.h" |
19 | 20 | ||
@@ -724,3 +725,54 @@ void wake_up_all_idle_cpus(void) | |||
724 | preempt_enable(); | 725 | preempt_enable(); |
725 | } | 726 | } |
726 | EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus); | 727 | EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus); |
728 | |||
729 | /** | ||
730 | * smp_call_on_cpu - Call a function on a specific cpu | ||
731 | * | ||
732 | * Used to call a function on a specific cpu and wait for it to return. | ||
733 | * Optionally make sure the call is done on a specified physical cpu via vcpu | ||
734 | * pinning in order to support virtualized environments. | ||
735 | */ | ||
736 | struct smp_call_on_cpu_struct { | ||
737 | struct work_struct work; | ||
738 | struct completion done; | ||
739 | int (*func)(void *); | ||
740 | void *data; | ||
741 | int ret; | ||
742 | int cpu; | ||
743 | }; | ||
744 | |||
745 | static void smp_call_on_cpu_callback(struct work_struct *work) | ||
746 | { | ||
747 | struct smp_call_on_cpu_struct *sscs; | ||
748 | |||
749 | sscs = container_of(work, struct smp_call_on_cpu_struct, work); | ||
750 | if (sscs->cpu >= 0) | ||
751 | hypervisor_pin_vcpu(sscs->cpu); | ||
752 | sscs->ret = sscs->func(sscs->data); | ||
753 | if (sscs->cpu >= 0) | ||
754 | hypervisor_pin_vcpu(-1); | ||
755 | |||
756 | complete(&sscs->done); | ||
757 | } | ||
758 | |||
759 | int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys) | ||
760 | { | ||
761 | struct smp_call_on_cpu_struct sscs = { | ||
762 | .done = COMPLETION_INITIALIZER_ONSTACK(sscs.done), | ||
763 | .func = func, | ||
764 | .data = par, | ||
765 | .cpu = phys ? cpu : -1, | ||
766 | }; | ||
767 | |||
768 | INIT_WORK_ONSTACK(&sscs.work, smp_call_on_cpu_callback); | ||
769 | |||
770 | if (cpu >= nr_cpu_ids || !cpu_online(cpu)) | ||
771 | return -ENXIO; | ||
772 | |||
773 | queue_work_on(cpu, system_wq, &sscs.work); | ||
774 | wait_for_completion(&sscs.done); | ||
775 | |||
776 | return sscs.ret; | ||
777 | } | ||
778 | EXPORT_SYMBOL_GPL(smp_call_on_cpu); | ||
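A hypothetical caller of the new smp_call_on_cpu() helper; the function runs in workqueue context on the requested CPU, and phys=true additionally asks the hypervisor (via hypervisor_pin_vcpu()) to pin the backing vCPU for the duration of the call:

#include <linux/smp.h>

/* Runs in workqueue context on the requested CPU. */
static int demo_probe_fn(void *data)
{
	int *counter = data;

	(*counter)++;
	return 0;
}

static int demo_probe(unsigned int cpu)
{
	int hits = 0;
	int ret;

	/* phys=false: just run on @cpu; phys=true would also pin the vCPU
	 * to physical CPU @cpu in a virtualized environment. */
	ret = smp_call_on_cpu(cpu, demo_probe_fn, &hits, false);
	return ret ? ret : hits;
}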
diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 13bc43d1fb22..4a5c6e73ecd4 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c | |||
@@ -186,6 +186,11 @@ __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu) | |||
186 | kfree(td); | 186 | kfree(td); |
187 | return PTR_ERR(tsk); | 187 | return PTR_ERR(tsk); |
188 | } | 188 | } |
189 | /* | ||
190 | * Park the thread so that it can start right on the CPU | ||
191 | * when it is available. | ||
192 | */ | ||
193 | kthread_park(tsk); | ||
189 | get_task_struct(tsk); | 194 | get_task_struct(tsk); |
190 | *per_cpu_ptr(ht->store, cpu) = tsk; | 195 | *per_cpu_ptr(ht->store, cpu) = tsk; |
191 | if (ht->create) { | 196 | if (ht->create) { |
diff --git a/kernel/softirq.c b/kernel/softirq.c index 17caf4b63342..744fa611cae0 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
@@ -58,7 +58,7 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp | |||
58 | DEFINE_PER_CPU(struct task_struct *, ksoftirqd); | 58 | DEFINE_PER_CPU(struct task_struct *, ksoftirqd); |
59 | 59 | ||
60 | const char * const softirq_to_name[NR_SOFTIRQS] = { | 60 | const char * const softirq_to_name[NR_SOFTIRQS] = { |
61 | "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", | 61 | "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "IRQ_POLL", |
62 | "TASKLET", "SCHED", "HRTIMER", "RCU" | 62 | "TASKLET", "SCHED", "HRTIMER", "RCU" |
63 | }; | 63 | }; |
64 | 64 | ||
@@ -78,6 +78,17 @@ static void wakeup_softirqd(void) | |||
78 | } | 78 | } |
79 | 79 | ||
80 | /* | 80 | /* |
81 | * If ksoftirqd is scheduled, we do not want to process pending softirqs | ||
82 | * right now. Let ksoftirqd handle this at its own rate, to get fairness. | ||
83 | */ | ||
84 | static bool ksoftirqd_running(void) | ||
85 | { | ||
86 | struct task_struct *tsk = __this_cpu_read(ksoftirqd); | ||
87 | |||
88 | return tsk && (tsk->state == TASK_RUNNING); | ||
89 | } | ||
90 | |||
91 | /* | ||
81 | * preempt_count and SOFTIRQ_OFFSET usage: | 92 | * preempt_count and SOFTIRQ_OFFSET usage: |
82 | * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving | 93 | * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving |
83 | * softirq processing. | 94 | * softirq processing. |
@@ -313,7 +324,7 @@ asmlinkage __visible void do_softirq(void) | |||
313 | 324 | ||
314 | pending = local_softirq_pending(); | 325 | pending = local_softirq_pending(); |
315 | 326 | ||
316 | if (pending) | 327 | if (pending && !ksoftirqd_running()) |
317 | do_softirq_own_stack(); | 328 | do_softirq_own_stack(); |
318 | 329 | ||
319 | local_irq_restore(flags); | 330 | local_irq_restore(flags); |
@@ -340,6 +351,9 @@ void irq_enter(void) | |||
340 | 351 | ||
341 | static inline void invoke_softirq(void) | 352 | static inline void invoke_softirq(void) |
342 | { | 353 | { |
354 | if (ksoftirqd_running()) | ||
355 | return; | ||
356 | |||
343 | if (!force_irqthreads) { | 357 | if (!force_irqthreads) { |
344 | #ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK | 358 | #ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK |
345 | /* | 359 | /* |
@@ -482,7 +496,7 @@ void __tasklet_hi_schedule_first(struct tasklet_struct *t) | |||
482 | } | 496 | } |
483 | EXPORT_SYMBOL(__tasklet_hi_schedule_first); | 497 | EXPORT_SYMBOL(__tasklet_hi_schedule_first); |
484 | 498 | ||
485 | static void tasklet_action(struct softirq_action *a) | 499 | static __latent_entropy void tasklet_action(struct softirq_action *a) |
486 | { | 500 | { |
487 | struct tasklet_struct *list; | 501 | struct tasklet_struct *list; |
488 | 502 | ||
@@ -518,7 +532,7 @@ static void tasklet_action(struct softirq_action *a) | |||
518 | } | 532 | } |
519 | } | 533 | } |
520 | 534 | ||
521 | static void tasklet_hi_action(struct softirq_action *a) | 535 | static __latent_entropy void tasklet_hi_action(struct softirq_action *a) |
522 | { | 536 | { |
523 | struct tasklet_struct *list; | 537 | struct tasklet_struct *list; |
524 | 538 | ||
@@ -700,7 +714,7 @@ void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu) | |||
700 | BUG(); | 714 | BUG(); |
701 | } | 715 | } |
702 | 716 | ||
703 | static void takeover_tasklets(unsigned int cpu) | 717 | static int takeover_tasklets(unsigned int cpu) |
704 | { | 718 | { |
705 | /* CPU is dead, so no lock needed. */ | 719 | /* CPU is dead, so no lock needed. */ |
706 | local_irq_disable(); | 720 | local_irq_disable(); |
@@ -723,27 +737,12 @@ static void takeover_tasklets(unsigned int cpu) | |||
723 | raise_softirq_irqoff(HI_SOFTIRQ); | 737 | raise_softirq_irqoff(HI_SOFTIRQ); |
724 | 738 | ||
725 | local_irq_enable(); | 739 | local_irq_enable(); |
740 | return 0; | ||
726 | } | 741 | } |
742 | #else | ||
743 | #define takeover_tasklets NULL | ||
727 | #endif /* CONFIG_HOTPLUG_CPU */ | 744 | #endif /* CONFIG_HOTPLUG_CPU */ |
728 | 745 | ||
729 | static int cpu_callback(struct notifier_block *nfb, unsigned long action, | ||
730 | void *hcpu) | ||
731 | { | ||
732 | switch (action) { | ||
733 | #ifdef CONFIG_HOTPLUG_CPU | ||
734 | case CPU_DEAD: | ||
735 | case CPU_DEAD_FROZEN: | ||
736 | takeover_tasklets((unsigned long)hcpu); | ||
737 | break; | ||
738 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
739 | } | ||
740 | return NOTIFY_OK; | ||
741 | } | ||
742 | |||
743 | static struct notifier_block cpu_nfb = { | ||
744 | .notifier_call = cpu_callback | ||
745 | }; | ||
746 | |||
747 | static struct smp_hotplug_thread softirq_threads = { | 746 | static struct smp_hotplug_thread softirq_threads = { |
748 | .store = &ksoftirqd, | 747 | .store = &ksoftirqd, |
749 | .thread_should_run = ksoftirqd_should_run, | 748 | .thread_should_run = ksoftirqd_should_run, |
@@ -753,8 +752,8 @@ static struct smp_hotplug_thread softirq_threads = { | |||
753 | 752 | ||
754 | static __init int spawn_ksoftirqd(void) | 753 | static __init int spawn_ksoftirqd(void) |
755 | { | 754 | { |
756 | register_cpu_notifier(&cpu_nfb); | 755 | cpuhp_setup_state_nocalls(CPUHP_SOFTIRQ_DEAD, "softirq:dead", NULL, |
757 | 756 | takeover_tasklets); | |
758 | BUG_ON(smpboot_register_percpu_thread(&softirq_threads)); | 757 | BUG_ON(smpboot_register_percpu_thread(&softirq_threads)); |
759 | 758 | ||
760 | return 0; | 759 | return 0; |
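The spawn_ksoftirqd() change is part of the wider conversion from CPU notifiers to hotplug states: the old CPU_DEAD notifier becomes the teardown callback of a dedicated state. The same pattern for a hypothetical subsystem; CPUHP_DEMO_DEAD stands in for an enum cpuhp_state entry that would have to be added, just as CPUHP_SOFTIRQ_DEAD was:

#include <linux/cpuhotplug.h>

/* Teardown callback: runs on a surviving CPU after @cpu has gone down,
 * so it can migrate whatever per-CPU work the dead CPU left behind. */
static int demo_cpu_dead(unsigned int cpu)
{
	return 0;
}

static int __init demo_hotplug_init(void)
{
	/* NULL startup callback, teardown only, and no calls for CPUs that
	 * are already present -- mirroring the "softirq:dead" registration. */
	return cpuhp_setup_state_nocalls(CPUHP_DEMO_DEAD, "demo:dead",
					 NULL, demo_cpu_dead);
}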
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 4a1ca5f6da7e..ec9ab2f01489 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c | |||
@@ -20,7 +20,6 @@ | |||
20 | #include <linux/kallsyms.h> | 20 | #include <linux/kallsyms.h> |
21 | #include <linux/smpboot.h> | 21 | #include <linux/smpboot.h> |
22 | #include <linux/atomic.h> | 22 | #include <linux/atomic.h> |
23 | #include <linux/lglock.h> | ||
24 | #include <linux/nmi.h> | 23 | #include <linux/nmi.h> |
25 | 24 | ||
26 | /* | 25 | /* |
@@ -47,13 +46,9 @@ struct cpu_stopper { | |||
47 | static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); | 46 | static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); |
48 | static bool stop_machine_initialized = false; | 47 | static bool stop_machine_initialized = false; |
49 | 48 | ||
50 | /* | 49 | /* static data for stop_cpus */ |
51 | * Avoids a race between stop_two_cpus and global stop_cpus, where | 50 | static DEFINE_MUTEX(stop_cpus_mutex); |
52 | * the stoppers could get queued up in reverse order, leading to | 51 | static bool stop_cpus_in_progress; |
53 | * system deadlock. Using an lglock means stop_two_cpus remains | ||
54 | * relatively cheap. | ||
55 | */ | ||
56 | DEFINE_STATIC_LGLOCK(stop_cpus_lock); | ||
57 | 52 | ||
58 | static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) | 53 | static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) |
59 | { | 54 | { |
@@ -126,6 +121,11 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg) | |||
126 | cpu_stop_init_done(&done, 1); | 121 | cpu_stop_init_done(&done, 1); |
127 | if (!cpu_stop_queue_work(cpu, &work)) | 122 | if (!cpu_stop_queue_work(cpu, &work)) |
128 | return -ENOENT; | 123 | return -ENOENT; |
124 | /* | ||
125 | * In case @cpu == smp_proccessor_id() we can avoid a sleep+wakeup | ||
126 | * cycle by doing a preemption: | ||
127 | */ | ||
128 | cond_resched(); | ||
129 | wait_for_completion(&done.completion); | 129 | wait_for_completion(&done.completion); |
130 | return done.ret; | 130 | return done.ret; |
131 | } | 131 | } |
@@ -230,14 +230,26 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1, | |||
230 | struct cpu_stopper *stopper1 = per_cpu_ptr(&cpu_stopper, cpu1); | 230 | struct cpu_stopper *stopper1 = per_cpu_ptr(&cpu_stopper, cpu1); |
231 | struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2); | 231 | struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2); |
232 | int err; | 232 | int err; |
233 | 233 | retry: | |
234 | lg_double_lock(&stop_cpus_lock, cpu1, cpu2); | ||
235 | spin_lock_irq(&stopper1->lock); | 234 | spin_lock_irq(&stopper1->lock); |
236 | spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING); | 235 | spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING); |
237 | 236 | ||
238 | err = -ENOENT; | 237 | err = -ENOENT; |
239 | if (!stopper1->enabled || !stopper2->enabled) | 238 | if (!stopper1->enabled || !stopper2->enabled) |
240 | goto unlock; | 239 | goto unlock; |
240 | /* | ||
241 | * Ensure that if we race with __stop_cpus() the stoppers won't get | ||
242 | * queued up in reverse order leading to system deadlock. | ||
243 | * | ||
244 | * We can't miss stop_cpus_in_progress if queue_stop_cpus_work() has | ||
245 | * queued a work on cpu1 but not on cpu2; we hold both locks. | ||
246 | * | ||
247 | * It can be falsely true but it is safe to spin until it is cleared, | ||
248 | * queue_stop_cpus_work() does everything under preempt_disable(). | ||
249 | */ | ||
250 | err = -EDEADLK; | ||
251 | if (unlikely(stop_cpus_in_progress)) | ||
252 | goto unlock; | ||
241 | 253 | ||
242 | err = 0; | 254 | err = 0; |
243 | __cpu_stop_queue_work(stopper1, work1); | 255 | __cpu_stop_queue_work(stopper1, work1); |
@@ -245,8 +257,12 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1, | |||
245 | unlock: | 257 | unlock: |
246 | spin_unlock(&stopper2->lock); | 258 | spin_unlock(&stopper2->lock); |
247 | spin_unlock_irq(&stopper1->lock); | 259 | spin_unlock_irq(&stopper1->lock); |
248 | lg_double_unlock(&stop_cpus_lock, cpu1, cpu2); | ||
249 | 260 | ||
261 | if (unlikely(err == -EDEADLK)) { | ||
262 | while (stop_cpus_in_progress) | ||
263 | cpu_relax(); | ||
264 | goto retry; | ||
265 | } | ||
250 | return err; | 266 | return err; |
251 | } | 267 | } |
252 | /** | 268 | /** |
@@ -316,9 +332,6 @@ bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, | |||
316 | return cpu_stop_queue_work(cpu, work_buf); | 332 | return cpu_stop_queue_work(cpu, work_buf); |
317 | } | 333 | } |
318 | 334 | ||
319 | /* static data for stop_cpus */ | ||
320 | static DEFINE_MUTEX(stop_cpus_mutex); | ||
321 | |||
322 | static bool queue_stop_cpus_work(const struct cpumask *cpumask, | 335 | static bool queue_stop_cpus_work(const struct cpumask *cpumask, |
323 | cpu_stop_fn_t fn, void *arg, | 336 | cpu_stop_fn_t fn, void *arg, |
324 | struct cpu_stop_done *done) | 337 | struct cpu_stop_done *done) |
@@ -332,7 +345,8 @@ static bool queue_stop_cpus_work(const struct cpumask *cpumask, | |||
332 | * preempted by a stopper which might wait for other stoppers | 345 | * preempted by a stopper which might wait for other stoppers |
333 | * to enter @fn which can lead to deadlock. | 346 | * to enter @fn which can lead to deadlock. |
334 | */ | 347 | */ |
335 | lg_global_lock(&stop_cpus_lock); | 348 | preempt_disable(); |
349 | stop_cpus_in_progress = true; | ||
336 | for_each_cpu(cpu, cpumask) { | 350 | for_each_cpu(cpu, cpumask) { |
337 | work = &per_cpu(cpu_stopper.stop_work, cpu); | 351 | work = &per_cpu(cpu_stopper.stop_work, cpu); |
338 | work->fn = fn; | 352 | work->fn = fn; |
@@ -341,7 +355,8 @@ static bool queue_stop_cpus_work(const struct cpumask *cpumask, | |||
341 | if (cpu_stop_queue_work(cpu, work)) | 355 | if (cpu_stop_queue_work(cpu, work)) |
342 | queued = true; | 356 | queued = true; |
343 | } | 357 | } |
344 | lg_global_unlock(&stop_cpus_lock); | 358 | stop_cpus_in_progress = false; |
359 | preempt_enable(); | ||
345 | 360 | ||
346 | return queued; | 361 | return queued; |
347 | } | 362 | } |
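For reference, a hypothetical user of stop_one_cpu(), which is the path that the new cond_resched() call optimizes when the target is the local CPU:

#include <linux/stop_machine.h>

/* Runs in stopper-thread context with preemption disabled on @cpu. */
static int demo_stop_fn(void *arg)
{
	return 0;
}

static int demo(unsigned int cpu)
{
	/* If @cpu is the current CPU, the cond_resched() added above lets
	 * the stopper preempt us immediately instead of the caller sleeping
	 * in wait_for_completion() and being woken afterwards. */
	return stop_one_cpu(cpu, demo_stop_fn, NULL);
}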
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 2c5e3a8e00d7..635482e60ca3 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c | |||
@@ -250,3 +250,8 @@ cond_syscall(sys_execveat); | |||
250 | 250 | ||
251 | /* membarrier */ | 251 | /* membarrier */ |
252 | cond_syscall(sys_membarrier); | 252 | cond_syscall(sys_membarrier); |
253 | |||
254 | /* memory protection keys */ | ||
255 | cond_syscall(sys_pkey_mprotect); | ||
256 | cond_syscall(sys_pkey_alloc); | ||
257 | cond_syscall(sys_pkey_free); | ||
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index a13bbdaab47d..706309f9ed84 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -65,6 +65,7 @@ | |||
65 | #include <linux/sched/sysctl.h> | 65 | #include <linux/sched/sysctl.h> |
66 | #include <linux/kexec.h> | 66 | #include <linux/kexec.h> |
67 | #include <linux/bpf.h> | 67 | #include <linux/bpf.h> |
68 | #include <linux/mount.h> | ||
68 | 69 | ||
69 | #include <asm/uaccess.h> | 70 | #include <asm/uaccess.h> |
70 | #include <asm/processor.h> | 71 | #include <asm/processor.h> |
@@ -106,9 +107,8 @@ extern unsigned int core_pipe_limit; | |||
106 | extern int pid_max; | 107 | extern int pid_max; |
107 | extern int pid_max_min, pid_max_max; | 108 | extern int pid_max_min, pid_max_max; |
108 | extern int percpu_pagelist_fraction; | 109 | extern int percpu_pagelist_fraction; |
109 | extern int compat_log; | ||
110 | extern int latencytop_enabled; | 110 | extern int latencytop_enabled; |
111 | extern int sysctl_nr_open_min, sysctl_nr_open_max; | 111 | extern unsigned int sysctl_nr_open_min, sysctl_nr_open_max; |
112 | #ifndef CONFIG_MMU | 112 | #ifndef CONFIG_MMU |
113 | extern int sysctl_nr_trim_pages; | 113 | extern int sysctl_nr_trim_pages; |
114 | #endif | 114 | #endif |
@@ -1084,15 +1084,6 @@ static struct ctl_table kern_table[] = { | |||
1084 | .extra1 = &neg_one, | 1084 | .extra1 = &neg_one, |
1085 | }, | 1085 | }, |
1086 | #endif | 1086 | #endif |
1087 | #ifdef CONFIG_COMPAT | ||
1088 | { | ||
1089 | .procname = "compat-log", | ||
1090 | .data = &compat_log, | ||
1091 | .maxlen = sizeof (int), | ||
1092 | .mode = 0644, | ||
1093 | .proc_handler = proc_dointvec, | ||
1094 | }, | ||
1095 | #endif | ||
1096 | #ifdef CONFIG_RT_MUTEXES | 1087 | #ifdef CONFIG_RT_MUTEXES |
1097 | { | 1088 | { |
1098 | .procname = "max_lock_depth", | 1089 | .procname = "max_lock_depth", |
@@ -1692,7 +1683,7 @@ static struct ctl_table fs_table[] = { | |||
1692 | { | 1683 | { |
1693 | .procname = "nr_open", | 1684 | .procname = "nr_open", |
1694 | .data = &sysctl_nr_open, | 1685 | .data = &sysctl_nr_open, |
1695 | .maxlen = sizeof(int), | 1686 | .maxlen = sizeof(unsigned int), |
1696 | .mode = 0644, | 1687 | .mode = 0644, |
1697 | .proc_handler = proc_dointvec_minmax, | 1688 | .proc_handler = proc_dointvec_minmax, |
1698 | .extra1 = &sysctl_nr_open_min, | 1689 | .extra1 = &sysctl_nr_open_min, |
@@ -1838,6 +1829,14 @@ static struct ctl_table fs_table[] = { | |||
1838 | .mode = 0644, | 1829 | .mode = 0644, |
1839 | .proc_handler = proc_doulongvec_minmax, | 1830 | .proc_handler = proc_doulongvec_minmax, |
1840 | }, | 1831 | }, |
1832 | { | ||
1833 | .procname = "mount-max", | ||
1834 | .data = &sysctl_mount_max, | ||
1835 | .maxlen = sizeof(unsigned int), | ||
1836 | .mode = 0644, | ||
1837 | .proc_handler = proc_dointvec_minmax, | ||
1838 | .extra1 = &one, | ||
1839 | }, | ||
1841 | { } | 1840 | { } |
1842 | }; | 1841 | }; |
1843 | 1842 | ||
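The new fs.mount-max sysctl bounds the number of mounts per mount namespace. A small userspace sketch that reads the current limit through procfs (the path assumes the standard /proc/sys mapping of the fs table):

#include <stdio.h>

int main(void)
{
	unsigned int max;
	FILE *f = fopen("/proc/sys/fs/mount-max", "r");

	if (!f) {
		perror("fs.mount-max");
		return 1;
	}
	if (fscanf(f, "%u", &max) == 1)
		printf("mount-max: %u\n", max);
	fclose(f);
	return 0;
}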
diff --git a/kernel/taskstats.c b/kernel/taskstats.c index b3f05ee20d18..cbb387a265db 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c | |||
@@ -54,7 +54,11 @@ static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1 | |||
54 | [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, | 54 | [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, |
55 | [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; | 55 | [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; |
56 | 56 | ||
57 | static const struct nla_policy cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] = { | 57 | /* |
58 | * We have to use TASKSTATS_CMD_ATTR_MAX here; it is the maxattr in the family. | ||
59 | * Make sure they are always aligned. | ||
60 | */ | ||
61 | static const struct nla_policy cgroupstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = { | ||
58 | [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 }, | 62 | [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 }, |
59 | }; | 63 | }; |
60 | 64 | ||
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index c3aad685bbc0..12dd190634ab 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c | |||
@@ -542,7 +542,6 @@ static int alarm_clock_get(clockid_t which_clock, struct timespec *tp) | |||
542 | static int alarm_timer_create(struct k_itimer *new_timer) | 542 | static int alarm_timer_create(struct k_itimer *new_timer) |
543 | { | 543 | { |
544 | enum alarmtimer_type type; | 544 | enum alarmtimer_type type; |
545 | struct alarm_base *base; | ||
546 | 545 | ||
547 | if (!alarmtimer_get_rtcdev()) | 546 | if (!alarmtimer_get_rtcdev()) |
548 | return -ENOTSUPP; | 547 | return -ENOTSUPP; |
@@ -551,7 +550,6 @@ static int alarm_timer_create(struct k_itimer *new_timer) | |||
551 | return -EPERM; | 550 | return -EPERM; |
552 | 551 | ||
553 | type = clock2alarm(new_timer->it_clock); | 552 | type = clock2alarm(new_timer->it_clock); |
554 | base = &alarm_bases[type]; | ||
555 | alarm_init(&new_timer->it.alarm.alarmtimer, type, alarm_handle_timer); | 553 | alarm_init(&new_timer->it.alarm.alarmtimer, type, alarm_handle_timer); |
556 | return 0; | 554 | return 0; |
557 | } | 555 | } |
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 6a5a310a1a53..7e4fad75acaa 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c | |||
@@ -600,9 +600,18 @@ static void __clocksource_select(bool skipcur) | |||
600 | */ | 600 | */ |
601 | if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && oneshot) { | 601 | if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && oneshot) { |
602 | /* Override clocksource cannot be used. */ | 602 | /* Override clocksource cannot be used. */ |
603 | pr_warn("Override clocksource %s is not HRT compatible - cannot switch while in HRT/NOHZ mode\n", | 603 | if (cs->flags & CLOCK_SOURCE_UNSTABLE) { |
604 | cs->name); | 604 | pr_warn("Override clocksource %s is unstable and not HRT compatible - cannot switch while in HRT/NOHZ mode\n", |
605 | override_name[0] = 0; | 605 | cs->name); |
606 | override_name[0] = 0; | ||
607 | } else { | ||
608 | /* | ||
609 | * The override cannot be currently verified. | ||
610 | * Deferring to let the watchdog check. | ||
611 | */ | ||
612 | pr_info("Override clocksource %s is not currently HRT compatible - deferring\n", | ||
613 | cs->name); | ||
614 | } | ||
606 | } else | 615 | } else |
607 | /* Override clocksource can be used. */ | 616 | /* Override clocksource can be used. */ |
608 | best = cs; | 617 | best = cs; |
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 9ba7c820fc23..bb5ec425dfe0 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c | |||
@@ -307,7 +307,7 @@ EXPORT_SYMBOL_GPL(__ktime_divns); | |||
307 | */ | 307 | */ |
308 | ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs) | 308 | ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs) |
309 | { | 309 | { |
310 | ktime_t res = ktime_add(lhs, rhs); | 310 | ktime_t res = ktime_add_unsafe(lhs, rhs); |
311 | 311 | ||
312 | /* | 312 | /* |
313 | * We use KTIME_SEC_MAX here, the maximum timeout which we can | 313 | * We use KTIME_SEC_MAX here, the maximum timeout which we can |
@@ -703,7 +703,7 @@ static void clock_was_set_work(struct work_struct *work) | |||
703 | static DECLARE_WORK(hrtimer_work, clock_was_set_work); | 703 | static DECLARE_WORK(hrtimer_work, clock_was_set_work); |
704 | 704 | ||
705 | /* | 705 | /* |
706 | * Called from timekeeping and resume code to reprogramm the hrtimer | 706 | * Called from timekeeping and resume code to reprogram the hrtimer |
707 | * interrupt device on all cpus. | 707 | * interrupt device on all cpus. |
708 | */ | 708 | */ |
709 | void clock_was_set_delayed(void) | 709 | void clock_was_set_delayed(void) |
@@ -1241,7 +1241,7 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, | |||
1241 | 1241 | ||
1242 | /* | 1242 | /* |
1243 | * Note: We clear the running state after enqueue_hrtimer and | 1243 | * Note: We clear the running state after enqueue_hrtimer and |
1244 | * we do not reprogramm the event hardware. Happens either in | 1244 | * we do not reprogram the event hardware. Happens either in |
1245 | * hrtimer_start_range_ns() or in hrtimer_interrupt() | 1245 | * hrtimer_start_range_ns() or in hrtimer_interrupt() |
1246 | * | 1246 | * |
1247 | * Note: Because we dropped the cpu_base->lock above, | 1247 | * Note: Because we dropped the cpu_base->lock above, |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 2ec7c00228f3..3bcb61b52f6c 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -186,10 +186,13 @@ static bool check_tick_dependency(atomic_t *dep) | |||
186 | return false; | 186 | return false; |
187 | } | 187 | } |
188 | 188 | ||
189 | static bool can_stop_full_tick(struct tick_sched *ts) | 189 | static bool can_stop_full_tick(int cpu, struct tick_sched *ts) |
190 | { | 190 | { |
191 | WARN_ON_ONCE(!irqs_disabled()); | 191 | WARN_ON_ONCE(!irqs_disabled()); |
192 | 192 | ||
193 | if (unlikely(!cpu_online(cpu))) | ||
194 | return false; | ||
195 | |||
193 | if (check_tick_dependency(&tick_dep_mask)) | 196 | if (check_tick_dependency(&tick_dep_mask)) |
194 | return false; | 197 | return false; |
195 | 198 | ||
@@ -843,7 +846,7 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts) | |||
843 | if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE) | 846 | if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE) |
844 | return; | 847 | return; |
845 | 848 | ||
846 | if (can_stop_full_tick(ts)) | 849 | if (can_stop_full_tick(cpu, ts)) |
847 | tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); | 850 | tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); |
848 | else if (ts->tick_stopped) | 851 | else if (ts->tick_stopped) |
849 | tick_nohz_restart_sched_tick(ts, ktime_get()); | 852 | tick_nohz_restart_sched_tick(ts, ktime_get()); |
diff --git a/kernel/time/time.c b/kernel/time/time.c index 667b9335f5d6..bd62fb8e8e77 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c | |||
@@ -780,7 +780,7 @@ struct timespec64 timespec64_add_safe(const struct timespec64 lhs, | |||
780 | { | 780 | { |
781 | struct timespec64 res; | 781 | struct timespec64 res; |
782 | 782 | ||
783 | set_normalized_timespec64(&res, lhs.tv_sec + rhs.tv_sec, | 783 | set_normalized_timespec64(&res, (timeu64_t) lhs.tv_sec + rhs.tv_sec, |
784 | lhs.tv_nsec + rhs.tv_nsec); | 784 | lhs.tv_nsec + rhs.tv_nsec); |
785 | 785 | ||
786 | if (unlikely(res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec)) { | 786 | if (unlikely(res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec)) { |
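The (timeu64_t) cast makes the tv_sec addition wrap in unsigned arithmetic, so the following "res < lhs || res < rhs" test detects overflow without relying on signed-overflow undefined behaviour. A standalone sketch of the same saturating-add pattern; names are illustrative only:

#include <stdint.h>
#include <stdio.h>

/* Saturating add of two non-negative second counts: do the addition in
 * unsigned arithmetic (well-defined wraparound), then detect the wrap.
 * Converting the wrapped sum back to int64_t is implementation-defined
 * but behaves as two's complement on the usual compilers. */
static int64_t add_seconds_safe(int64_t lhs, int64_t rhs)
{
	int64_t res = (uint64_t)lhs + (uint64_t)rhs;

	if (res < lhs || res < rhs)
		res = INT64_MAX;	/* clamp instead of overflowing */
	return res;
}

int main(void)
{
	printf("%lld\n", (long long)add_seconds_safe(INT64_MAX - 1, 5));
	return 0;
}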
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index e07fb093f819..37dec7e3db43 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -403,8 +403,11 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) | |||
403 | tkr = tkf->base + (seq & 0x01); | 403 | tkr = tkf->base + (seq & 0x01); |
404 | now = ktime_to_ns(tkr->base); | 404 | now = ktime_to_ns(tkr->base); |
405 | 405 | ||
406 | now += clocksource_delta(tkr->read(tkr->clock), | 406 | now += timekeeping_delta_to_ns(tkr, |
407 | tkr->cycle_last, tkr->mask); | 407 | clocksource_delta( |
408 | tkr->read(tkr->clock), | ||
409 | tkr->cycle_last, | ||
410 | tkr->mask)); | ||
408 | } while (read_seqcount_retry(&tkf->seq, seq)); | 411 | } while (read_seqcount_retry(&tkf->seq, seq)); |
409 | 412 | ||
410 | return now; | 413 | return now; |
diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c index 107310a6f36f..ca9fb800336b 100644 --- a/kernel/time/timekeeping_debug.c +++ b/kernel/time/timekeeping_debug.c | |||
@@ -75,5 +75,7 @@ void tk_debug_account_sleep_time(struct timespec64 *t) | |||
75 | int bin = min(fls(t->tv_sec), NUM_BINS-1); | 75 | int bin = min(fls(t->tv_sec), NUM_BINS-1); |
76 | 76 | ||
77 | sleep_time_bin[bin]++; | 77 | sleep_time_bin[bin]++; |
78 | pr_info("Suspended for %lld.%03lu seconds\n", (s64)t->tv_sec, | ||
79 | t->tv_nsec / NSEC_PER_MSEC); | ||
78 | } | 80 | } |
79 | 81 | ||
diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 32bf6f75a8fe..c611c47de884 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c | |||
@@ -878,7 +878,7 @@ static inline struct timer_base *get_timer_base(u32 tflags) | |||
878 | 878 | ||
879 | #ifdef CONFIG_NO_HZ_COMMON | 879 | #ifdef CONFIG_NO_HZ_COMMON |
880 | static inline struct timer_base * | 880 | static inline struct timer_base * |
881 | __get_target_base(struct timer_base *base, unsigned tflags) | 881 | get_target_base(struct timer_base *base, unsigned tflags) |
882 | { | 882 | { |
883 | #ifdef CONFIG_SMP | 883 | #ifdef CONFIG_SMP |
884 | if ((tflags & TIMER_PINNED) || !base->migration_enabled) | 884 | if ((tflags & TIMER_PINNED) || !base->migration_enabled) |
@@ -891,25 +891,27 @@ __get_target_base(struct timer_base *base, unsigned tflags) | |||
891 | 891 | ||
892 | static inline void forward_timer_base(struct timer_base *base) | 892 | static inline void forward_timer_base(struct timer_base *base) |
893 | { | 893 | { |
894 | unsigned long jnow = READ_ONCE(jiffies); | ||
895 | |||
894 | /* | 896 | /* |
895 | * We only forward the base when it's idle and we have a delta between | 897 | * We only forward the base when it's idle and we have a delta between |
896 | * base clock and jiffies. | 898 | * base clock and jiffies. |
897 | */ | 899 | */ |
898 | if (!base->is_idle || (long) (jiffies - base->clk) < 2) | 900 | if (!base->is_idle || (long) (jnow - base->clk) < 2) |
899 | return; | 901 | return; |
900 | 902 | ||
901 | /* | 903 | /* |
902 | * If the next expiry value is > jiffies, then we fast forward to | 904 | * If the next expiry value is > jiffies, then we fast forward to |
903 | * jiffies otherwise we forward to the next expiry value. | 905 | * jiffies otherwise we forward to the next expiry value. |
904 | */ | 906 | */ |
905 | if (time_after(base->next_expiry, jiffies)) | 907 | if (time_after(base->next_expiry, jnow)) |
906 | base->clk = jiffies; | 908 | base->clk = jnow; |
907 | else | 909 | else |
908 | base->clk = base->next_expiry; | 910 | base->clk = base->next_expiry; |
909 | } | 911 | } |
910 | #else | 912 | #else |
911 | static inline struct timer_base * | 913 | static inline struct timer_base * |
912 | __get_target_base(struct timer_base *base, unsigned tflags) | 914 | get_target_base(struct timer_base *base, unsigned tflags) |
913 | { | 915 | { |
914 | return get_timer_this_cpu_base(tflags); | 916 | return get_timer_this_cpu_base(tflags); |
915 | } | 917 | } |
@@ -917,14 +919,6 @@ __get_target_base(struct timer_base *base, unsigned tflags) | |||
917 | static inline void forward_timer_base(struct timer_base *base) { } | 919 | static inline void forward_timer_base(struct timer_base *base) { } |
918 | #endif | 920 | #endif |
919 | 921 | ||
920 | static inline struct timer_base * | ||
921 | get_target_base(struct timer_base *base, unsigned tflags) | ||
922 | { | ||
923 | struct timer_base *target = __get_target_base(base, tflags); | ||
924 | |||
925 | forward_timer_base(target); | ||
926 | return target; | ||
927 | } | ||
928 | 922 | ||
929 | /* | 923 | /* |
930 | * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means | 924 | * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means |
@@ -943,7 +937,14 @@ static struct timer_base *lock_timer_base(struct timer_list *timer, | |||
943 | { | 937 | { |
944 | for (;;) { | 938 | for (;;) { |
945 | struct timer_base *base; | 939 | struct timer_base *base; |
946 | u32 tf = timer->flags; | 940 | u32 tf; |
941 | |||
942 | /* | ||
943 | * We need to use READ_ONCE() here, otherwise the compiler | ||
944 | * might re-read @tf between the check for TIMER_MIGRATING | ||
945 | * and spin_lock(). | ||
946 | */ | ||
947 | tf = READ_ONCE(timer->flags); | ||
947 | 948 | ||
948 | if (!(tf & TIMER_MIGRATING)) { | 949 | if (!(tf & TIMER_MIGRATING)) { |
949 | base = get_timer_base(tf); | 950 | base = get_timer_base(tf); |
@@ -964,6 +965,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) | |||
964 | unsigned long clk = 0, flags; | 965 | unsigned long clk = 0, flags; |
965 | int ret = 0; | 966 | int ret = 0; |
966 | 967 | ||
968 | BUG_ON(!timer->function); | ||
969 | |||
967 | /* | 970 | /* |
968 | * This is a common optimization triggered by the networking code - if | 971 | * This is a common optimization triggered by the networking code - if |
969 | * the timer is re-modified to have the same timeout or ends up in the | 972 | * the timer is re-modified to have the same timeout or ends up in the |
@@ -972,13 +975,16 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) | |||
972 | if (timer_pending(timer)) { | 975 | if (timer_pending(timer)) { |
973 | if (timer->expires == expires) | 976 | if (timer->expires == expires) |
974 | return 1; | 977 | return 1; |
978 | |||
975 | /* | 979 | /* |
976 | * Take the current timer_jiffies of base, but without holding | 980 | * We lock timer base and calculate the bucket index right |
977 | * the lock! | 981 | * here. If the timer ends up in the same bucket, then we |
982 | * just update the expiry time and avoid the whole | ||
983 | * dequeue/enqueue dance. | ||
978 | */ | 984 | */ |
979 | base = get_timer_base(timer->flags); | 985 | base = lock_timer_base(timer, &flags); |
980 | clk = base->clk; | ||
981 | 986 | ||
987 | clk = base->clk; | ||
982 | idx = calc_wheel_index(expires, clk); | 988 | idx = calc_wheel_index(expires, clk); |
983 | 989 | ||
984 | /* | 990 | /* |
@@ -988,14 +994,14 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) | |||
988 | */ | 994 | */ |
989 | if (idx == timer_get_idx(timer)) { | 995 | if (idx == timer_get_idx(timer)) { |
990 | timer->expires = expires; | 996 | timer->expires = expires; |
991 | return 1; | 997 | ret = 1; |
998 | goto out_unlock; | ||
992 | } | 999 | } |
1000 | } else { | ||
1001 | base = lock_timer_base(timer, &flags); | ||
993 | } | 1002 | } |
994 | 1003 | ||
995 | timer_stats_timer_set_start_info(timer); | 1004 | timer_stats_timer_set_start_info(timer); |
996 | BUG_ON(!timer->function); | ||
997 | |||
998 | base = lock_timer_base(timer, &flags); | ||
999 | 1005 | ||
1000 | ret = detach_if_pending(timer, base, false); | 1006 | ret = detach_if_pending(timer, base, false); |
1001 | if (!ret && pending_only) | 1007 | if (!ret && pending_only) |
@@ -1025,12 +1031,16 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) | |||
1025 | } | 1031 | } |
1026 | } | 1032 | } |
1027 | 1033 | ||
1034 | /* Try to forward a stale timer base clock */ | ||
1035 | forward_timer_base(base); | ||
1036 | |||
1028 | timer->expires = expires; | 1037 | timer->expires = expires; |
1029 | /* | 1038 | /* |
1030 | * If 'idx' was calculated above and the base time did not advance | 1039 | * If 'idx' was calculated above and the base time did not advance |
1031 | * between calculating 'idx' and taking the lock, only enqueue_timer() | 1040 | * between calculating 'idx' and possibly switching the base, only |
1032 | * and trigger_dyntick_cpu() is required. Otherwise we need to | 1041 | * enqueue_timer() and trigger_dyntick_cpu() is required. Otherwise |
1033 | * (re)calculate the wheel index via internal_add_timer(). | 1042 | * we need to (re)calculate the wheel index via |
1043 | * internal_add_timer(). | ||
1034 | */ | 1044 | */ |
1035 | if (idx != UINT_MAX && clk == base->clk) { | 1045 | if (idx != UINT_MAX && clk == base->clk) { |
1036 | enqueue_timer(base, timer, idx); | 1046 | enqueue_timer(base, timer, idx); |
@@ -1510,12 +1520,16 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) | |||
1510 | is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA); | 1520 | is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA); |
1511 | base->next_expiry = nextevt; | 1521 | base->next_expiry = nextevt; |
1512 | /* | 1522 | /* |
1513 | * We have a fresh next event. Check whether we can forward the base: | 1523 | * We have a fresh next event. Check whether we can forward the |
1524 | * base. We can only do that when @basej is past base->clk | ||
1525 | * otherwise we might rewind base->clk. | ||
1514 | */ | 1526 | */ |
1515 | if (time_after(nextevt, jiffies)) | 1527 | if (time_after(basej, base->clk)) { |
1516 | base->clk = jiffies; | 1528 | if (time_after(nextevt, basej)) |
1517 | else if (time_after(nextevt, base->clk)) | 1529 | base->clk = basej; |
1518 | base->clk = nextevt; | 1530 | else if (time_after(nextevt, base->clk)) |
1531 | base->clk = nextevt; | ||
1532 | } | ||
1519 | 1533 | ||
1520 | if (time_before_eq(nextevt, basej)) { | 1534 | if (time_before_eq(nextevt, basej)) { |
1521 | expires = basem; | 1535 | expires = basem; |
@@ -1633,7 +1647,7 @@ static inline void __run_timers(struct timer_base *base) | |||
1633 | /* | 1647 | /* |
1634 | * This function runs timers and the timer-tq in bottom half context. | 1648 | * This function runs timers and the timer-tq in bottom half context. |
1635 | */ | 1649 | */ |
1636 | static void run_timer_softirq(struct softirq_action *h) | 1650 | static __latent_entropy void run_timer_softirq(struct softirq_action *h) |
1637 | { | 1651 | { |
1638 | struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); | 1652 | struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); |
1639 | 1653 | ||
diff --git a/kernel/torture.c b/kernel/torture.c index 75961b3decfe..0d887eb62856 100644 --- a/kernel/torture.c +++ b/kernel/torture.c | |||
@@ -43,6 +43,7 @@ | |||
43 | #include <linux/stat.h> | 43 | #include <linux/stat.h> |
44 | #include <linux/slab.h> | 44 | #include <linux/slab.h> |
45 | #include <linux/trace_clock.h> | 45 | #include <linux/trace_clock.h> |
46 | #include <linux/ktime.h> | ||
46 | #include <asm/byteorder.h> | 47 | #include <asm/byteorder.h> |
47 | #include <linux/torture.h> | 48 | #include <linux/torture.h> |
48 | 49 | ||
@@ -446,9 +447,8 @@ EXPORT_SYMBOL_GPL(torture_shuffle_cleanup); | |||
446 | * Variables for auto-shutdown. This allows "lights out" torture runs | 447 | * Variables for auto-shutdown. This allows "lights out" torture runs |
447 | * to be fully scripted. | 448 | * to be fully scripted. |
448 | */ | 449 | */ |
449 | static int shutdown_secs; /* desired test duration in seconds. */ | ||
450 | static struct task_struct *shutdown_task; | 450 | static struct task_struct *shutdown_task; |
451 | static unsigned long shutdown_time; /* jiffies to system shutdown. */ | 451 | static ktime_t shutdown_time; /* time to system shutdown. */ |
452 | static void (*torture_shutdown_hook)(void); | 452 | static void (*torture_shutdown_hook)(void); |
453 | 453 | ||
454 | /* | 454 | /* |
@@ -471,20 +471,20 @@ EXPORT_SYMBOL_GPL(torture_shutdown_absorb); | |||
471 | */ | 471 | */ |
472 | static int torture_shutdown(void *arg) | 472 | static int torture_shutdown(void *arg) |
473 | { | 473 | { |
474 | long delta; | 474 | ktime_t ktime_snap; |
475 | unsigned long jiffies_snap; | ||
476 | 475 | ||
477 | VERBOSE_TOROUT_STRING("torture_shutdown task started"); | 476 | VERBOSE_TOROUT_STRING("torture_shutdown task started"); |
478 | jiffies_snap = jiffies; | 477 | ktime_snap = ktime_get(); |
479 | while (ULONG_CMP_LT(jiffies_snap, shutdown_time) && | 478 | while (ktime_before(ktime_snap, shutdown_time) && |
480 | !torture_must_stop()) { | 479 | !torture_must_stop()) { |
481 | delta = shutdown_time - jiffies_snap; | ||
482 | if (verbose) | 480 | if (verbose) |
483 | pr_alert("%s" TORTURE_FLAG | 481 | pr_alert("%s" TORTURE_FLAG |
484 | "torture_shutdown task: %lu jiffies remaining\n", | 482 | "torture_shutdown task: %llu ms remaining\n", |
485 | torture_type, delta); | 483 | torture_type, |
486 | schedule_timeout_interruptible(delta); | 484 | ktime_ms_delta(shutdown_time, ktime_snap)); |
487 | jiffies_snap = jiffies; | 485 | set_current_state(TASK_INTERRUPTIBLE); |
486 | schedule_hrtimeout(&shutdown_time, HRTIMER_MODE_ABS); | ||
487 | ktime_snap = ktime_get(); | ||
488 | } | 488 | } |
489 | if (torture_must_stop()) { | 489 | if (torture_must_stop()) { |
490 | torture_kthread_stopping("torture_shutdown"); | 490 | torture_kthread_stopping("torture_shutdown"); |
@@ -511,10 +511,9 @@ int torture_shutdown_init(int ssecs, void (*cleanup)(void)) | |||
511 | { | 511 | { |
512 | int ret = 0; | 512 | int ret = 0; |
513 | 513 | ||
514 | shutdown_secs = ssecs; | ||
515 | torture_shutdown_hook = cleanup; | 514 | torture_shutdown_hook = cleanup; |
516 | if (shutdown_secs > 0) { | 515 | if (ssecs > 0) { |
517 | shutdown_time = jiffies + shutdown_secs * HZ; | 516 | shutdown_time = ktime_add(ktime_get(), ktime_set(ssecs, 0)); |
518 | ret = torture_create_kthread(torture_shutdown, NULL, | 517 | ret = torture_create_kthread(torture_shutdown, NULL, |
519 | shutdown_task); | 518 | shutdown_task); |
520 | } | 519 | } |
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index f4b86e8ca1e7..2a96b063d659 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig | |||
@@ -24,11 +24,6 @@ config HAVE_FUNCTION_GRAPH_TRACER | |||
24 | help | 24 | help |
25 | See Documentation/trace/ftrace-design.txt | 25 | See Documentation/trace/ftrace-design.txt |
26 | 26 | ||
27 | config HAVE_FUNCTION_GRAPH_FP_TEST | ||
28 | bool | ||
29 | help | ||
30 | See Documentation/trace/ftrace-design.txt | ||
31 | |||
32 | config HAVE_DYNAMIC_FTRACE | 27 | config HAVE_DYNAMIC_FTRACE |
33 | bool | 28 | bool |
34 | help | 29 | help |
@@ -221,6 +216,41 @@ config SCHED_TRACER | |||
221 | This tracer tracks the latency of the highest priority task | 216 | This tracer tracks the latency of the highest priority task |
222 | to be scheduled in, starting from the point it has woken up. | 217 | to be scheduled in, starting from the point it has woken up. |
223 | 218 | ||
219 | config HWLAT_TRACER | ||
220 | bool "Tracer to detect hardware latencies (like SMIs)" | ||
221 | select GENERIC_TRACER | ||
222 | help | ||
223 | This tracer, when enabled, will create one or more kernel threads, | ||
224 | depending on what the cpumask file is set to, with each thread | ||
225 | spinning in a loop looking for interruptions caused by | ||
226 | something other than the kernel. For example, if a | ||
227 | System Management Interrupt (SMI) takes a noticeable amount of | ||
228 | time, this tracer will detect it. This is useful for testing | ||
229 | if a system is reliable for Real Time tasks. | ||
230 | |||
231 | Some files are created in the tracing directory when this | ||
232 | is enabled: | ||
233 | |||
234 | hwlat_detector/width - time in usecs to spin for | ||
235 | hwlat_detector/window - time in usecs between the start of each | ||
236 | iteration | ||
237 | |||
238 | A kernel thread is created that will spin with interrupts disabled | ||
239 | for "width" microseconds in every "widow" cycle. It will not spin | ||
240 | for "window - width" microseconds, where the system can | ||
241 | continue to operate. | ||
242 | |||
243 | The output will appear in the trace and trace_pipe files. | ||
244 | |||
245 | When the tracer is not running, it has no effect on the system, | ||
246 | but when it is running, it can cause the system to be | ||
247 | periodically non-responsive. Do not run this tracer on a | ||
248 | production system. | ||
249 | |||
250 | To enable this tracer, echo "hwlat" into the current_tracer | ||
251 | file. Every time a latency is greater than tracing_thresh, it will | ||
252 | be recorded into the ring buffer. | ||
253 | |||
224 | config ENABLE_DEFAULT_TRACERS | 254 | config ENABLE_DEFAULT_TRACERS |
225 | bool "Trace process context switches and events" | 255 | bool "Trace process context switches and events" |
226 | depends on !GENERIC_TRACER | 256 | depends on !GENERIC_TRACER |
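A userspace sketch of turning the new hwlat tracer on, following the knobs described in the help text; the tracefs mount point is assumed to be /sys/kernel/debug/tracing (it may also live at /sys/kernel/tracing), and the values are example settings, not defaults you must use:

#include <stdio.h>

static int write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	const char *t = "/sys/kernel/debug/tracing";
	char path[256];

	snprintf(path, sizeof(path), "%s/hwlat_detector/width", t);
	write_str(path, "500000");	/* spin 500 ms of every window */
	snprintf(path, sizeof(path), "%s/hwlat_detector/window", t);
	write_str(path, "1000000");	/* 1 s window */
	snprintf(path, sizeof(path), "%s/tracing_thresh", t);
	write_str(path, "10");		/* example threshold in usecs */
	snprintf(path, sizeof(path), "%s/current_tracer", t);
	return write_str(path, "hwlat") ? 1 : 0;
}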
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index d0a1617b52b4..e57980845549 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile | |||
@@ -1,8 +1,4 @@ | |||
1 | 1 | ||
2 | # We are fully aware of the dangers of __builtin_return_address() | ||
3 | FRAME_CFLAGS := $(call cc-disable-warning,frame-address) | ||
4 | KBUILD_CFLAGS += $(FRAME_CFLAGS) | ||
5 | |||
6 | # Do not instrument the tracer itself: | 2 | # Do not instrument the tracer itself: |
7 | 3 | ||
8 | ifdef CONFIG_FUNCTION_TRACER | 4 | ifdef CONFIG_FUNCTION_TRACER |
@@ -41,6 +37,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o | |||
41 | obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o | 37 | obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o |
42 | obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o | 38 | obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o |
43 | obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o | 39 | obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o |
40 | obj-$(CONFIG_HWLAT_TRACER) += trace_hwlat.o | ||
44 | obj-$(CONFIG_NOP_TRACER) += trace_nop.o | 41 | obj-$(CONFIG_NOP_TRACER) += trace_nop.o |
45 | obj-$(CONFIG_STACK_TRACER) += trace_stack.o | 42 | obj-$(CONFIG_STACK_TRACER) += trace_stack.o |
46 | obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o | 43 | obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o |
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index b20438fdb029..5dcb99281259 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c | |||
@@ -1,4 +1,5 @@ | |||
1 | /* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com | 1 | /* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com |
2 | * Copyright (c) 2016 Facebook | ||
2 | * | 3 | * |
3 | * This program is free software; you can redistribute it and/or | 4 | * This program is free software; you can redistribute it and/or |
4 | * modify it under the terms of version 2 of the GNU General Public | 5 | * modify it under the terms of version 2 of the GNU General Public |
@@ -8,6 +9,7 @@ | |||
8 | #include <linux/types.h> | 9 | #include <linux/types.h> |
9 | #include <linux/slab.h> | 10 | #include <linux/slab.h> |
10 | #include <linux/bpf.h> | 11 | #include <linux/bpf.h> |
12 | #include <linux/bpf_perf_event.h> | ||
11 | #include <linux/filter.h> | 13 | #include <linux/filter.h> |
12 | #include <linux/uaccess.h> | 14 | #include <linux/uaccess.h> |
13 | #include <linux/ctype.h> | 15 | #include <linux/ctype.h> |
@@ -59,11 +61,9 @@ unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx) | |||
59 | } | 61 | } |
60 | EXPORT_SYMBOL_GPL(trace_call_bpf); | 62 | EXPORT_SYMBOL_GPL(trace_call_bpf); |
61 | 63 | ||
62 | static u64 bpf_probe_read(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | 64 | BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr) |
63 | { | 65 | { |
64 | void *dst = (void *) (long) r1; | 66 | int ret; |
65 | int ret, size = (int) r2; | ||
66 | void *unsafe_ptr = (void *) (long) r3; | ||
67 | 67 | ||
68 | ret = probe_kernel_read(dst, unsafe_ptr, size); | 68 | ret = probe_kernel_read(dst, unsafe_ptr, size); |
69 | if (unlikely(ret < 0)) | 69 | if (unlikely(ret < 0)) |
@@ -81,12 +81,9 @@ static const struct bpf_func_proto bpf_probe_read_proto = { | |||
81 | .arg3_type = ARG_ANYTHING, | 81 | .arg3_type = ARG_ANYTHING, |
82 | }; | 82 | }; |
83 | 83 | ||
84 | static u64 bpf_probe_write_user(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | 84 | BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src, |
85 | u32, size) | ||
85 | { | 86 | { |
86 | void *unsafe_ptr = (void *) (long) r1; | ||
87 | void *src = (void *) (long) r2; | ||
88 | int size = (int) r3; | ||
89 | |||
90 | /* | 87 | /* |
91 | * Ensure we're in user context which is safe for the helper to | 88 | * Ensure we're in user context which is safe for the helper to |
92 | * run. This helper has no business in a kthread. | 89 | * run. This helper has no business in a kthread. |
@@ -128,9 +125,9 @@ static const struct bpf_func_proto *bpf_get_probe_write_proto(void) | |||
128 | * limited trace_printk() | 125 | * limited trace_printk() |
129 | * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed | 126 | * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed |
130 | */ | 127 | */ |
131 | static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) | 128 | BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, |
129 | u64, arg2, u64, arg3) | ||
132 | { | 130 | { |
133 | char *fmt = (char *) (long) r1; | ||
134 | bool str_seen = false; | 131 | bool str_seen = false; |
135 | int mod[3] = {}; | 132 | int mod[3] = {}; |
136 | int fmt_cnt = 0; | 133 | int fmt_cnt = 0; |
@@ -176,16 +173,16 @@ static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) | |||
176 | 173 | ||
177 | switch (fmt_cnt) { | 174 | switch (fmt_cnt) { |
178 | case 1: | 175 | case 1: |
179 | unsafe_addr = r3; | 176 | unsafe_addr = arg1; |
180 | r3 = (long) buf; | 177 | arg1 = (long) buf; |
181 | break; | 178 | break; |
182 | case 2: | 179 | case 2: |
183 | unsafe_addr = r4; | 180 | unsafe_addr = arg2; |
184 | r4 = (long) buf; | 181 | arg2 = (long) buf; |
185 | break; | 182 | break; |
186 | case 3: | 183 | case 3: |
187 | unsafe_addr = r5; | 184 | unsafe_addr = arg3; |
188 | r5 = (long) buf; | 185 | arg3 = (long) buf; |
189 | break; | 186 | break; |
190 | } | 187 | } |
191 | buf[0] = 0; | 188 | buf[0] = 0; |
@@ -207,9 +204,9 @@ static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) | |||
207 | } | 204 | } |
208 | 205 | ||
209 | return __trace_printk(1/* fake ip will not be printed */, fmt, | 206 | return __trace_printk(1/* fake ip will not be printed */, fmt, |
210 | mod[0] == 2 ? r3 : mod[0] == 1 ? (long) r3 : (u32) r3, | 207 | mod[0] == 2 ? arg1 : mod[0] == 1 ? (long) arg1 : (u32) arg1, |
211 | mod[1] == 2 ? r4 : mod[1] == 1 ? (long) r4 : (u32) r4, | 208 | mod[1] == 2 ? arg2 : mod[1] == 1 ? (long) arg2 : (u32) arg2, |
212 | mod[2] == 2 ? r5 : mod[2] == 1 ? (long) r5 : (u32) r5); | 209 | mod[2] == 2 ? arg3 : mod[2] == 1 ? (long) arg3 : (u32) arg3); |
213 | } | 210 | } |
214 | 211 | ||
215 | static const struct bpf_func_proto bpf_trace_printk_proto = { | 212 | static const struct bpf_func_proto bpf_trace_printk_proto = { |
@@ -231,9 +228,8 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void) | |||
231 | return &bpf_trace_printk_proto; | 228 | return &bpf_trace_printk_proto; |
232 | } | 229 | } |
233 | 230 | ||
234 | static u64 bpf_perf_event_read(u64 r1, u64 flags, u64 r3, u64 r4, u64 r5) | 231 | BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) |
235 | { | 232 | { |
236 | struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; | ||
237 | struct bpf_array *array = container_of(map, struct bpf_array, map); | 233 | struct bpf_array *array = container_of(map, struct bpf_array, map); |
238 | unsigned int cpu = smp_processor_id(); | 234 | unsigned int cpu = smp_processor_id(); |
239 | u64 index = flags & BPF_F_INDEX_MASK; | 235 | u64 index = flags & BPF_F_INDEX_MASK; |
@@ -310,11 +306,9 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, | |||
310 | return 0; | 306 | return 0; |
311 | } | 307 | } |
312 | 308 | ||
313 | static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) | 309 | BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, |
310 | u64, flags, void *, data, u64, size) | ||
314 | { | 311 | { |
315 | struct pt_regs *regs = (struct pt_regs *)(long) r1; | ||
316 | struct bpf_map *map = (struct bpf_map *)(long) r2; | ||
317 | void *data = (void *)(long) r4; | ||
318 | struct perf_raw_record raw = { | 312 | struct perf_raw_record raw = { |
319 | .frag = { | 313 | .frag = { |
320 | .size = size, | 314 | .size = size, |
@@ -365,7 +359,7 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, | |||
365 | return __bpf_perf_event_output(regs, map, flags, &raw); | 359 | return __bpf_perf_event_output(regs, map, flags, &raw); |
366 | } | 360 | } |
367 | 361 | ||
368 | static u64 bpf_get_current_task(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | 362 | BPF_CALL_0(bpf_get_current_task) |
369 | { | 363 | { |
370 | return (long) current; | 364 | return (long) current; |
371 | } | 365 | } |
@@ -376,6 +370,31 @@ static const struct bpf_func_proto bpf_get_current_task_proto = { | |||
376 | .ret_type = RET_INTEGER, | 370 | .ret_type = RET_INTEGER, |
377 | }; | 371 | }; |
378 | 372 | ||
373 | BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx) | ||
374 | { | ||
375 | struct bpf_array *array = container_of(map, struct bpf_array, map); | ||
376 | struct cgroup *cgrp; | ||
377 | |||
378 | if (unlikely(in_interrupt())) | ||
379 | return -EINVAL; | ||
380 | if (unlikely(idx >= array->map.max_entries)) | ||
381 | return -E2BIG; | ||
382 | |||
383 | cgrp = READ_ONCE(array->ptrs[idx]); | ||
384 | if (unlikely(!cgrp)) | ||
385 | return -EAGAIN; | ||
386 | |||
387 | return task_under_cgroup_hierarchy(current, cgrp); | ||
388 | } | ||
389 | |||
390 | static const struct bpf_func_proto bpf_current_task_under_cgroup_proto = { | ||
391 | .func = bpf_current_task_under_cgroup, | ||
392 | .gpl_only = false, | ||
393 | .ret_type = RET_INTEGER, | ||
394 | .arg1_type = ARG_CONST_MAP_PTR, | ||
395 | .arg2_type = ARG_ANYTHING, | ||
396 | }; | ||
397 | |||
379 | static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) | 398 | static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) |
380 | { | 399 | { |
381 | switch (func_id) { | 400 | switch (func_id) { |
@@ -407,6 +426,10 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) | |||
407 | return &bpf_perf_event_read_proto; | 426 | return &bpf_perf_event_read_proto; |
408 | case BPF_FUNC_probe_write_user: | 427 | case BPF_FUNC_probe_write_user: |
409 | return bpf_get_probe_write_proto(); | 428 | return bpf_get_probe_write_proto(); |
429 | case BPF_FUNC_current_task_under_cgroup: | ||
430 | return &bpf_current_task_under_cgroup_proto; | ||
431 | case BPF_FUNC_get_prandom_u32: | ||
432 | return &bpf_get_prandom_u32_proto; | ||
410 | default: | 433 | default: |
411 | return NULL; | 434 | return NULL; |
412 | } | 435 | } |
@@ -447,16 +470,17 @@ static struct bpf_prog_type_list kprobe_tl = { | |||
447 | .type = BPF_PROG_TYPE_KPROBE, | 470 | .type = BPF_PROG_TYPE_KPROBE, |
448 | }; | 471 | }; |
449 | 472 | ||
450 | static u64 bpf_perf_event_output_tp(u64 r1, u64 r2, u64 index, u64 r4, u64 size) | 473 | BPF_CALL_5(bpf_perf_event_output_tp, void *, tp_buff, struct bpf_map *, map, |
474 | u64, flags, void *, data, u64, size) | ||
451 | { | 475 | { |
476 | struct pt_regs *regs = *(struct pt_regs **)tp_buff; | ||
477 | |||
452 | /* | 478 | /* |
453 | * r1 points to perf tracepoint buffer where first 8 bytes are hidden | 479 | * r1 points to perf tracepoint buffer where first 8 bytes are hidden |
454 | * from bpf program and contain a pointer to 'struct pt_regs'. Fetch it | 480 | * from bpf program and contain a pointer to 'struct pt_regs'. Fetch it |
455 | * from there and call the same bpf_perf_event_output() helper | 481 | * from there and call the same bpf_perf_event_output() helper inline. |
456 | */ | 482 | */ |
457 | u64 ctx = *(long *)(uintptr_t)r1; | 483 | return ____bpf_perf_event_output(regs, map, flags, data, size); |
458 | |||
459 | return bpf_perf_event_output(ctx, r2, index, r4, size); | ||
460 | } | 484 | } |
461 | 485 | ||
462 | static const struct bpf_func_proto bpf_perf_event_output_proto_tp = { | 486 | static const struct bpf_func_proto bpf_perf_event_output_proto_tp = { |
@@ -470,11 +494,18 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_tp = { | |||
470 | .arg5_type = ARG_CONST_STACK_SIZE, | 494 | .arg5_type = ARG_CONST_STACK_SIZE, |
471 | }; | 495 | }; |
472 | 496 | ||
473 | static u64 bpf_get_stackid_tp(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | 497 | BPF_CALL_3(bpf_get_stackid_tp, void *, tp_buff, struct bpf_map *, map, |
498 | u64, flags) | ||
474 | { | 499 | { |
475 | u64 ctx = *(long *)(uintptr_t)r1; | 500 | struct pt_regs *regs = *(struct pt_regs **)tp_buff; |
476 | 501 | ||
477 | return bpf_get_stackid(ctx, r2, r3, r4, r5); | 502 | /* |
503 | * Same comment as in bpf_perf_event_output_tp(), only that this time | ||
504 | * the other helper's function body cannot be inlined due to being | ||
505 | * external, thus we need to call the raw helper function. | ||
506 | */ | ||
507 | return bpf_get_stackid((unsigned long) regs, (unsigned long) map, | ||
508 | flags, 0, 0); | ||
478 | } | 509 | } |
479 | 510 | ||
480 | static const struct bpf_func_proto bpf_get_stackid_proto_tp = { | 511 | static const struct bpf_func_proto bpf_get_stackid_proto_tp = { |
@@ -520,10 +551,69 @@ static struct bpf_prog_type_list tracepoint_tl = { | |||
520 | .type = BPF_PROG_TYPE_TRACEPOINT, | 551 | .type = BPF_PROG_TYPE_TRACEPOINT, |
521 | }; | 552 | }; |
522 | 553 | ||
554 | static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, | ||
555 | enum bpf_reg_type *reg_type) | ||
556 | { | ||
557 | if (off < 0 || off >= sizeof(struct bpf_perf_event_data)) | ||
558 | return false; | ||
559 | if (type != BPF_READ) | ||
560 | return false; | ||
561 | if (off % size != 0) | ||
562 | return false; | ||
563 | if (off == offsetof(struct bpf_perf_event_data, sample_period)) { | ||
564 | if (size != sizeof(u64)) | ||
565 | return false; | ||
566 | } else { | ||
567 | if (size != sizeof(long)) | ||
568 | return false; | ||
569 | } | ||
570 | return true; | ||
571 | } | ||
572 | |||
573 | static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, int dst_reg, | ||
574 | int src_reg, int ctx_off, | ||
575 | struct bpf_insn *insn_buf, | ||
576 | struct bpf_prog *prog) | ||
577 | { | ||
578 | struct bpf_insn *insn = insn_buf; | ||
579 | |||
580 | switch (ctx_off) { | ||
581 | case offsetof(struct bpf_perf_event_data, sample_period): | ||
582 | BUILD_BUG_ON(FIELD_SIZEOF(struct perf_sample_data, period) != sizeof(u64)); | ||
583 | |||
584 | *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, | ||
585 | data), dst_reg, src_reg, | ||
586 | offsetof(struct bpf_perf_event_data_kern, data)); | ||
587 | *insn++ = BPF_LDX_MEM(BPF_DW, dst_reg, dst_reg, | ||
588 | offsetof(struct perf_sample_data, period)); | ||
589 | break; | ||
590 | default: | ||
591 | *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, | ||
592 | regs), dst_reg, src_reg, | ||
593 | offsetof(struct bpf_perf_event_data_kern, regs)); | ||
594 | *insn++ = BPF_LDX_MEM(BPF_SIZEOF(long), dst_reg, dst_reg, ctx_off); | ||
595 | break; | ||
596 | } | ||
597 | |||
598 | return insn - insn_buf; | ||
599 | } | ||
600 | |||
601 | static const struct bpf_verifier_ops perf_event_prog_ops = { | ||
602 | .get_func_proto = tp_prog_func_proto, | ||
603 | .is_valid_access = pe_prog_is_valid_access, | ||
604 | .convert_ctx_access = pe_prog_convert_ctx_access, | ||
605 | }; | ||
606 | |||
607 | static struct bpf_prog_type_list perf_event_tl = { | ||
608 | .ops = &perf_event_prog_ops, | ||
609 | .type = BPF_PROG_TYPE_PERF_EVENT, | ||
610 | }; | ||
611 | |||
523 | static int __init register_kprobe_prog_ops(void) | 612 | static int __init register_kprobe_prog_ops(void) |
524 | { | 613 | { |
525 | bpf_register_prog_type(&kprobe_tl); | 614 | bpf_register_prog_type(&kprobe_tl); |
526 | bpf_register_prog_type(&tracepoint_tl); | 615 | bpf_register_prog_type(&tracepoint_tl); |
616 | bpf_register_prog_type(&perf_event_tl); | ||
527 | return 0; | 617 | return 0; |
528 | } | 618 | } |
529 | late_initcall(register_kprobe_prog_ops); | 619 | late_initcall(register_kprobe_prog_ops); |
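The bpf_trace.c changes above convert the tracing helpers from the raw "u64 r1..r5" calling convention to the BPF_CALL_n() wrappers, hook up bpf_current_task_under_cgroup and bpf_get_prandom_u32 in the tracing proto table, and register a BPF_PROG_TYPE_PERF_EVENT program type. A minimal sketch of the before/after shape of one helper, assuming BPF_CALL_2() behaves as used in the diff; the helper name and do_something() body are hypothetical:

/* Before: raw u64 arguments, casts done by hand (hypothetical helper). */
static u64 bpf_example_old(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
{
	struct bpf_map *map = (struct bpf_map *)(unsigned long) r1;
	u32 key = (u32) r2;

	return do_something(map, key);		/* hypothetical body */
}

/* After: BPF_CALL_2() generates the u64 trampoline plus a typed
 * ____bpf_example() body, which other helpers can call directly,
 * as bpf_perf_event_output_tp() does with ____bpf_perf_event_output().
 */
BPF_CALL_2(bpf_example, struct bpf_map *, map, u32, key)
{
	return do_something(map, key);		/* hypothetical body */
}

static const struct bpf_func_proto bpf_example_proto = {
	.func		= bpf_example,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_CONST_MAP_PTR,
	.arg2_type	= ARG_ANYTHING,
};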
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 84752c8e28b5..da87b3cba5b3 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
@@ -872,7 +872,13 @@ function_profile_call(unsigned long ip, unsigned long parent_ip, | |||
872 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | 872 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER |
873 | static int profile_graph_entry(struct ftrace_graph_ent *trace) | 873 | static int profile_graph_entry(struct ftrace_graph_ent *trace) |
874 | { | 874 | { |
875 | int index = trace->depth; | ||
876 | |||
875 | function_profile_call(trace->func, 0, NULL, NULL); | 877 | function_profile_call(trace->func, 0, NULL, NULL); |
878 | |||
879 | if (index >= 0 && index < FTRACE_RETFUNC_DEPTH) | ||
880 | current->ret_stack[index].subtime = 0; | ||
881 | |||
876 | return 1; | 882 | return 1; |
877 | } | 883 | } |
878 | 884 | ||
@@ -1856,6 +1862,10 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops, | |||
1856 | 1862 | ||
1857 | /* Update rec->flags */ | 1863 | /* Update rec->flags */ |
1858 | do_for_each_ftrace_rec(pg, rec) { | 1864 | do_for_each_ftrace_rec(pg, rec) { |
1865 | |||
1866 | if (rec->flags & FTRACE_FL_DISABLED) | ||
1867 | continue; | ||
1868 | |||
1859 | /* We need to update only differences of filter_hash */ | 1869 | /* We need to update only differences of filter_hash */ |
1860 | in_old = !!ftrace_lookup_ip(old_hash, rec->ip); | 1870 | in_old = !!ftrace_lookup_ip(old_hash, rec->ip); |
1861 | in_new = !!ftrace_lookup_ip(new_hash, rec->ip); | 1871 | in_new = !!ftrace_lookup_ip(new_hash, rec->ip); |
@@ -1878,6 +1888,10 @@ rollback: | |||
1878 | 1888 | ||
1879 | /* Roll back what we did above */ | 1889 | /* Roll back what we did above */ |
1880 | do_for_each_ftrace_rec(pg, rec) { | 1890 | do_for_each_ftrace_rec(pg, rec) { |
1891 | |||
1892 | if (rec->flags & FTRACE_FL_DISABLED) | ||
1893 | continue; | ||
1894 | |||
1881 | if (rec == end) | 1895 | if (rec == end) |
1882 | goto err_out; | 1896 | goto err_out; |
1883 | 1897 | ||
@@ -2391,6 +2405,10 @@ void __weak ftrace_replace_code(int enable) | |||
2391 | return; | 2405 | return; |
2392 | 2406 | ||
2393 | do_for_each_ftrace_rec(pg, rec) { | 2407 | do_for_each_ftrace_rec(pg, rec) { |
2408 | |||
2409 | if (rec->flags & FTRACE_FL_DISABLED) | ||
2410 | continue; | ||
2411 | |||
2394 | failed = __ftrace_replace_code(rec, enable); | 2412 | failed = __ftrace_replace_code(rec, enable); |
2395 | if (failed) { | 2413 | if (failed) { |
2396 | ftrace_bug(failed, rec); | 2414 | ftrace_bug(failed, rec); |
@@ -2757,7 +2775,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) | |||
2757 | struct dyn_ftrace *rec; | 2775 | struct dyn_ftrace *rec; |
2758 | 2776 | ||
2759 | do_for_each_ftrace_rec(pg, rec) { | 2777 | do_for_each_ftrace_rec(pg, rec) { |
2760 | if (FTRACE_WARN_ON_ONCE(rec->flags)) | 2778 | if (FTRACE_WARN_ON_ONCE(rec->flags & ~FTRACE_FL_DISABLED)) |
2761 | pr_warn(" %pS flags:%lx\n", | 2779 | pr_warn(" %pS flags:%lx\n", |
2762 | (void *)rec->ip, rec->flags); | 2780 | (void *)rec->ip, rec->flags); |
2763 | } while_for_each_ftrace_rec(); | 2781 | } while_for_each_ftrace_rec(); |
@@ -3592,6 +3610,10 @@ match_records(struct ftrace_hash *hash, char *func, int len, char *mod) | |||
3592 | goto out_unlock; | 3610 | goto out_unlock; |
3593 | 3611 | ||
3594 | do_for_each_ftrace_rec(pg, rec) { | 3612 | do_for_each_ftrace_rec(pg, rec) { |
3613 | |||
3614 | if (rec->flags & FTRACE_FL_DISABLED) | ||
3615 | continue; | ||
3616 | |||
3595 | if (ftrace_match_record(rec, &func_g, mod_match, exclude_mod)) { | 3617 | if (ftrace_match_record(rec, &func_g, mod_match, exclude_mod)) { |
3596 | ret = enter_record(hash, rec, clear_filter); | 3618 | ret = enter_record(hash, rec, clear_filter); |
3597 | if (ret < 0) { | 3619 | if (ret < 0) { |
@@ -3787,6 +3809,9 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, | |||
3787 | 3809 | ||
3788 | do_for_each_ftrace_rec(pg, rec) { | 3810 | do_for_each_ftrace_rec(pg, rec) { |
3789 | 3811 | ||
3812 | if (rec->flags & FTRACE_FL_DISABLED) | ||
3813 | continue; | ||
3814 | |||
3790 | if (!ftrace_match_record(rec, &func_g, NULL, 0)) | 3815 | if (!ftrace_match_record(rec, &func_g, NULL, 0)) |
3791 | continue; | 3816 | continue; |
3792 | 3817 | ||
@@ -4679,6 +4704,9 @@ ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer) | |||
4679 | 4704 | ||
4680 | do_for_each_ftrace_rec(pg, rec) { | 4705 | do_for_each_ftrace_rec(pg, rec) { |
4681 | 4706 | ||
4707 | if (rec->flags & FTRACE_FL_DISABLED) | ||
4708 | continue; | ||
4709 | |||
4682 | if (ftrace_match_record(rec, &func_g, NULL, 0)) { | 4710 | if (ftrace_match_record(rec, &func_g, NULL, 0)) { |
4683 | /* if it is in the array */ | 4711 | /* if it is in the array */ |
4684 | exists = false; | 4712 | exists = false; |
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index dade4c9559cc..8696ce6bf2f6 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
@@ -1047,7 +1047,7 @@ void disable_trace_on_warning(void) | |||
1047 | * | 1047 | * |
1048 | * Shows real state of the ring buffer if it is enabled or not. | 1048 | * Shows real state of the ring buffer if it is enabled or not. |
1049 | */ | 1049 | */ |
1050 | static int tracer_tracing_is_on(struct trace_array *tr) | 1050 | int tracer_tracing_is_on(struct trace_array *tr) |
1051 | { | 1051 | { |
1052 | if (tr->trace_buffer.buffer) | 1052 | if (tr->trace_buffer.buffer) |
1053 | return ring_buffer_record_is_on(tr->trace_buffer.buffer); | 1053 | return ring_buffer_record_is_on(tr->trace_buffer.buffer); |
@@ -4123,6 +4123,30 @@ static const char readme_msg[] = | |||
4123 | "\t\t\t traces\n" | 4123 | "\t\t\t traces\n" |
4124 | #endif | 4124 | #endif |
4125 | #endif /* CONFIG_STACK_TRACER */ | 4125 | #endif /* CONFIG_STACK_TRACER */ |
4126 | #ifdef CONFIG_KPROBE_EVENT | ||
4127 | " kprobe_events\t\t- Add/remove/show the kernel dynamic events\n" | ||
4128 | "\t\t\t Write into this file to define/undefine new trace events.\n" | ||
4129 | #endif | ||
4130 | #ifdef CONFIG_UPROBE_EVENT | ||
4131 | " uprobe_events\t\t- Add/remove/show the userspace dynamic events\n" | ||
4132 | "\t\t\t Write into this file to define/undefine new trace events.\n" | ||
4133 | #endif | ||
4134 | #if defined(CONFIG_KPROBE_EVENT) || defined(CONFIG_UPROBE_EVENT) | ||
4135 | "\t accepts: event-definitions (one definition per line)\n" | ||
4136 | "\t Format: p|r[:[<group>/]<event>] <place> [<args>]\n" | ||
4137 | "\t -:[<group>/]<event>\n" | ||
4138 | #ifdef CONFIG_KPROBE_EVENT | ||
4139 | "\t place: [<module>:]<symbol>[+<offset>]|<memaddr>\n" | ||
4140 | #endif | ||
4141 | #ifdef CONFIG_UPROBE_EVENT | ||
4142 | "\t place: <path>:<offset>\n" | ||
4143 | #endif | ||
4144 | "\t args: <name>=fetcharg[:type]\n" | ||
4145 | "\t fetcharg: %<register>, @<address>, @<symbol>[+|-<offset>],\n" | ||
4146 | "\t $stack<index>, $stack, $retval, $comm\n" | ||
4147 | "\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, string,\n" | ||
4148 | "\t b<bit-width>@<bit-offset>/<container-size>\n" | ||
4149 | #endif | ||
4126 | " events/\t\t- Directory containing all trace event subsystems:\n" | 4150 | " events/\t\t- Directory containing all trace event subsystems:\n" |
4127 | " enable\t\t- Write 0/1 to enable/disable tracing of all events\n" | 4151 | " enable\t\t- Write 0/1 to enable/disable tracing of all events\n" |
4128 | " events/<system>/\t- Directory containing all trace events for <system>:\n" | 4152 | " events/<system>/\t- Directory containing all trace events for <system>:\n" |
@@ -4945,7 +4969,7 @@ out: | |||
4945 | return ret; | 4969 | return ret; |
4946 | } | 4970 | } |
4947 | 4971 | ||
4948 | #ifdef CONFIG_TRACER_MAX_TRACE | 4972 | #if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) |
4949 | 4973 | ||
4950 | static ssize_t | 4974 | static ssize_t |
4951 | tracing_max_lat_read(struct file *filp, char __user *ubuf, | 4975 | tracing_max_lat_read(struct file *filp, char __user *ubuf, |
@@ -5124,19 +5148,20 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, | |||
5124 | struct trace_iterator *iter = filp->private_data; | 5148 | struct trace_iterator *iter = filp->private_data; |
5125 | ssize_t sret; | 5149 | ssize_t sret; |
5126 | 5150 | ||
5127 | /* return any leftover data */ | ||
5128 | sret = trace_seq_to_user(&iter->seq, ubuf, cnt); | ||
5129 | if (sret != -EBUSY) | ||
5130 | return sret; | ||
5131 | |||
5132 | trace_seq_init(&iter->seq); | ||
5133 | |||
5134 | /* | 5151 | /* |
5135 | * Avoid more than one consumer on a single file descriptor | 5152 | * Avoid more than one consumer on a single file descriptor |
5136 | * This is just a matter of traces coherency, the ring buffer itself | 5153 | * This is just a matter of traces coherency, the ring buffer itself |
5137 | * is protected. | 5154 | * is protected. |
5138 | */ | 5155 | */ |
5139 | mutex_lock(&iter->mutex); | 5156 | mutex_lock(&iter->mutex); |
5157 | |||
5158 | /* return any leftover data */ | ||
5159 | sret = trace_seq_to_user(&iter->seq, ubuf, cnt); | ||
5160 | if (sret != -EBUSY) | ||
5161 | goto out; | ||
5162 | |||
5163 | trace_seq_init(&iter->seq); | ||
5164 | |||
5140 | if (iter->trace->read) { | 5165 | if (iter->trace->read) { |
5141 | sret = iter->trace->read(iter, filp, ubuf, cnt, ppos); | 5166 | sret = iter->trace->read(iter, filp, ubuf, cnt, ppos); |
5142 | if (sret) | 5167 | if (sret) |
@@ -5867,7 +5892,7 @@ static const struct file_operations tracing_thresh_fops = { | |||
5867 | .llseek = generic_file_llseek, | 5892 | .llseek = generic_file_llseek, |
5868 | }; | 5893 | }; |
5869 | 5894 | ||
5870 | #ifdef CONFIG_TRACER_MAX_TRACE | 5895 | #if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) |
5871 | static const struct file_operations tracing_max_lat_fops = { | 5896 | static const struct file_operations tracing_max_lat_fops = { |
5872 | .open = tracing_open_generic, | 5897 | .open = tracing_open_generic, |
5873 | .read = tracing_max_lat_read, | 5898 | .read = tracing_max_lat_read, |
@@ -6163,9 +6188,6 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, | |||
6163 | return -EBUSY; | 6188 | return -EBUSY; |
6164 | #endif | 6189 | #endif |
6165 | 6190 | ||
6166 | if (splice_grow_spd(pipe, &spd)) | ||
6167 | return -ENOMEM; | ||
6168 | |||
6169 | if (*ppos & (PAGE_SIZE - 1)) | 6191 | if (*ppos & (PAGE_SIZE - 1)) |
6170 | return -EINVAL; | 6192 | return -EINVAL; |
6171 | 6193 | ||
@@ -6175,6 +6197,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, | |||
6175 | len &= PAGE_MASK; | 6197 | len &= PAGE_MASK; |
6176 | } | 6198 | } |
6177 | 6199 | ||
6200 | if (splice_grow_spd(pipe, &spd)) | ||
6201 | return -ENOMEM; | ||
6202 | |||
6178 | again: | 6203 | again: |
6179 | trace_access_lock(iter->cpu_file); | 6204 | trace_access_lock(iter->cpu_file); |
6180 | entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file); | 6205 | entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file); |
@@ -6232,19 +6257,21 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, | |||
6232 | /* did we read anything? */ | 6257 | /* did we read anything? */ |
6233 | if (!spd.nr_pages) { | 6258 | if (!spd.nr_pages) { |
6234 | if (ret) | 6259 | if (ret) |
6235 | return ret; | 6260 | goto out; |
6236 | 6261 | ||
6262 | ret = -EAGAIN; | ||
6237 | if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) | 6263 | if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) |
6238 | return -EAGAIN; | 6264 | goto out; |
6239 | 6265 | ||
6240 | ret = wait_on_pipe(iter, true); | 6266 | ret = wait_on_pipe(iter, true); |
6241 | if (ret) | 6267 | if (ret) |
6242 | return ret; | 6268 | goto out; |
6243 | 6269 | ||
6244 | goto again; | 6270 | goto again; |
6245 | } | 6271 | } |
6246 | 6272 | ||
6247 | ret = splice_to_pipe(pipe, &spd); | 6273 | ret = splice_to_pipe(pipe, &spd); |
6274 | out: | ||
6248 | splice_shrink_spd(&spd); | 6275 | splice_shrink_spd(&spd); |
6249 | 6276 | ||
6250 | return ret; | 6277 | return ret; |
@@ -7195,7 +7222,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) | |||
7195 | 7222 | ||
7196 | create_trace_options_dir(tr); | 7223 | create_trace_options_dir(tr); |
7197 | 7224 | ||
7198 | #ifdef CONFIG_TRACER_MAX_TRACE | 7225 | #if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) |
7199 | trace_create_file("tracing_max_latency", 0644, d_tracer, | 7226 | trace_create_file("tracing_max_latency", 0644, d_tracer, |
7200 | &tr->max_latency, &tracing_max_lat_fops); | 7227 | &tr->max_latency, &tracing_max_lat_fops); |
7201 | #endif | 7228 | #endif |
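The readme_msg block added above documents the kprobe_events/uprobe_events definition syntax, including the new hexadecimal x8/x16/x32/x64 types (wired up in trace_kprobe.c below) and the $comm fetcharg. A small userspace sketch that writes one definition in that format; the tracefs mount point, the probed symbol (do_sys_open) and the register used for the argument are illustrative assumptions:

/* Sketch only: append a kprobe event definition using the documented
 * "p[:[<group>/]<event>] <place> [<args>]" format. Paths, symbol and
 * the %si register are assumptions chosen for illustration.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *def =
		"p:mygrp/open do_sys_open flags=%si:x32 comm=$comm\n";
	int fd = open("/sys/kernel/debug/tracing/kprobe_events",
		      O_WRONLY | O_APPEND);

	if (fd < 0) {
		perror("kprobe_events");
		return 1;
	}
	if (write(fd, def, strlen(def)) < 0)
		perror("write");
	close(fd);
	return 0;
}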
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f783df416726..fd24b1f9ac43 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
@@ -38,6 +38,7 @@ enum trace_type { | |||
38 | TRACE_USER_STACK, | 38 | TRACE_USER_STACK, |
39 | TRACE_BLK, | 39 | TRACE_BLK, |
40 | TRACE_BPUTS, | 40 | TRACE_BPUTS, |
41 | TRACE_HWLAT, | ||
41 | 42 | ||
42 | __TRACE_LAST_TYPE, | 43 | __TRACE_LAST_TYPE, |
43 | }; | 44 | }; |
@@ -213,6 +214,8 @@ struct trace_array { | |||
213 | */ | 214 | */ |
214 | struct trace_buffer max_buffer; | 215 | struct trace_buffer max_buffer; |
215 | bool allocated_snapshot; | 216 | bool allocated_snapshot; |
217 | #endif | ||
218 | #if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) | ||
216 | unsigned long max_latency; | 219 | unsigned long max_latency; |
217 | #endif | 220 | #endif |
218 | struct trace_pid_list __rcu *filtered_pids; | 221 | struct trace_pid_list __rcu *filtered_pids; |
@@ -326,6 +329,7 @@ extern void __ftrace_bad_type(void); | |||
326 | IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ | 329 | IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ |
327 | IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ | 330 | IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ |
328 | IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS); \ | 331 | IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS); \ |
332 | IF_ASSIGN(var, ent, struct hwlat_entry, TRACE_HWLAT); \ | ||
329 | IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ | 333 | IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ |
330 | TRACE_MMIO_RW); \ | 334 | TRACE_MMIO_RW); \ |
331 | IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ | 335 | IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ |
@@ -571,6 +575,7 @@ void tracing_reset_current(int cpu); | |||
571 | void tracing_reset_all_online_cpus(void); | 575 | void tracing_reset_all_online_cpus(void); |
572 | int tracing_open_generic(struct inode *inode, struct file *filp); | 576 | int tracing_open_generic(struct inode *inode, struct file *filp); |
573 | bool tracing_is_disabled(void); | 577 | bool tracing_is_disabled(void); |
578 | int tracer_tracing_is_on(struct trace_array *tr); | ||
574 | struct dentry *trace_create_file(const char *name, | 579 | struct dentry *trace_create_file(const char *name, |
575 | umode_t mode, | 580 | umode_t mode, |
576 | struct dentry *parent, | 581 | struct dentry *parent, |
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 5c30efcda5e6..d1cc37e78f99 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h | |||
@@ -322,3 +322,30 @@ FTRACE_ENTRY(branch, trace_branch, | |||
322 | FILTER_OTHER | 322 | FILTER_OTHER |
323 | ); | 323 | ); |
324 | 324 | ||
325 | |||
326 | FTRACE_ENTRY(hwlat, hwlat_entry, | ||
327 | |||
328 | TRACE_HWLAT, | ||
329 | |||
330 | F_STRUCT( | ||
331 | __field( u64, duration ) | ||
332 | __field( u64, outer_duration ) | ||
333 | __field( u64, nmi_total_ts ) | ||
334 | __field_struct( struct timespec, timestamp ) | ||
335 | __field_desc( long, timestamp, tv_sec ) | ||
336 | __field_desc( long, timestamp, tv_nsec ) | ||
337 | __field( unsigned int, nmi_count ) | ||
338 | __field( unsigned int, seqnum ) | ||
339 | ), | ||
340 | |||
341 | F_printk("cnt:%u\tts:%010lu.%010lu\tinner:%llu\touter:%llu\tnmi-ts:%llu\tnmi-count:%u\n", | ||
342 | __entry->seqnum, | ||
343 | __entry->tv_sec, | ||
344 | __entry->tv_nsec, | ||
345 | __entry->duration, | ||
346 | __entry->outer_duration, | ||
347 | __entry->nmi_total_ts, | ||
348 | __entry->nmi_count), | ||
349 | |||
350 | FILTER_OTHER | ||
351 | ); | ||
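The FTRACE_ENTRY() definition above is what generates the hwlat_entry record logged by the new tracer. As a reading aid, the F_STRUCT() fields correspond roughly to the following structure; this is a sketch of the macro expansion, with the common trace_entry header implied by FTRACE_ENTRY() rather than spelled out in the diff:

/* Approximate expansion of FTRACE_ENTRY(hwlat, hwlat_entry, ...):
 * one latency sample per record.
 */
struct hwlat_entry {
	struct trace_entry	ent;		/* common header added by the macro */
	u64			duration;	/* worst inner-loop delta (us) */
	u64			outer_duration;	/* worst outer-loop delta (us) */
	u64			nmi_total_ts;	/* time spent in NMIs during the sample */
	struct timespec		timestamp;	/* wall time of the sample */
	unsigned int		nmi_count;	/* NMIs seen during the sample */
	unsigned int		seqnum;		/* sample sequence number */
};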
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index a975571cde24..6721a1e89f39 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c | |||
@@ -1028,6 +1028,7 @@ static struct event_command trigger_traceon_cmd = { | |||
1028 | static struct event_command trigger_traceoff_cmd = { | 1028 | static struct event_command trigger_traceoff_cmd = { |
1029 | .name = "traceoff", | 1029 | .name = "traceoff", |
1030 | .trigger_type = ETT_TRACE_ONOFF, | 1030 | .trigger_type = ETT_TRACE_ONOFF, |
1031 | .flags = EVENT_CMD_FL_POST_TRIGGER, | ||
1031 | .func = event_trigger_callback, | 1032 | .func = event_trigger_callback, |
1032 | .reg = register_trigger, | 1033 | .reg = register_trigger, |
1033 | .unreg = unregister_trigger, | 1034 | .unreg = unregister_trigger, |
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 7363ccf79512..4e480e870474 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c | |||
@@ -119,7 +119,7 @@ print_graph_duration(struct trace_array *tr, unsigned long long duration, | |||
119 | /* Add a function return address to the trace stack on thread info.*/ | 119 | /* Add a function return address to the trace stack on thread info.*/ |
120 | int | 120 | int |
121 | ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, | 121 | ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, |
122 | unsigned long frame_pointer) | 122 | unsigned long frame_pointer, unsigned long *retp) |
123 | { | 123 | { |
124 | unsigned long long calltime; | 124 | unsigned long long calltime; |
125 | int index; | 125 | int index; |
@@ -170,8 +170,12 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, | |||
170 | current->ret_stack[index].ret = ret; | 170 | current->ret_stack[index].ret = ret; |
171 | current->ret_stack[index].func = func; | 171 | current->ret_stack[index].func = func; |
172 | current->ret_stack[index].calltime = calltime; | 172 | current->ret_stack[index].calltime = calltime; |
173 | current->ret_stack[index].subtime = 0; | 173 | #ifdef HAVE_FUNCTION_GRAPH_FP_TEST |
174 | current->ret_stack[index].fp = frame_pointer; | 174 | current->ret_stack[index].fp = frame_pointer; |
175 | #endif | ||
176 | #ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR | ||
177 | current->ret_stack[index].retp = retp; | ||
178 | #endif | ||
175 | *depth = current->curr_ret_stack; | 179 | *depth = current->curr_ret_stack; |
176 | 180 | ||
177 | return 0; | 181 | return 0; |
@@ -204,7 +208,7 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, | |||
204 | return; | 208 | return; |
205 | } | 209 | } |
206 | 210 | ||
207 | #if defined(CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST) && !defined(CC_USING_FENTRY) | 211 | #ifdef HAVE_FUNCTION_GRAPH_FP_TEST |
208 | /* | 212 | /* |
209 | * The arch may choose to record the frame pointer used | 213 | * The arch may choose to record the frame pointer used |
210 | * and check it here to make sure that it is what we expect it | 214 | * and check it here to make sure that it is what we expect it |
@@ -279,6 +283,64 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer) | |||
279 | return ret; | 283 | return ret; |
280 | } | 284 | } |
281 | 285 | ||
286 | /** | ||
287 | * ftrace_graph_ret_addr - convert a potentially modified stack return address | ||
288 | * to its original value | ||
289 | * | ||
290 | * This function can be called by stack unwinding code to convert a found stack | ||
291 | * return address ('ret') to its original value, in case the function graph | ||
292 | * tracer has modified it to be 'return_to_handler'. If the address hasn't | ||
293 | * been modified, the unchanged value of 'ret' is returned. | ||
294 | * | ||
295 | * 'idx' is a state variable which should be initialized by the caller to zero | ||
296 | * before the first call. | ||
297 | * | ||
298 | * 'retp' is a pointer to the return address on the stack. It's ignored if | ||
299 | * the arch doesn't have HAVE_FUNCTION_GRAPH_RET_ADDR_PTR defined. | ||
300 | */ | ||
301 | #ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR | ||
302 | unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, | ||
303 | unsigned long ret, unsigned long *retp) | ||
304 | { | ||
305 | int index = task->curr_ret_stack; | ||
306 | int i; | ||
307 | |||
308 | if (ret != (unsigned long)return_to_handler) | ||
309 | return ret; | ||
310 | |||
311 | if (index < -1) | ||
312 | index += FTRACE_NOTRACE_DEPTH; | ||
313 | |||
314 | if (index < 0) | ||
315 | return ret; | ||
316 | |||
317 | for (i = 0; i <= index; i++) | ||
318 | if (task->ret_stack[i].retp == retp) | ||
319 | return task->ret_stack[i].ret; | ||
320 | |||
321 | return ret; | ||
322 | } | ||
323 | #else /* !HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */ | ||
324 | unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, | ||
325 | unsigned long ret, unsigned long *retp) | ||
326 | { | ||
327 | int task_idx; | ||
328 | |||
329 | if (ret != (unsigned long)return_to_handler) | ||
330 | return ret; | ||
331 | |||
332 | task_idx = task->curr_ret_stack; | ||
333 | |||
334 | if (!task->ret_stack || task_idx < *idx) | ||
335 | return ret; | ||
336 | |||
337 | task_idx -= *idx; | ||
338 | (*idx)++; | ||
339 | |||
340 | return task->ret_stack[task_idx].ret; | ||
341 | } | ||
342 | #endif /* HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */ | ||
343 | |||
282 | int __trace_graph_entry(struct trace_array *tr, | 344 | int __trace_graph_entry(struct trace_array *tr, |
283 | struct ftrace_graph_ent *trace, | 345 | struct ftrace_graph_ent *trace, |
284 | unsigned long flags, | 346 | unsigned long flags, |
@@ -1120,6 +1182,11 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, | |||
1120 | trace_seq_puts(s, "/* "); | 1182 | trace_seq_puts(s, "/* "); |
1121 | 1183 | ||
1122 | switch (iter->ent->type) { | 1184 | switch (iter->ent->type) { |
1185 | case TRACE_BPUTS: | ||
1186 | ret = trace_print_bputs_msg_only(iter); | ||
1187 | if (ret != TRACE_TYPE_HANDLED) | ||
1188 | return ret; | ||
1189 | break; | ||
1123 | case TRACE_BPRINT: | 1190 | case TRACE_BPRINT: |
1124 | ret = trace_print_bprintk_msg_only(iter); | 1191 | ret = trace_print_bprintk_msg_only(iter); |
1125 | if (ret != TRACE_TYPE_HANDLED) | 1192 | if (ret != TRACE_TYPE_HANDLED) |
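ftrace_graph_ret_addr(), added above, is aimed at arch stack unwinders: given a return address found on the stack (and, on HAVE_FUNCTION_GRAPH_RET_ADDR_PTR architectures, a pointer to its stack slot), it hands back the original address that the function graph tracer replaced with return_to_handler. A hedged sketch of a caller; first_frame() and next_frame() are hypothetical placeholders for the arch's frame walking, only the ftrace_graph_ret_addr() usage follows the kernel-doc above:

/* graph_idx starts at zero and is carried across calls while walking
 * one task's stack, as described in the kernel-doc comment.
 */
static void dump_stack_sketch(struct task_struct *task)
{
	unsigned long *retp;
	unsigned long addr;
	int graph_idx = 0;

	for (retp = first_frame(task); retp; retp = next_frame(retp)) {
		addr = *retp;
		/* Undo the return_to_handler substitution, if any */
		addr = ftrace_graph_ret_addr(task, &graph_idx, addr, retp);
		printk("  %pS\n", (void *)addr);
	}
}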
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c new file mode 100644 index 000000000000..b97286c48735 --- /dev/null +++ b/kernel/trace/trace_hwlat.c | |||
@@ -0,0 +1,633 @@ | |||
1 | /* | ||
2 | * trace_hwlatdetect.c - A simple Hardware Latency detector. | ||
3 | * | ||
4 | * Use this tracer to detect large system latencies induced by the behavior of | ||
5 | * certain underlying system hardware or firmware, independent of Linux itself. | ||
6 | * The code was developed originally to detect the presence of SMIs on Intel | ||
7 | * and AMD systems, although there is no dependency upon x86 herein. | ||
8 | * | ||
9 | * The classical example usage of this tracer is in detecting the presence of | ||
10 | * SMIs or System Management Interrupts on Intel and AMD systems. An SMI is a | ||
11 | * somewhat special form of hardware interrupt spawned from earlier CPU debug | ||
12 | * modes in which the (BIOS/EFI/etc.) firmware arranges for the South Bridge | ||
13 | * LPC (or other device) to generate a special interrupt under certain | ||
14 | * circumstances, for example, upon expiration of a special SMI timer device, | ||
15 | * due to certain external thermal readings, on certain I/O address accesses, | ||
16 | * and other situations. An SMI hits a special CPU pin, triggers a special | ||
17 | * SMI mode (complete with special memory map), and the OS is unaware. | ||
18 | * | ||
19 | * Although certain hardware-inducing latencies are necessary (for example, | ||
20 | * a modern system often requires an SMI handler for correct thermal control | ||
21 | * and remote management) they can wreak havoc upon any OS-level performance | ||
22 | * guarantees toward low-latency, especially when the OS is not even made | ||
23 | * aware of the presence of these interrupts. For this reason, we need a | ||
24 | * somewhat brute force mechanism to detect these interrupts. In this case, | ||
25 | * we do it by hogging all of the CPU(s) for configurable timer intervals, | ||
26 | * sampling the built-in CPU timer, looking for discontiguous readings. | ||
27 | * | ||
28 | * WARNING: This implementation necessarily introduces latencies. Therefore, | ||
29 | * you should NEVER use this tracer while running in a production | ||
30 | * environment requiring any kind of low-latency performance | ||
31 | * guarantee(s). | ||
32 | * | ||
33 | * Copyright (C) 2008-2009 Jon Masters, Red Hat, Inc. <jcm@redhat.com> | ||
34 | * Copyright (C) 2013-2016 Steven Rostedt, Red Hat, Inc. <srostedt@redhat.com> | ||
35 | * | ||
36 | * Includes useful feedback from Clark Williams <clark@redhat.com> | ||
37 | * | ||
38 | * This file is licensed under the terms of the GNU General Public | ||
39 | * License version 2. This program is licensed "as is" without any | ||
40 | * warranty of any kind, whether express or implied. | ||
41 | */ | ||
42 | #include <linux/kthread.h> | ||
43 | #include <linux/tracefs.h> | ||
44 | #include <linux/uaccess.h> | ||
45 | #include <linux/cpumask.h> | ||
46 | #include <linux/delay.h> | ||
47 | #include "trace.h" | ||
48 | |||
49 | static struct trace_array *hwlat_trace; | ||
50 | |||
51 | #define U64STR_SIZE 22 /* 20 digits max */ | ||
52 | |||
53 | #define BANNER "hwlat_detector: " | ||
54 | #define DEFAULT_SAMPLE_WINDOW 1000000 /* 1s */ | ||
55 | #define DEFAULT_SAMPLE_WIDTH 500000 /* 0.5s */ | ||
56 | #define DEFAULT_LAT_THRESHOLD 10 /* 10us */ | ||
57 | |||
58 | /* sampling thread */ | ||
59 | static struct task_struct *hwlat_kthread; | ||
60 | |||
61 | static struct dentry *hwlat_sample_width; /* sample width us */ | ||
62 | static struct dentry *hwlat_sample_window; /* sample window us */ | ||
63 | |||
64 | /* Save the previous tracing_thresh value */ | ||
65 | static unsigned long save_tracing_thresh; | ||
66 | |||
67 | /* NMI timestamp counters */ | ||
68 | static u64 nmi_ts_start; | ||
69 | static u64 nmi_total_ts; | ||
70 | static int nmi_count; | ||
71 | static int nmi_cpu; | ||
72 | |||
73 | /* Tells NMIs to call back to the hwlat tracer to record timestamps */ | ||
74 | bool trace_hwlat_callback_enabled; | ||
75 | |||
76 | /* If the user changed threshold, remember it */ | ||
77 | static u64 last_tracing_thresh = DEFAULT_LAT_THRESHOLD * NSEC_PER_USEC; | ||
78 | |||
79 | /* Individual latency samples are stored here when detected. */ | ||
80 | struct hwlat_sample { | ||
81 | u64 seqnum; /* unique sequence */ | ||
82 | u64 duration; /* delta */ | ||
83 | u64 outer_duration; /* delta (outer loop) */ | ||
84 | u64 nmi_total_ts; /* Total time spent in NMIs */ | ||
85 | struct timespec timestamp; /* wall time */ | ||
86 | int nmi_count; /* # NMIs during this sample */ | ||
87 | }; | ||
88 | |||
89 | /* keep the global state somewhere. */ | ||
90 | static struct hwlat_data { | ||
91 | |||
92 | struct mutex lock; /* protect changes */ | ||
93 | |||
94 | u64 count; /* total since reset */ | ||
95 | |||
96 | u64 sample_window; /* total sampling window (on+off) */ | ||
97 | u64 sample_width; /* active sampling portion of window */ | ||
98 | |||
99 | } hwlat_data = { | ||
100 | .sample_window = DEFAULT_SAMPLE_WINDOW, | ||
101 | .sample_width = DEFAULT_SAMPLE_WIDTH, | ||
102 | }; | ||
103 | |||
104 | static void trace_hwlat_sample(struct hwlat_sample *sample) | ||
105 | { | ||
106 | struct trace_array *tr = hwlat_trace; | ||
107 | struct trace_event_call *call = &event_hwlat; | ||
108 | struct ring_buffer *buffer = tr->trace_buffer.buffer; | ||
109 | struct ring_buffer_event *event; | ||
110 | struct hwlat_entry *entry; | ||
111 | unsigned long flags; | ||
112 | int pc; | ||
113 | |||
114 | pc = preempt_count(); | ||
115 | local_save_flags(flags); | ||
116 | |||
117 | event = trace_buffer_lock_reserve(buffer, TRACE_HWLAT, sizeof(*entry), | ||
118 | flags, pc); | ||
119 | if (!event) | ||
120 | return; | ||
121 | entry = ring_buffer_event_data(event); | ||
122 | entry->seqnum = sample->seqnum; | ||
123 | entry->duration = sample->duration; | ||
124 | entry->outer_duration = sample->outer_duration; | ||
125 | entry->timestamp = sample->timestamp; | ||
126 | entry->nmi_total_ts = sample->nmi_total_ts; | ||
127 | entry->nmi_count = sample->nmi_count; | ||
128 | |||
129 | if (!call_filter_check_discard(call, entry, buffer, event)) | ||
130 | __buffer_unlock_commit(buffer, event); | ||
131 | } | ||
132 | |||
133 | /* Macros to encapsulate the time capturing infrastructure */ | ||
134 | #define time_type u64 | ||
135 | #define time_get() trace_clock_local() | ||
136 | #define time_to_us(x) div_u64(x, 1000) | ||
137 | #define time_sub(a, b) ((a) - (b)) | ||
138 | #define init_time(a, b) (a = b) | ||
139 | #define time_u64(a) a | ||
140 | |||
141 | void trace_hwlat_callback(bool enter) | ||
142 | { | ||
143 | if (smp_processor_id() != nmi_cpu) | ||
144 | return; | ||
145 | |||
146 | /* | ||
147 | * Currently trace_clock_local() calls sched_clock() and the | ||
148 | * generic version is not NMI safe. | ||
149 | */ | ||
150 | if (!IS_ENABLED(CONFIG_GENERIC_SCHED_CLOCK)) { | ||
151 | if (enter) | ||
152 | nmi_ts_start = time_get(); | ||
153 | else | ||
154 | nmi_total_ts = time_get() - nmi_ts_start; | ||
155 | } | ||
156 | |||
157 | if (enter) | ||
158 | nmi_count++; | ||
159 | } | ||
160 | |||
161 | /** | ||
162 | * get_sample - sample the CPU TSC and look for likely hardware latencies | ||
163 | * | ||
164 | * Used to repeatedly capture the CPU TSC (or similar), looking for potential | ||
165 | * hardware-induced latency. Called with interrupts disabled and with | ||
166 | * hwlat_data.lock held. | ||
167 | */ | ||
168 | static int get_sample(void) | ||
169 | { | ||
170 | struct trace_array *tr = hwlat_trace; | ||
171 | time_type start, t1, t2, last_t2; | ||
172 | s64 diff, total, last_total = 0; | ||
173 | u64 sample = 0; | ||
174 | u64 thresh = tracing_thresh; | ||
175 | u64 outer_sample = 0; | ||
176 | int ret = -1; | ||
177 | |||
178 | do_div(thresh, NSEC_PER_USEC); /* modifies interval value */ | ||
179 | |||
180 | nmi_cpu = smp_processor_id(); | ||
181 | nmi_total_ts = 0; | ||
182 | nmi_count = 0; | ||
183 | /* Make sure NMIs see this first */ | ||
184 | barrier(); | ||
185 | |||
186 | trace_hwlat_callback_enabled = true; | ||
187 | |||
188 | init_time(last_t2, 0); | ||
189 | start = time_get(); /* start timestamp */ | ||
190 | |||
191 | do { | ||
192 | |||
193 | t1 = time_get(); /* we'll look for a discontinuity */ | ||
194 | t2 = time_get(); | ||
195 | |||
196 | if (time_u64(last_t2)) { | ||
197 | /* Check the delta from outer loop (t2 to next t1) */ | ||
198 | diff = time_to_us(time_sub(t1, last_t2)); | ||
199 | /* This shouldn't happen */ | ||
200 | if (diff < 0) { | ||
201 | pr_err(BANNER "time running backwards\n"); | ||
202 | goto out; | ||
203 | } | ||
204 | if (diff > outer_sample) | ||
205 | outer_sample = diff; | ||
206 | } | ||
207 | last_t2 = t2; | ||
208 | |||
209 | total = time_to_us(time_sub(t2, start)); /* sample width */ | ||
210 | |||
211 | /* Check for possible overflows */ | ||
212 | if (total < last_total) { | ||
213 | pr_err("Time total overflowed\n"); | ||
214 | break; | ||
215 | } | ||
216 | last_total = total; | ||
217 | |||
218 | /* This checks the inner loop (t1 to t2) */ | ||
219 | diff = time_to_us(time_sub(t2, t1)); /* current diff */ | ||
220 | |||
221 | /* This shouldn't happen */ | ||
222 | if (diff < 0) { | ||
223 | pr_err(BANNER "time running backwards\n"); | ||
224 | goto out; | ||
225 | } | ||
226 | |||
227 | if (diff > sample) | ||
228 | sample = diff; /* only want highest value */ | ||
229 | |||
230 | } while (total <= hwlat_data.sample_width); | ||
231 | |||
232 | barrier(); /* finish the above in the view for NMIs */ | ||
233 | trace_hwlat_callback_enabled = false; | ||
234 | barrier(); /* Make sure nmi_total_ts is no longer updated */ | ||
235 | |||
236 | ret = 0; | ||
237 | |||
238 | /* If we exceed the threshold value, we have found a hardware latency */ | ||
239 | if (sample > thresh || outer_sample > thresh) { | ||
240 | struct hwlat_sample s; | ||
241 | |||
242 | ret = 1; | ||
243 | |||
244 | /* We read in microseconds */ | ||
245 | if (nmi_total_ts) | ||
246 | do_div(nmi_total_ts, NSEC_PER_USEC); | ||
247 | |||
248 | hwlat_data.count++; | ||
249 | s.seqnum = hwlat_data.count; | ||
250 | s.duration = sample; | ||
251 | s.outer_duration = outer_sample; | ||
252 | s.timestamp = CURRENT_TIME; | ||
253 | s.nmi_total_ts = nmi_total_ts; | ||
254 | s.nmi_count = nmi_count; | ||
255 | trace_hwlat_sample(&s); | ||
256 | |||
257 | /* Keep a running maximum ever recorded hardware latency */ | ||
258 | if (sample > tr->max_latency) | ||
259 | tr->max_latency = sample; | ||
260 | } | ||
261 | |||
262 | out: | ||
263 | return ret; | ||
264 | } | ||
265 | |||
266 | static struct cpumask save_cpumask; | ||
267 | static bool disable_migrate; | ||
268 | |||
269 | static void move_to_next_cpu(void) | ||
270 | { | ||
271 | static struct cpumask *current_mask; | ||
272 | int next_cpu; | ||
273 | |||
274 | if (disable_migrate) | ||
275 | return; | ||
276 | |||
277 | /* Just pick the first CPU on first iteration */ | ||
278 | if (!current_mask) { | ||
279 | current_mask = &save_cpumask; | ||
280 | get_online_cpus(); | ||
281 | cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask); | ||
282 | put_online_cpus(); | ||
283 | next_cpu = cpumask_first(current_mask); | ||
284 | goto set_affinity; | ||
285 | } | ||
286 | |||
287 | /* | ||
288 | * If for some reason the user modifies the CPU affinity | ||
289 | * of this thread, then stop migrating for the duration | ||
290 | * of the current test. | ||
291 | */ | ||
292 | if (!cpumask_equal(current_mask, ¤t->cpus_allowed)) | ||
293 | goto disable; | ||
294 | |||
295 | get_online_cpus(); | ||
296 | cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask); | ||
297 | next_cpu = cpumask_next(smp_processor_id(), current_mask); | ||
298 | put_online_cpus(); | ||
299 | |||
300 | if (next_cpu >= nr_cpu_ids) | ||
301 | next_cpu = cpumask_first(current_mask); | ||
302 | |||
303 | set_affinity: | ||
304 | if (next_cpu >= nr_cpu_ids) /* Shouldn't happen! */ | ||
305 | goto disable; | ||
306 | |||
307 | cpumask_clear(current_mask); | ||
308 | cpumask_set_cpu(next_cpu, current_mask); | ||
309 | |||
310 | sched_setaffinity(0, current_mask); | ||
311 | return; | ||
312 | |||
313 | disable: | ||
314 | disable_migrate = true; | ||
315 | } | ||
316 | |||
317 | /* | ||
318 | * kthread_fn - The CPU time sampling/hardware latency detection kernel thread | ||
319 | * | ||
320 | * Used to periodically sample the CPU TSC via a call to get_sample. We | ||
321 | * disable interrupts, which does (intentionally) introduce latency since we | ||
322 | * need to ensure nothing else might be running (and thus preempting). | ||
323 | * Obviously this should never be used in production environments. | ||
324 | * | ||
325 | * Currently this runs on whichever CPU it was scheduled on, but most | ||
326 | * real-world hardware latency situations occur across several CPUs, | ||
327 | * but we might later generalize this if we find there are any actual | ||
328 | * systems with alternate SMI delivery or other hardware latencies. | ||
329 | */ | ||
330 | static int kthread_fn(void *data) | ||
331 | { | ||
332 | u64 interval; | ||
333 | |||
334 | while (!kthread_should_stop()) { | ||
335 | |||
336 | move_to_next_cpu(); | ||
337 | |||
338 | local_irq_disable(); | ||
339 | get_sample(); | ||
340 | local_irq_enable(); | ||
341 | |||
342 | mutex_lock(&hwlat_data.lock); | ||
343 | interval = hwlat_data.sample_window - hwlat_data.sample_width; | ||
344 | mutex_unlock(&hwlat_data.lock); | ||
345 | |||
346 | do_div(interval, USEC_PER_MSEC); /* modifies interval value */ | ||
347 | |||
348 | /* Always sleep for at least 1ms */ | ||
349 | if (interval < 1) | ||
350 | interval = 1; | ||
351 | |||
352 | if (msleep_interruptible(interval)) | ||
353 | break; | ||
354 | } | ||
355 | |||
356 | return 0; | ||
357 | } | ||
358 | |||
359 | /** | ||
360 | * start_kthread - Kick off the hardware latency sampling/detector kthread | ||
361 | * | ||
362 | * This starts the kernel thread that will sit and sample the CPU timestamp | ||
363 | * counter (TSC or similar) and look for potential hardware latencies. | ||
364 | */ | ||
365 | static int start_kthread(struct trace_array *tr) | ||
366 | { | ||
367 | struct task_struct *kthread; | ||
368 | |||
369 | kthread = kthread_create(kthread_fn, NULL, "hwlatd"); | ||
370 | if (IS_ERR(kthread)) { | ||
371 | pr_err(BANNER "could not start sampling thread\n"); | ||
372 | return -ENOMEM; | ||
373 | } | ||
374 | hwlat_kthread = kthread; | ||
375 | wake_up_process(kthread); | ||
376 | |||
377 | return 0; | ||
378 | } | ||
379 | |||
380 | /** | ||
381 | * stop_kthread - Inform the hardware latency sampling/detector kthread to stop | ||
382 | * | ||
383 | * This kicks the running hardware latency sampling/detector kernel thread and | ||
384 | * tells it to stop sampling now. Use this on unload and at system shutdown. | ||
385 | */ | ||
386 | static void stop_kthread(void) | ||
387 | { | ||
388 | if (!hwlat_kthread) | ||
389 | return; | ||
390 | kthread_stop(hwlat_kthread); | ||
391 | hwlat_kthread = NULL; | ||
392 | } | ||
393 | |||
394 | /* | ||
395 | * hwlat_read - Wrapper read function for reading both window and width | ||
396 | * @filp: The active open file structure | ||
397 | * @ubuf: The userspace provided buffer to read value into | ||
398 | * @cnt: The maximum number of bytes to read | ||
399 | * @ppos: The current "file" position | ||
400 | * | ||
401 | * This function provides a generic read implementation for the global state | ||
402 | * "hwlat_data" structure filesystem entries. | ||
403 | */ | ||
404 | static ssize_t hwlat_read(struct file *filp, char __user *ubuf, | ||
405 | size_t cnt, loff_t *ppos) | ||
406 | { | ||
407 | char buf[U64STR_SIZE]; | ||
408 | u64 *entry = filp->private_data; | ||
409 | u64 val; | ||
410 | int len; | ||
411 | |||
412 | if (!entry) | ||
413 | return -EFAULT; | ||
414 | |||
415 | if (cnt > sizeof(buf)) | ||
416 | cnt = sizeof(buf); | ||
417 | |||
418 | val = *entry; | ||
419 | |||
420 | len = snprintf(buf, sizeof(buf), "%llu\n", val); | ||
421 | |||
422 | return simple_read_from_buffer(ubuf, cnt, ppos, buf, len); | ||
423 | } | ||
424 | |||
425 | /** | ||
426 | * hwlat_width_write - Write function for "width" entry | ||
427 | * @filp: The active open file structure | ||
428 | * @ubuf: The user buffer that contains the value to write | ||
429 | * @cnt: The maximum number of bytes to write to "file" | ||
430 | * @ppos: The current position in @file | ||
431 | * | ||
432 | * This function provides a write implementation for the "width" interface | ||
433 | * to the hardware latency detector. It can be used to configure | ||
434 | * for how many us of the total window we will actively sample for any | ||
435 | * hardware-induced latency periods. Obviously, it is not possible to | ||
436 | * sample constantly and have the system respond to a sample reader, or, | ||
437 | * worse, without having the system appear to have gone out to lunch. It | ||
438 | * is enforced that width is less than the total window size. | ||
439 | */ | ||
440 | static ssize_t | ||
441 | hwlat_width_write(struct file *filp, const char __user *ubuf, | ||
442 | size_t cnt, loff_t *ppos) | ||
443 | { | ||
444 | u64 val; | ||
445 | int err; | ||
446 | |||
447 | err = kstrtoull_from_user(ubuf, cnt, 10, &val); | ||
448 | if (err) | ||
449 | return err; | ||
450 | |||
451 | mutex_lock(&hwlat_data.lock); | ||
452 | if (val < hwlat_data.sample_window) | ||
453 | hwlat_data.sample_width = val; | ||
454 | else | ||
455 | err = -EINVAL; | ||
456 | mutex_unlock(&hwlat_data.lock); | ||
457 | |||
458 | if (err) | ||
459 | return err; | ||
460 | |||
461 | return cnt; | ||
462 | } | ||
463 | |||
464 | /** | ||
465 | * hwlat_window_write - Write function for "window" entry | ||
466 | * @filp: The active open file structure | ||
467 | * @ubuf: The user buffer that contains the value to write | ||
468 | * @cnt: The maximum number of bytes to write to "file" | ||
469 | * @ppos: The current position in @file | ||
470 | * | ||
471 | * This function provides a write implementation for the "window" interface | ||
472 | * to the hardware latency detector. The window is the total time | ||
473 | * in us that will be considered one sample period. Conceptually, windows | ||
474 | * occur back-to-back and contain a sample width period during which | ||
475 | * actual sampling occurs. Can be used to write a new total window size. It | ||
476 | * is enforced that any value written must be greater than the sample width | ||
477 | * size, or an error results. | ||
478 | */ | ||
479 | static ssize_t | ||
480 | hwlat_window_write(struct file *filp, const char __user *ubuf, | ||
481 | size_t cnt, loff_t *ppos) | ||
482 | { | ||
483 | u64 val; | ||
484 | int err; | ||
485 | |||
486 | err = kstrtoull_from_user(ubuf, cnt, 10, &val); | ||
487 | if (err) | ||
488 | return err; | ||
489 | |||
490 | mutex_lock(&hwlat_data.lock); | ||
491 | if (hwlat_data.sample_width < val) | ||
492 | hwlat_data.sample_window = val; | ||
493 | else | ||
494 | err = -EINVAL; | ||
495 | mutex_unlock(&hwlat_data.lock); | ||
496 | |||
497 | if (err) | ||
498 | return err; | ||
499 | |||
500 | return cnt; | ||
501 | } | ||
502 | |||
503 | static const struct file_operations width_fops = { | ||
504 | .open = tracing_open_generic, | ||
505 | .read = hwlat_read, | ||
506 | .write = hwlat_width_write, | ||
507 | }; | ||
508 | |||
509 | static const struct file_operations window_fops = { | ||
510 | .open = tracing_open_generic, | ||
511 | .read = hwlat_read, | ||
512 | .write = hwlat_window_write, | ||
513 | }; | ||
514 | |||
515 | /** | ||
516 | * init_tracefs - A function to initialize the tracefs interface files | ||
517 | * | ||
518 | * This function creates entries in tracefs for "hwlat_detector". | ||
519 | * It creates the hwlat_detector directory in the tracing directory, | ||
520 | * and within that directory are the width and window files to | ||
521 | * change and view those values. | ||
522 | */ | ||
523 | static int init_tracefs(void) | ||
524 | { | ||
525 | struct dentry *d_tracer; | ||
526 | struct dentry *top_dir; | ||
527 | |||
528 | d_tracer = tracing_init_dentry(); | ||
529 | if (IS_ERR(d_tracer)) | ||
530 | return -ENOMEM; | ||
531 | |||
532 | top_dir = tracefs_create_dir("hwlat_detector", d_tracer); | ||
533 | if (!top_dir) | ||
534 | return -ENOMEM; | ||
535 | |||
536 | hwlat_sample_window = tracefs_create_file("window", 0640, | ||
537 | top_dir, | ||
538 | &hwlat_data.sample_window, | ||
539 | &window_fops); | ||
540 | if (!hwlat_sample_window) | ||
541 | goto err; | ||
542 | |||
543 | hwlat_sample_width = tracefs_create_file("width", 0644, | ||
544 | top_dir, | ||
545 | &hwlat_data.sample_width, | ||
546 | &width_fops); | ||
547 | if (!hwlat_sample_width) | ||
548 | goto err; | ||
549 | |||
550 | return 0; | ||
551 | |||
552 | err: | ||
553 | tracefs_remove_recursive(top_dir); | ||
554 | return -ENOMEM; | ||
555 | } | ||
556 | |||
557 | static void hwlat_tracer_start(struct trace_array *tr) | ||
558 | { | ||
559 | int err; | ||
560 | |||
561 | err = start_kthread(tr); | ||
562 | if (err) | ||
563 | pr_err(BANNER "Cannot start hwlat kthread\n"); | ||
564 | } | ||
565 | |||
566 | static void hwlat_tracer_stop(struct trace_array *tr) | ||
567 | { | ||
568 | stop_kthread(); | ||
569 | } | ||
570 | |||
571 | static bool hwlat_busy; | ||
572 | |||
573 | static int hwlat_tracer_init(struct trace_array *tr) | ||
574 | { | ||
575 | /* Only allow one instance to enable this */ | ||
576 | if (hwlat_busy) | ||
577 | return -EBUSY; | ||
578 | |||
579 | hwlat_trace = tr; | ||
580 | |||
581 | disable_migrate = false; | ||
582 | hwlat_data.count = 0; | ||
583 | tr->max_latency = 0; | ||
584 | save_tracing_thresh = tracing_thresh; | ||
585 | |||
586 | /* tracing_thresh is in nsecs, we speak in usecs */ | ||
587 | if (!tracing_thresh) | ||
588 | tracing_thresh = last_tracing_thresh; | ||
589 | |||
590 | if (tracer_tracing_is_on(tr)) | ||
591 | hwlat_tracer_start(tr); | ||
592 | |||
593 | hwlat_busy = true; | ||
594 | |||
595 | return 0; | ||
596 | } | ||
597 | |||
598 | static void hwlat_tracer_reset(struct trace_array *tr) | ||
599 | { | ||
600 | stop_kthread(); | ||
601 | |||
602 | /* the tracing threshold is static between runs */ | ||
603 | last_tracing_thresh = tracing_thresh; | ||
604 | |||
605 | tracing_thresh = save_tracing_thresh; | ||
606 | hwlat_busy = false; | ||
607 | } | ||
608 | |||
609 | static struct tracer hwlat_tracer __read_mostly = | ||
610 | { | ||
611 | .name = "hwlat", | ||
612 | .init = hwlat_tracer_init, | ||
613 | .reset = hwlat_tracer_reset, | ||
614 | .start = hwlat_tracer_start, | ||
615 | .stop = hwlat_tracer_stop, | ||
616 | .allow_instances = true, | ||
617 | }; | ||
618 | |||
619 | __init static int init_hwlat_tracer(void) | ||
620 | { | ||
621 | int ret; | ||
622 | |||
623 | mutex_init(&hwlat_data.lock); | ||
624 | |||
625 | ret = register_tracer(&hwlat_tracer); | ||
626 | if (ret) | ||
627 | return ret; | ||
628 | |||
629 | init_tracefs(); | ||
630 | |||
631 | return 0; | ||
632 | } | ||
633 | late_initcall(init_hwlat_tracer); | ||
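
For context, the width and window files registered by init_tracefs() above are ordinary tracefs attributes and can be driven from userspace. A minimal sketch, assuming the conventional /sys/kernel/tracing mount point (older setups use /sys/kernel/debug/tracing) and values in microseconds:

    /* Configure and enable the hwlat tracer from userspace.
     * The tracefs mount point used below is an assumption. */
    #include <stdio.h>
    #include <stdlib.h>

    static void write_str(const char *path, const char *val)
    {
    	FILE *f = fopen(path, "w");

    	if (!f) {
    		perror(path);
    		exit(1);
    	}
    	fputs(val, f);
    	fclose(f);
    }

    int main(void)
    {
    	/* sample for 500ms out of every 1s window (values in usecs) */
    	write_str("/sys/kernel/tracing/hwlat_detector/width", "500000");
    	write_str("/sys/kernel/tracing/hwlat_detector/window", "1000000");
    	/* select the tracer itself */
    	write_str("/sys/kernel/tracing/current_tracer", "hwlat");
    	return 0;
    }

Note that the write order matters: width must stay below window, which is exactly what the -EINVAL checks in hwlat_width_write()/hwlat_window_write() above enforce.
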
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 9aedb0b06683..eb6c9f1d3a93 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
@@ -253,6 +253,10 @@ static const struct fetch_type kprobes_fetch_type_table[] = { | |||
253 | ASSIGN_FETCH_TYPE(s16, u16, 1), | 253 | ASSIGN_FETCH_TYPE(s16, u16, 1), |
254 | ASSIGN_FETCH_TYPE(s32, u32, 1), | 254 | ASSIGN_FETCH_TYPE(s32, u32, 1), |
255 | ASSIGN_FETCH_TYPE(s64, u64, 1), | 255 | ASSIGN_FETCH_TYPE(s64, u64, 1), |
256 | ASSIGN_FETCH_TYPE_ALIAS(x8, u8, u8, 0), | ||
257 | ASSIGN_FETCH_TYPE_ALIAS(x16, u16, u16, 0), | ||
258 | ASSIGN_FETCH_TYPE_ALIAS(x32, u32, u32, 0), | ||
259 | ASSIGN_FETCH_TYPE_ALIAS(x64, u64, u64, 0), | ||
256 | 260 | ||
257 | ASSIGN_FETCH_TYPE_END | 261 | ASSIGN_FETCH_TYPE_END |
258 | }; | 262 | }; |
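
The x8/x16/x32/x64 entries added to the kprobes fetch type table become usable as argument types in the kprobe_events definition syntax, printing the fetched value in hexadecimal rather than decimal. A hedged sketch of defining such an event (the probed symbol, register and event name are illustrative only, and the tracefs path is assumed):

    /* Define a kprobe event whose argument is fetched as a 32-bit
     * value and printed with the new x32 (hex) type. The symbol,
     * register and event name here are illustrative. */
    #include <stdio.h>

    int main(void)
    {
    	FILE *f = fopen("/sys/kernel/tracing/kprobe_events", "w");

    	if (!f) {
    		perror("kprobe_events");
    		return 1;
    	}
    	fputs("p:myprobe do_sys_open flags=%dx:x32\n", f);
    	fclose(f);
    	return 0;
    }
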
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 0bb9cf2d53e6..3fc20422c166 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c | |||
@@ -1098,6 +1098,71 @@ static struct trace_event trace_user_stack_event = { | |||
1098 | .funcs = &trace_user_stack_funcs, | 1098 | .funcs = &trace_user_stack_funcs, |
1099 | }; | 1099 | }; |
1100 | 1100 | ||
1101 | /* TRACE_HWLAT */ | ||
1102 | static enum print_line_t | ||
1103 | trace_hwlat_print(struct trace_iterator *iter, int flags, | ||
1104 | struct trace_event *event) | ||
1105 | { | ||
1106 | struct trace_entry *entry = iter->ent; | ||
1107 | struct trace_seq *s = &iter->seq; | ||
1108 | struct hwlat_entry *field; | ||
1109 | |||
1110 | trace_assign_type(field, entry); | ||
1111 | |||
1112 | trace_seq_printf(s, "#%-5u inner/outer(us): %4llu/%-5llu ts:%ld.%09ld", | ||
1113 | field->seqnum, | ||
1114 | field->duration, | ||
1115 | field->outer_duration, | ||
1116 | field->timestamp.tv_sec, | ||
1117 | field->timestamp.tv_nsec); | ||
1118 | |||
1119 | if (field->nmi_count) { | ||
1120 | /* | ||
1121 | * The generic sched_clock() is not NMI safe, thus | ||
1122 | * we only record the count and not the time. | ||
1123 | */ | ||
1124 | if (!IS_ENABLED(CONFIG_GENERIC_SCHED_CLOCK)) | ||
1125 | trace_seq_printf(s, " nmi-total:%llu", | ||
1126 | field->nmi_total_ts); | ||
1127 | trace_seq_printf(s, " nmi-count:%u", | ||
1128 | field->nmi_count); | ||
1129 | } | ||
1130 | |||
1131 | trace_seq_putc(s, '\n'); | ||
1132 | |||
1133 | return trace_handle_return(s); | ||
1134 | } | ||
1135 | |||
1136 | |||
1137 | static enum print_line_t | ||
1138 | trace_hwlat_raw(struct trace_iterator *iter, int flags, | ||
1139 | struct trace_event *event) | ||
1140 | { | ||
1141 | struct hwlat_entry *field; | ||
1142 | struct trace_seq *s = &iter->seq; | ||
1143 | |||
1144 | trace_assign_type(field, iter->ent); | ||
1145 | |||
1146 | trace_seq_printf(s, "%llu %lld %ld %09ld %u\n", | ||
1147 | field->duration, | ||
1148 | field->outer_duration, | ||
1149 | field->timestamp.tv_sec, | ||
1150 | field->timestamp.tv_nsec, | ||
1151 | field->seqnum); | ||
1152 | |||
1153 | return trace_handle_return(s); | ||
1154 | } | ||
1155 | |||
1156 | static struct trace_event_functions trace_hwlat_funcs = { | ||
1157 | .trace = trace_hwlat_print, | ||
1158 | .raw = trace_hwlat_raw, | ||
1159 | }; | ||
1160 | |||
1161 | static struct trace_event trace_hwlat_event = { | ||
1162 | .type = TRACE_HWLAT, | ||
1163 | .funcs = &trace_hwlat_funcs, | ||
1164 | }; | ||
1165 | |||
1101 | /* TRACE_BPUTS */ | 1166 | /* TRACE_BPUTS */ |
1102 | static enum print_line_t | 1167 | static enum print_line_t |
1103 | trace_bputs_print(struct trace_iterator *iter, int flags, | 1168 | trace_bputs_print(struct trace_iterator *iter, int flags, |
@@ -1233,6 +1298,7 @@ static struct trace_event *events[] __initdata = { | |||
1233 | &trace_bputs_event, | 1298 | &trace_bputs_event, |
1234 | &trace_bprint_event, | 1299 | &trace_bprint_event, |
1235 | &trace_print_event, | 1300 | &trace_print_event, |
1301 | &trace_hwlat_event, | ||
1236 | NULL | 1302 | NULL |
1237 | }; | 1303 | }; |
1238 | 1304 | ||
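
Given the format string in trace_hwlat_print(), a rendered entry in the trace file would look roughly like the following (the values are purely illustrative; nmi-total is omitted on CONFIG_GENERIC_SCHED_CLOCK kernels, where only the NMI count is recorded):

    #12    inner/outer(us):   14/23    ts:1474913179.123456789 nmi-total:4 nmi-count:1
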
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 74e80a582c28..8c0553d9afd3 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c | |||
@@ -36,24 +36,28 @@ const char *reserved_field_names[] = { | |||
36 | }; | 36 | }; |
37 | 37 | ||
38 | /* Printing in basic type function template */ | 38 | /* Printing in basic type function template */ |
39 | #define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt) \ | 39 | #define DEFINE_BASIC_PRINT_TYPE_FUNC(tname, type, fmt) \ |
40 | int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \ | 40 | int PRINT_TYPE_FUNC_NAME(tname)(struct trace_seq *s, const char *name, \ |
41 | void *data, void *ent) \ | 41 | void *data, void *ent) \ |
42 | { \ | 42 | { \ |
43 | trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \ | 43 | trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \ |
44 | return !trace_seq_has_overflowed(s); \ | 44 | return !trace_seq_has_overflowed(s); \ |
45 | } \ | 45 | } \ |
46 | const char PRINT_TYPE_FMT_NAME(type)[] = fmt; \ | 46 | const char PRINT_TYPE_FMT_NAME(tname)[] = fmt; \ |
47 | NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(type)); | 47 | NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(tname)); |
48 | 48 | ||
49 | DEFINE_BASIC_PRINT_TYPE_FUNC(u8 , "0x%x") | 49 | DEFINE_BASIC_PRINT_TYPE_FUNC(u8, u8, "%u") |
50 | DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "0x%x") | 50 | DEFINE_BASIC_PRINT_TYPE_FUNC(u16, u16, "%u") |
51 | DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "0x%x") | 51 | DEFINE_BASIC_PRINT_TYPE_FUNC(u32, u32, "%u") |
52 | DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "0x%Lx") | 52 | DEFINE_BASIC_PRINT_TYPE_FUNC(u64, u64, "%Lu") |
53 | DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d") | 53 | DEFINE_BASIC_PRINT_TYPE_FUNC(s8, s8, "%d") |
54 | DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d") | 54 | DEFINE_BASIC_PRINT_TYPE_FUNC(s16, s16, "%d") |
55 | DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%d") | 55 | DEFINE_BASIC_PRINT_TYPE_FUNC(s32, s32, "%d") |
56 | DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%Ld") | 56 | DEFINE_BASIC_PRINT_TYPE_FUNC(s64, s64, "%Ld") |
57 | DEFINE_BASIC_PRINT_TYPE_FUNC(x8, u8, "0x%x") | ||
58 | DEFINE_BASIC_PRINT_TYPE_FUNC(x16, u16, "0x%x") | ||
59 | DEFINE_BASIC_PRINT_TYPE_FUNC(x32, u32, "0x%x") | ||
60 | DEFINE_BASIC_PRINT_TYPE_FUNC(x64, u64, "0x%Lx") | ||
57 | 61 | ||
58 | /* Print type function for string type */ | 62 | /* Print type function for string type */ |
59 | int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, const char *name, | 63 | int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, const char *name, |
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 45400ca5ded1..0c0ae54d44c6 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h | |||
@@ -149,6 +149,11 @@ DECLARE_BASIC_PRINT_TYPE_FUNC(s8); | |||
149 | DECLARE_BASIC_PRINT_TYPE_FUNC(s16); | 149 | DECLARE_BASIC_PRINT_TYPE_FUNC(s16); |
150 | DECLARE_BASIC_PRINT_TYPE_FUNC(s32); | 150 | DECLARE_BASIC_PRINT_TYPE_FUNC(s32); |
151 | DECLARE_BASIC_PRINT_TYPE_FUNC(s64); | 151 | DECLARE_BASIC_PRINT_TYPE_FUNC(s64); |
152 | DECLARE_BASIC_PRINT_TYPE_FUNC(x8); | ||
153 | DECLARE_BASIC_PRINT_TYPE_FUNC(x16); | ||
154 | DECLARE_BASIC_PRINT_TYPE_FUNC(x32); | ||
155 | DECLARE_BASIC_PRINT_TYPE_FUNC(x64); | ||
156 | |||
152 | DECLARE_BASIC_PRINT_TYPE_FUNC(string); | 157 | DECLARE_BASIC_PRINT_TYPE_FUNC(string); |
153 | 158 | ||
154 | #define FETCH_FUNC_NAME(method, type) fetch_##method##_##type | 159 | #define FETCH_FUNC_NAME(method, type) fetch_##method##_##type |
@@ -203,7 +208,7 @@ DEFINE_FETCH_##method(u32) \ | |||
203 | DEFINE_FETCH_##method(u64) | 208 | DEFINE_FETCH_##method(u64) |
204 | 209 | ||
205 | /* Default (unsigned long) fetch type */ | 210 | /* Default (unsigned long) fetch type */ |
206 | #define __DEFAULT_FETCH_TYPE(t) u##t | 211 | #define __DEFAULT_FETCH_TYPE(t) x##t |
207 | #define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) | 212 | #define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) |
208 | #define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) | 213 | #define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) |
209 | #define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) | 214 | #define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) |
@@ -234,6 +239,10 @@ ASSIGN_FETCH_FUNC(file_offset, ftype), \ | |||
234 | #define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ | 239 | #define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ |
235 | __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype) | 240 | __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype) |
236 | 241 | ||
242 | /* If ptype is an alias of atype, use this macro (show atype in format) */ | ||
243 | #define ASSIGN_FETCH_TYPE_ALIAS(ptype, atype, ftype, sign) \ | ||
244 | __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #atype) | ||
245 | |||
237 | #define ASSIGN_FETCH_TYPE_END {} | 246 | #define ASSIGN_FETCH_TYPE_END {} |
238 | 247 | ||
239 | #define FETCH_TYPE_STRING 0 | 248 | #define FETCH_TYPE_STRING 0 |
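
The alias macro above registers a new type name that reuses an existing type's fetch functions and size while carrying its own print format, which is how x32 shares u32's machinery yet prints "0x%x". A self-contained userspace illustration of the idea (a mock, not the kernel's fetch_type table):

    /* Userspace mock of the type-alias idea: "x32" shares uint32_t
     * storage and size with "u32" but has its own print format. */
    #include <stdio.h>
    #include <stdint.h>

    struct print_type {
    	const char *name;	/* type name exposed to the user */
    	const char *fmt;	/* format used when printing     */
    	size_t size;		/* size of the fetched value     */
    };

    #define BASIC_TYPE(tname, type, pfmt) \
    	{ .name = #tname, .fmt = pfmt, .size = sizeof(type) }
    /* an alias keeps the backing type and swaps only the format */
    #define ALIAS_TYPE(tname, atype, pfmt) BASIC_TYPE(tname, atype, pfmt)

    static const struct print_type types[] = {
    	BASIC_TYPE(u32, uint32_t, "%u"),
    	ALIAS_TYPE(x32, uint32_t, "0x%x"),
    };

    int main(void)
    {
    	uint32_t val = 3735928559u;	/* 0xdeadbeef */
    	size_t i;

    	for (i = 0; i < sizeof(types) / sizeof(types[0]); i++) {
    		printf("%s (size %zu): ", types[i].name, types[i].size);
    		printf(types[i].fmt, val);
    		putchar('\n');
    	}
    	return 0;
    }

This also explains the DEFAULT_FETCH_TYPE switch from u##t to x##t above: default unsigned long arguments are fetched exactly as before, only their display format changes to hexadecimal.
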
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index b2b6efc083a4..5e10395da88e 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c | |||
@@ -610,8 +610,7 @@ static int perf_sysenter_enable(struct trace_event_call *call) | |||
610 | if (!sys_perf_refcount_enter) | 610 | if (!sys_perf_refcount_enter) |
611 | ret = register_trace_sys_enter(perf_syscall_enter, NULL); | 611 | ret = register_trace_sys_enter(perf_syscall_enter, NULL); |
612 | if (ret) { | 612 | if (ret) { |
613 | pr_info("event trace: Could not activate" | 613 | pr_info("event trace: Could not activate syscall entry trace point"); |
614 | "syscall entry trace point"); | ||
615 | } else { | 614 | } else { |
616 | set_bit(num, enabled_perf_enter_syscalls); | 615 | set_bit(num, enabled_perf_enter_syscalls); |
617 | sys_perf_refcount_enter++; | 616 | sys_perf_refcount_enter++; |
@@ -682,8 +681,7 @@ static int perf_sysexit_enable(struct trace_event_call *call) | |||
682 | if (!sys_perf_refcount_exit) | 681 | if (!sys_perf_refcount_exit) |
683 | ret = register_trace_sys_exit(perf_syscall_exit, NULL); | 682 | ret = register_trace_sys_exit(perf_syscall_exit, NULL); |
684 | if (ret) { | 683 | if (ret) { |
685 | pr_info("event trace: Could not activate" | 684 | pr_info("event trace: Could not activate syscall exit trace point"); |
686 | "syscall exit trace point"); | ||
687 | } else { | 685 | } else { |
688 | set_bit(num, enabled_perf_exit_syscalls); | 686 | set_bit(num, enabled_perf_exit_syscalls); |
689 | sys_perf_refcount_exit++; | 687 | sys_perf_refcount_exit++; |
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index c53485441c88..0913693caf6e 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c | |||
@@ -211,6 +211,10 @@ static const struct fetch_type uprobes_fetch_type_table[] = { | |||
211 | ASSIGN_FETCH_TYPE(s16, u16, 1), | 211 | ASSIGN_FETCH_TYPE(s16, u16, 1), |
212 | ASSIGN_FETCH_TYPE(s32, u32, 1), | 212 | ASSIGN_FETCH_TYPE(s32, u32, 1), |
213 | ASSIGN_FETCH_TYPE(s64, u64, 1), | 213 | ASSIGN_FETCH_TYPE(s64, u64, 1), |
214 | ASSIGN_FETCH_TYPE_ALIAS(x8, u8, u8, 0), | ||
215 | ASSIGN_FETCH_TYPE_ALIAS(x16, u16, u16, 0), | ||
216 | ASSIGN_FETCH_TYPE_ALIAS(x32, u32, u32, 0), | ||
217 | ASSIGN_FETCH_TYPE_ALIAS(x64, u64, u64, 0), | ||
214 | 218 | ||
215 | ASSIGN_FETCH_TYPE_END | 219 | ASSIGN_FETCH_TYPE_END |
216 | }; | 220 | }; |
@@ -427,10 +431,6 @@ static int create_trace_uprobe(int argc, char **argv) | |||
427 | pr_info("Probe point is not specified.\n"); | 431 | pr_info("Probe point is not specified.\n"); |
428 | return -EINVAL; | 432 | return -EINVAL; |
429 | } | 433 | } |
430 | if (isdigit(argv[1][0])) { | ||
431 | pr_info("probe point must be have a filename.\n"); | ||
432 | return -EINVAL; | ||
433 | } | ||
434 | arg = strchr(argv[1], ':'); | 434 | arg = strchr(argv[1], ':'); |
435 | if (!arg) { | 435 | if (!arg) { |
436 | ret = -EINVAL; | 436 | ret = -EINVAL; |
diff --git a/kernel/ucount.c b/kernel/ucount.c new file mode 100644 index 000000000000..9d20d5dd298a --- /dev/null +++ b/kernel/ucount.c | |||
@@ -0,0 +1,235 @@ | |||
1 | /* | ||
2 | * This program is free software; you can redistribute it and/or | ||
3 | * modify it under the terms of the GNU General Public License as | ||
4 | * published by the Free Software Foundation, version 2 of the | ||
5 | * License. | ||
6 | */ | ||
7 | |||
8 | #include <linux/stat.h> | ||
9 | #include <linux/sysctl.h> | ||
10 | #include <linux/slab.h> | ||
11 | #include <linux/hash.h> | ||
12 | #include <linux/user_namespace.h> | ||
13 | |||
14 | #define UCOUNTS_HASHTABLE_BITS 10 | ||
15 | static struct hlist_head ucounts_hashtable[(1 << UCOUNTS_HASHTABLE_BITS)]; | ||
16 | static DEFINE_SPINLOCK(ucounts_lock); | ||
17 | |||
18 | #define ucounts_hashfn(ns, uid) \ | ||
19 | hash_long((unsigned long)__kuid_val(uid) + (unsigned long)(ns), \ | ||
20 | UCOUNTS_HASHTABLE_BITS) | ||
21 | #define ucounts_hashentry(ns, uid) \ | ||
22 | (ucounts_hashtable + ucounts_hashfn(ns, uid)) | ||
23 | |||
24 | |||
25 | #ifdef CONFIG_SYSCTL | ||
26 | static struct ctl_table_set * | ||
27 | set_lookup(struct ctl_table_root *root) | ||
28 | { | ||
29 | return ¤t_user_ns()->set; | ||
30 | } | ||
31 | |||
32 | static int set_is_seen(struct ctl_table_set *set) | ||
33 | { | ||
34 | return ¤t_user_ns()->set == set; | ||
35 | } | ||
36 | |||
37 | static int set_permissions(struct ctl_table_header *head, | ||
38 | struct ctl_table *table) | ||
39 | { | ||
40 | struct user_namespace *user_ns = | ||
41 | container_of(head->set, struct user_namespace, set); | ||
42 | int mode; | ||
43 | |||
44 | /* Allow users with CAP_SYS_RESOURCE unrestrained access */ | ||
45 | if (ns_capable(user_ns, CAP_SYS_RESOURCE)) | ||
46 | mode = (table->mode & S_IRWXU) >> 6; | ||
47 | else | ||
48 | /* Allow all others at most read-only access */ | ||
49 | mode = table->mode & S_IROTH; | ||
50 | return (mode << 6) | (mode << 3) | mode; | ||
51 | } | ||
52 | |||
53 | static struct ctl_table_root set_root = { | ||
54 | .lookup = set_lookup, | ||
55 | .permissions = set_permissions, | ||
56 | }; | ||
57 | |||
58 | static int zero = 0; | ||
59 | static int int_max = INT_MAX; | ||
60 | #define UCOUNT_ENTRY(name) \ | ||
61 | { \ | ||
62 | .procname = name, \ | ||
63 | .maxlen = sizeof(int), \ | ||
64 | .mode = 0644, \ | ||
65 | .proc_handler = proc_dointvec_minmax, \ | ||
66 | .extra1 = &zero, \ | ||
67 | .extra2 = &int_max, \ | ||
68 | } | ||
69 | static struct ctl_table user_table[] = { | ||
70 | UCOUNT_ENTRY("max_user_namespaces"), | ||
71 | UCOUNT_ENTRY("max_pid_namespaces"), | ||
72 | UCOUNT_ENTRY("max_uts_namespaces"), | ||
73 | UCOUNT_ENTRY("max_ipc_namespaces"), | ||
74 | UCOUNT_ENTRY("max_net_namespaces"), | ||
75 | UCOUNT_ENTRY("max_mnt_namespaces"), | ||
76 | UCOUNT_ENTRY("max_cgroup_namespaces"), | ||
77 | { } | ||
78 | }; | ||
79 | #endif /* CONFIG_SYSCTL */ | ||
80 | |||
81 | bool setup_userns_sysctls(struct user_namespace *ns) | ||
82 | { | ||
83 | #ifdef CONFIG_SYSCTL | ||
84 | struct ctl_table *tbl; | ||
85 | setup_sysctl_set(&ns->set, &set_root, set_is_seen); | ||
86 | tbl = kmemdup(user_table, sizeof(user_table), GFP_KERNEL); | ||
87 | if (tbl) { | ||
88 | int i; | ||
89 | for (i = 0; i < UCOUNT_COUNTS; i++) { | ||
90 | tbl[i].data = &ns->ucount_max[i]; | ||
91 | } | ||
92 | ns->sysctls = __register_sysctl_table(&ns->set, "user", tbl); | ||
93 | } | ||
94 | if (!ns->sysctls) { | ||
95 | kfree(tbl); | ||
96 | retire_sysctl_set(&ns->set); | ||
97 | return false; | ||
98 | } | ||
99 | #endif | ||
100 | return true; | ||
101 | } | ||
102 | |||
103 | void retire_userns_sysctls(struct user_namespace *ns) | ||
104 | { | ||
105 | #ifdef CONFIG_SYSCTL | ||
106 | struct ctl_table *tbl; | ||
107 | |||
108 | tbl = ns->sysctls->ctl_table_arg; | ||
109 | unregister_sysctl_table(ns->sysctls); | ||
110 | retire_sysctl_set(&ns->set); | ||
111 | kfree(tbl); | ||
112 | #endif | ||
113 | } | ||
114 | |||
115 | static struct ucounts *find_ucounts(struct user_namespace *ns, kuid_t uid, struct hlist_head *hashent) | ||
116 | { | ||
117 | struct ucounts *ucounts; | ||
118 | |||
119 | hlist_for_each_entry(ucounts, hashent, node) { | ||
120 | if (uid_eq(ucounts->uid, uid) && (ucounts->ns == ns)) | ||
121 | return ucounts; | ||
122 | } | ||
123 | return NULL; | ||
124 | } | ||
125 | |||
126 | static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid) | ||
127 | { | ||
128 | struct hlist_head *hashent = ucounts_hashentry(ns, uid); | ||
129 | struct ucounts *ucounts, *new; | ||
130 | |||
131 | spin_lock(&ucounts_lock); | ||
132 | ucounts = find_ucounts(ns, uid, hashent); | ||
133 | if (!ucounts) { | ||
134 | spin_unlock(&ucounts_lock); | ||
135 | |||
136 | new = kzalloc(sizeof(*new), GFP_KERNEL); | ||
137 | if (!new) | ||
138 | return NULL; | ||
139 | |||
140 | new->ns = ns; | ||
141 | new->uid = uid; | ||
142 | atomic_set(&new->count, 0); | ||
143 | |||
144 | spin_lock(&ucounts_lock); | ||
145 | ucounts = find_ucounts(ns, uid, hashent); | ||
146 | if (ucounts) { | ||
147 | kfree(new); | ||
148 | } else { | ||
149 | hlist_add_head(&new->node, hashent); | ||
150 | ucounts = new; | ||
151 | } | ||
152 | } | ||
153 | if (!atomic_add_unless(&ucounts->count, 1, INT_MAX)) | ||
154 | ucounts = NULL; | ||
155 | spin_unlock(&ucounts_lock); | ||
156 | return ucounts; | ||
157 | } | ||
158 | |||
159 | static void put_ucounts(struct ucounts *ucounts) | ||
160 | { | ||
161 | if (atomic_dec_and_test(&ucounts->count)) { | ||
162 | spin_lock(&ucounts_lock); | ||
163 | hlist_del_init(&ucounts->node); | ||
164 | spin_unlock(&ucounts_lock); | ||
165 | |||
166 | kfree(ucounts); | ||
167 | } | ||
168 | } | ||
169 | |||
170 | static inline bool atomic_inc_below(atomic_t *v, int u) | ||
171 | { | ||
172 | int c, old; | ||
173 | c = atomic_read(v); | ||
174 | for (;;) { | ||
175 | if (unlikely(c >= u)) | ||
176 | return false; | ||
177 | old = atomic_cmpxchg(v, c, c+1); | ||
178 | if (likely(old == c)) | ||
179 | return true; | ||
180 | c = old; | ||
181 | } | ||
182 | } | ||
183 | |||
184 | struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, | ||
185 | enum ucount_type type) | ||
186 | { | ||
187 | struct ucounts *ucounts, *iter, *bad; | ||
188 | struct user_namespace *tns; | ||
189 | ucounts = get_ucounts(ns, uid); | ||
190 | for (iter = ucounts; iter; iter = tns->ucounts) { | ||
191 | int max; | ||
192 | tns = iter->ns; | ||
193 | max = READ_ONCE(tns->ucount_max[type]); | ||
194 | if (!atomic_inc_below(&iter->ucount[type], max)) | ||
195 | goto fail; | ||
196 | } | ||
197 | return ucounts; | ||
198 | fail: | ||
199 | bad = iter; | ||
200 | for (iter = ucounts; iter != bad; iter = iter->ns->ucounts) | ||
201 | atomic_dec(&iter->ucount[type]); | ||
202 | |||
203 | put_ucounts(ucounts); | ||
204 | return NULL; | ||
205 | } | ||
206 | |||
207 | void dec_ucount(struct ucounts *ucounts, enum ucount_type type) | ||
208 | { | ||
209 | struct ucounts *iter; | ||
210 | for (iter = ucounts; iter; iter = iter->ns->ucounts) { | ||
211 | int dec = atomic_dec_if_positive(&iter->ucount[type]); | ||
212 | WARN_ON_ONCE(dec < 0); | ||
213 | } | ||
214 | put_ucounts(ucounts); | ||
215 | } | ||
216 | |||
217 | static __init int user_namespace_sysctl_init(void) | ||
218 | { | ||
219 | #ifdef CONFIG_SYSCTL | ||
220 | static struct ctl_table_header *user_header; | ||
221 | static struct ctl_table empty[1]; | ||
222 | /* | ||
223 | * It is necessary to register the user directory in the | ||
224 | * default set so that registrations in the child sets work | ||
225 | * properly. | ||
226 | */ | ||
227 | user_header = register_sysctl("user", empty); | ||
228 | BUG_ON(!user_header); | ||
229 | BUG_ON(!setup_userns_sysctls(&init_user_ns)); | ||
230 | #endif | ||
231 | return 0; | ||
232 | } | ||
233 | subsys_initcall(user_namespace_sysctl_init); | ||
234 | |||
235 | |||
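
The heart of the charging path above is atomic_inc_below(), a cmpxchg loop that increments a counter only while it is still under the limit read from the owning namespace. A standalone C11 sketch of the same pattern (illustrative; it uses stdatomic rather than the kernel's atomic_t):

    /* Sketch of the "increment only while below a limit" pattern used
     * by inc_ucount(), written with C11 stdatomic for illustration. */
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static bool inc_below(atomic_int *v, int limit)
    {
    	int c = atomic_load(v);

    	for (;;) {
    		if (c >= limit)
    			return false;	/* over the limit: refuse */
    		/* try c -> c + 1; on failure c is reloaded for a retry */
    		if (atomic_compare_exchange_weak(v, &c, c + 1))
    			return true;
    	}
    }

    int main(void)
    {
    	atomic_int count = 0;
    	int i, granted = 0;

    	for (i = 0; i < 5; i++)
    		granted += inc_below(&count, 3);
    	printf("granted %d of 5 requests (limit 3)\n", granted);
    	return 0;
    }

inc_ucount() applies this test at every level of the user-namespace hierarchy and, on failure, unwinds the partial increments, which is what the fail: path above does before dropping the ucounts reference.
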
diff --git a/kernel/uid16.c b/kernel/uid16.c index d58cc4d8f0d1..cc40793464e3 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c | |||
@@ -117,7 +117,7 @@ static int groups16_to_user(old_gid_t __user *grouplist, | |||
117 | kgid_t kgid; | 117 | kgid_t kgid; |
118 | 118 | ||
119 | for (i = 0; i < group_info->ngroups; i++) { | 119 | for (i = 0; i < group_info->ngroups; i++) { |
120 | kgid = GROUP_AT(group_info, i); | 120 | kgid = group_info->gid[i]; |
121 | group = high2lowgid(from_kgid_munged(user_ns, kgid)); | 121 | group = high2lowgid(from_kgid_munged(user_ns, kgid)); |
122 | if (put_user(group, grouplist+i)) | 122 | if (put_user(group, grouplist+i)) |
123 | return -EFAULT; | 123 | return -EFAULT; |
@@ -142,7 +142,7 @@ static int groups16_from_user(struct group_info *group_info, | |||
142 | if (!gid_valid(kgid)) | 142 | if (!gid_valid(kgid)) |
143 | return -EINVAL; | 143 | return -EINVAL; |
144 | 144 | ||
145 | GROUP_AT(group_info, i) = kgid; | 145 | group_info->gid[i] = kgid; |
146 | } | 146 | } |
147 | 147 | ||
148 | return 0; | 148 | return 0; |
diff --git a/kernel/up.c b/kernel/up.c index 1760bf3d1463..ee81ac9af4ca 100644 --- a/kernel/up.c +++ b/kernel/up.c | |||
@@ -6,6 +6,7 @@ | |||
6 | #include <linux/kernel.h> | 6 | #include <linux/kernel.h> |
7 | #include <linux/export.h> | 7 | #include <linux/export.h> |
8 | #include <linux/smp.h> | 8 | #include <linux/smp.h> |
9 | #include <linux/hypervisor.h> | ||
9 | 10 | ||
10 | int smp_call_function_single(int cpu, void (*func) (void *info), void *info, | 11 | int smp_call_function_single(int cpu, void (*func) (void *info), void *info, |
11 | int wait) | 12 | int wait) |
@@ -82,3 +83,20 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), | |||
82 | preempt_enable(); | 83 | preempt_enable(); |
83 | } | 84 | } |
84 | EXPORT_SYMBOL(on_each_cpu_cond); | 85 | EXPORT_SYMBOL(on_each_cpu_cond); |
86 | |||
87 | int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys) | ||
88 | { | ||
89 | int ret; | ||
90 | |||
91 | if (cpu != 0) | ||
92 | return -ENXIO; | ||
93 | |||
94 | if (phys) | ||
95 | hypervisor_pin_vcpu(0); | ||
96 | ret = func(par); | ||
97 | if (phys) | ||
98 | hypervisor_pin_vcpu(-1); | ||
99 | |||
100 | return ret; | ||
101 | } | ||
102 | EXPORT_SYMBOL_GPL(smp_call_on_cpu); | ||
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 68f594212759..86b7854fec8e 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c | |||
@@ -29,6 +29,17 @@ static DEFINE_MUTEX(userns_state_mutex); | |||
29 | static bool new_idmap_permitted(const struct file *file, | 29 | static bool new_idmap_permitted(const struct file *file, |
30 | struct user_namespace *ns, int cap_setid, | 30 | struct user_namespace *ns, int cap_setid, |
31 | struct uid_gid_map *map); | 31 | struct uid_gid_map *map); |
32 | static void free_user_ns(struct work_struct *work); | ||
33 | |||
34 | static struct ucounts *inc_user_namespaces(struct user_namespace *ns, kuid_t uid) | ||
35 | { | ||
36 | return inc_ucount(ns, uid, UCOUNT_USER_NAMESPACES); | ||
37 | } | ||
38 | |||
39 | static void dec_user_namespaces(struct ucounts *ucounts) | ||
40 | { | ||
41 | return dec_ucount(ucounts, UCOUNT_USER_NAMESPACES); | ||
42 | } | ||
32 | 43 | ||
33 | static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) | 44 | static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) |
34 | { | 45 | { |
@@ -62,10 +73,16 @@ int create_user_ns(struct cred *new) | |||
62 | struct user_namespace *ns, *parent_ns = new->user_ns; | 73 | struct user_namespace *ns, *parent_ns = new->user_ns; |
63 | kuid_t owner = new->euid; | 74 | kuid_t owner = new->euid; |
64 | kgid_t group = new->egid; | 75 | kgid_t group = new->egid; |
65 | int ret; | 76 | struct ucounts *ucounts; |
77 | int ret, i; | ||
66 | 78 | ||
79 | ret = -ENOSPC; | ||
67 | if (parent_ns->level > 32) | 80 | if (parent_ns->level > 32) |
68 | return -EUSERS; | 81 | goto fail; |
82 | |||
83 | ucounts = inc_user_namespaces(parent_ns, owner); | ||
84 | if (!ucounts) | ||
85 | goto fail; | ||
69 | 86 | ||
70 | /* | 87 | /* |
71 | * Verify that we can not violate the policy of which files | 88 | * Verify that we can not violate the policy of which files |
@@ -73,26 +90,27 @@ int create_user_ns(struct cred *new) | |||
73 | * by verifying that the root directory is at the root of the | 90 | * by verifying that the root directory is at the root of the |
74 | * mount namespace which allows all files to be accessed. | 91 | * mount namespace which allows all files to be accessed. |
75 | */ | 92 | */ |
93 | ret = -EPERM; | ||
76 | if (current_chrooted()) | 94 | if (current_chrooted()) |
77 | return -EPERM; | 95 | goto fail_dec; |
78 | 96 | ||
79 | /* The creator needs a mapping in the parent user namespace | 97 | /* The creator needs a mapping in the parent user namespace |
80 | * or else we won't be able to reasonably tell userspace who | 98 | * or else we won't be able to reasonably tell userspace who |
81 | * created a user_namespace. | 99 | * created a user_namespace. |
82 | */ | 100 | */ |
101 | ret = -EPERM; | ||
83 | if (!kuid_has_mapping(parent_ns, owner) || | 102 | if (!kuid_has_mapping(parent_ns, owner) || |
84 | !kgid_has_mapping(parent_ns, group)) | 103 | !kgid_has_mapping(parent_ns, group)) |
85 | return -EPERM; | 104 | goto fail_dec; |
86 | 105 | ||
106 | ret = -ENOMEM; | ||
87 | ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL); | 107 | ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL); |
88 | if (!ns) | 108 | if (!ns) |
89 | return -ENOMEM; | 109 | goto fail_dec; |
90 | 110 | ||
91 | ret = ns_alloc_inum(&ns->ns); | 111 | ret = ns_alloc_inum(&ns->ns); |
92 | if (ret) { | 112 | if (ret) |
93 | kmem_cache_free(user_ns_cachep, ns); | 113 | goto fail_free; |
94 | return ret; | ||
95 | } | ||
96 | ns->ns.ops = &userns_operations; | 114 | ns->ns.ops = &userns_operations; |
97 | 115 | ||
98 | atomic_set(&ns->count, 1); | 116 | atomic_set(&ns->count, 1); |
@@ -101,18 +119,37 @@ int create_user_ns(struct cred *new) | |||
101 | ns->level = parent_ns->level + 1; | 119 | ns->level = parent_ns->level + 1; |
102 | ns->owner = owner; | 120 | ns->owner = owner; |
103 | ns->group = group; | 121 | ns->group = group; |
122 | INIT_WORK(&ns->work, free_user_ns); | ||
123 | for (i = 0; i < UCOUNT_COUNTS; i++) { | ||
124 | ns->ucount_max[i] = INT_MAX; | ||
125 | } | ||
126 | ns->ucounts = ucounts; | ||
104 | 127 | ||
105 | /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ | 128 | /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ |
106 | mutex_lock(&userns_state_mutex); | 129 | mutex_lock(&userns_state_mutex); |
107 | ns->flags = parent_ns->flags; | 130 | ns->flags = parent_ns->flags; |
108 | mutex_unlock(&userns_state_mutex); | 131 | mutex_unlock(&userns_state_mutex); |
109 | 132 | ||
110 | set_cred_user_ns(new, ns); | ||
111 | |||
112 | #ifdef CONFIG_PERSISTENT_KEYRINGS | 133 | #ifdef CONFIG_PERSISTENT_KEYRINGS |
113 | init_rwsem(&ns->persistent_keyring_register_sem); | 134 | init_rwsem(&ns->persistent_keyring_register_sem); |
114 | #endif | 135 | #endif |
136 | ret = -ENOMEM; | ||
137 | if (!setup_userns_sysctls(ns)) | ||
138 | goto fail_keyring; | ||
139 | |||
140 | set_cred_user_ns(new, ns); | ||
115 | return 0; | 141 | return 0; |
142 | fail_keyring: | ||
143 | #ifdef CONFIG_PERSISTENT_KEYRINGS | ||
144 | key_put(ns->persistent_keyring_register); | ||
145 | #endif | ||
146 | ns_free_inum(&ns->ns); | ||
147 | fail_free: | ||
148 | kmem_cache_free(user_ns_cachep, ns); | ||
149 | fail_dec: | ||
150 | dec_user_namespaces(ucounts); | ||
151 | fail: | ||
152 | return ret; | ||
116 | } | 153 | } |
117 | 154 | ||
118 | int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) | 155 | int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) |
@@ -135,21 +172,30 @@ int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) | |||
135 | return err; | 172 | return err; |
136 | } | 173 | } |
137 | 174 | ||
138 | void free_user_ns(struct user_namespace *ns) | 175 | static void free_user_ns(struct work_struct *work) |
139 | { | 176 | { |
140 | struct user_namespace *parent; | 177 | struct user_namespace *parent, *ns = |
178 | container_of(work, struct user_namespace, work); | ||
141 | 179 | ||
142 | do { | 180 | do { |
181 | struct ucounts *ucounts = ns->ucounts; | ||
143 | parent = ns->parent; | 182 | parent = ns->parent; |
183 | retire_userns_sysctls(ns); | ||
144 | #ifdef CONFIG_PERSISTENT_KEYRINGS | 184 | #ifdef CONFIG_PERSISTENT_KEYRINGS |
145 | key_put(ns->persistent_keyring_register); | 185 | key_put(ns->persistent_keyring_register); |
146 | #endif | 186 | #endif |
147 | ns_free_inum(&ns->ns); | 187 | ns_free_inum(&ns->ns); |
148 | kmem_cache_free(user_ns_cachep, ns); | 188 | kmem_cache_free(user_ns_cachep, ns); |
189 | dec_user_namespaces(ucounts); | ||
149 | ns = parent; | 190 | ns = parent; |
150 | } while (atomic_dec_and_test(&parent->count)); | 191 | } while (atomic_dec_and_test(&parent->count)); |
151 | } | 192 | } |
152 | EXPORT_SYMBOL(free_user_ns); | 193 | |
194 | void __put_user_ns(struct user_namespace *ns) | ||
195 | { | ||
196 | schedule_work(&ns->work); | ||
197 | } | ||
198 | EXPORT_SYMBOL(__put_user_ns); | ||
153 | 199 | ||
154 | static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) | 200 | static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) |
155 | { | 201 | { |
@@ -1004,12 +1050,37 @@ static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns) | |||
1004 | return commit_creds(cred); | 1050 | return commit_creds(cred); |
1005 | } | 1051 | } |
1006 | 1052 | ||
1053 | struct ns_common *ns_get_owner(struct ns_common *ns) | ||
1054 | { | ||
1055 | struct user_namespace *my_user_ns = current_user_ns(); | ||
1056 | struct user_namespace *owner, *p; | ||
1057 | |||
1058 | /* See if the owner is in the current user namespace */ | ||
1059 | owner = p = ns->ops->owner(ns); | ||
1060 | for (;;) { | ||
1061 | if (!p) | ||
1062 | return ERR_PTR(-EPERM); | ||
1063 | if (p == my_user_ns) | ||
1064 | break; | ||
1065 | p = p->parent; | ||
1066 | } | ||
1067 | |||
1068 | return &get_user_ns(owner)->ns; | ||
1069 | } | ||
1070 | |||
1071 | static struct user_namespace *userns_owner(struct ns_common *ns) | ||
1072 | { | ||
1073 | return to_user_ns(ns)->parent; | ||
1074 | } | ||
1075 | |||
1007 | const struct proc_ns_operations userns_operations = { | 1076 | const struct proc_ns_operations userns_operations = { |
1008 | .name = "user", | 1077 | .name = "user", |
1009 | .type = CLONE_NEWUSER, | 1078 | .type = CLONE_NEWUSER, |
1010 | .get = userns_get, | 1079 | .get = userns_get, |
1011 | .put = userns_put, | 1080 | .put = userns_put, |
1012 | .install = userns_install, | 1081 | .install = userns_install, |
1082 | .owner = userns_owner, | ||
1083 | .get_parent = ns_get_owner, | ||
1013 | }; | 1084 | }; |
1014 | 1085 | ||
1015 | static __init int user_namespaces_init(void) | 1086 | static __init int user_namespaces_init(void) |
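
ns_get_owner() and the new .owner/.get_parent operations are what the nsfs introspection ioctls call into. A hedged userspace sketch, assuming the companion nsfs ioctl plumbing from the same series (NS_GET_USERNS in <linux/nsfs.h>) is present:

    /* Ask the kernel for the user namespace that owns the caller's
     * UTS namespace. Assumes <linux/nsfs.h> provides NS_GET_USERNS,
     * which is added alongside these ops in the same series. */
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <linux/nsfs.h>

    int main(void)
    {
    	int nsfd, ownerfd;

    	nsfd = open("/proc/self/ns/uts", O_RDONLY);
    	if (nsfd < 0) {
    		perror("open");
    		return 1;
    	}

    	/* returns an fd for the owning user namespace, or fails with
    	 * EPERM when the owner lies outside the caller's user ns */
    	ownerfd = ioctl(nsfd, NS_GET_USERNS);
    	if (ownerfd < 0)
    		perror("NS_GET_USERNS");
    	else
    		printf("owning user namespace fd: %d\n", ownerfd);

    	close(nsfd);
    	return 0;
    }
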
diff --git a/kernel/utsname.c b/kernel/utsname.c index 831ea7108232..6976cd47dcf6 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c | |||
@@ -17,6 +17,16 @@ | |||
17 | #include <linux/user_namespace.h> | 17 | #include <linux/user_namespace.h> |
18 | #include <linux/proc_ns.h> | 18 | #include <linux/proc_ns.h> |
19 | 19 | ||
20 | static struct ucounts *inc_uts_namespaces(struct user_namespace *ns) | ||
21 | { | ||
22 | return inc_ucount(ns, current_euid(), UCOUNT_UTS_NAMESPACES); | ||
23 | } | ||
24 | |||
25 | static void dec_uts_namespaces(struct ucounts *ucounts) | ||
26 | { | ||
27 | dec_ucount(ucounts, UCOUNT_UTS_NAMESPACES); | ||
28 | } | ||
29 | |||
20 | static struct uts_namespace *create_uts_ns(void) | 30 | static struct uts_namespace *create_uts_ns(void) |
21 | { | 31 | { |
22 | struct uts_namespace *uts_ns; | 32 | struct uts_namespace *uts_ns; |
@@ -36,18 +46,24 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, | |||
36 | struct uts_namespace *old_ns) | 46 | struct uts_namespace *old_ns) |
37 | { | 47 | { |
38 | struct uts_namespace *ns; | 48 | struct uts_namespace *ns; |
49 | struct ucounts *ucounts; | ||
39 | int err; | 50 | int err; |
40 | 51 | ||
52 | err = -ENOSPC; | ||
53 | ucounts = inc_uts_namespaces(user_ns); | ||
54 | if (!ucounts) | ||
55 | goto fail; | ||
56 | |||
57 | err = -ENOMEM; | ||
41 | ns = create_uts_ns(); | 58 | ns = create_uts_ns(); |
42 | if (!ns) | 59 | if (!ns) |
43 | return ERR_PTR(-ENOMEM); | 60 | goto fail_dec; |
44 | 61 | ||
45 | err = ns_alloc_inum(&ns->ns); | 62 | err = ns_alloc_inum(&ns->ns); |
46 | if (err) { | 63 | if (err) |
47 | kfree(ns); | 64 | goto fail_free; |
48 | return ERR_PTR(err); | ||
49 | } | ||
50 | 65 | ||
66 | ns->ucounts = ucounts; | ||
51 | ns->ns.ops = &utsns_operations; | 67 | ns->ns.ops = &utsns_operations; |
52 | 68 | ||
53 | down_read(&uts_sem); | 69 | down_read(&uts_sem); |
@@ -55,6 +71,13 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, | |||
55 | ns->user_ns = get_user_ns(user_ns); | 71 | ns->user_ns = get_user_ns(user_ns); |
56 | up_read(&uts_sem); | 72 | up_read(&uts_sem); |
57 | return ns; | 73 | return ns; |
74 | |||
75 | fail_free: | ||
76 | kfree(ns); | ||
77 | fail_dec: | ||
78 | dec_uts_namespaces(ucounts); | ||
79 | fail: | ||
80 | return ERR_PTR(err); | ||
58 | } | 81 | } |
59 | 82 | ||
60 | /* | 83 | /* |
@@ -85,6 +108,7 @@ void free_uts_ns(struct kref *kref) | |||
85 | struct uts_namespace *ns; | 108 | struct uts_namespace *ns; |
86 | 109 | ||
87 | ns = container_of(kref, struct uts_namespace, kref); | 110 | ns = container_of(kref, struct uts_namespace, kref); |
111 | dec_uts_namespaces(ns->ucounts); | ||
88 | put_user_ns(ns->user_ns); | 112 | put_user_ns(ns->user_ns); |
89 | ns_free_inum(&ns->ns); | 113 | ns_free_inum(&ns->ns); |
90 | kfree(ns); | 114 | kfree(ns); |
@@ -130,10 +154,16 @@ static int utsns_install(struct nsproxy *nsproxy, struct ns_common *new) | |||
130 | return 0; | 154 | return 0; |
131 | } | 155 | } |
132 | 156 | ||
157 | static struct user_namespace *utsns_owner(struct ns_common *ns) | ||
158 | { | ||
159 | return to_uts_ns(ns)->user_ns; | ||
160 | } | ||
161 | |||
133 | const struct proc_ns_operations utsns_operations = { | 162 | const struct proc_ns_operations utsns_operations = { |
134 | .name = "uts", | 163 | .name = "uts", |
135 | .type = CLONE_NEWUTS, | 164 | .type = CLONE_NEWUTS, |
136 | .get = utsns_get, | 165 | .get = utsns_get, |
137 | .put = utsns_put, | 166 | .put = utsns_put, |
138 | .install = utsns_install, | 167 | .install = utsns_install, |
168 | .owner = utsns_owner, | ||
139 | }; | 169 | }; |
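
The UCOUNT_UTS_NAMESPACES charge added above is what makes the new /proc/sys/user/max_uts_namespaces knob effective: once the creating user's count reaches the limit, clone_uts_ns() bails out with -ENOSPC and the error surfaces at the unshare() call. A hedged illustration (run with privilege after lowering the sysctl, e.g. writing 0 to /proc/sys/user/max_uts_namespaces):

    /* Observe the per-user UTS namespace limit from userspace. */
    #define _GNU_SOURCE
    #include <errno.h>
    #include <sched.h>
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
    	if (unshare(CLONE_NEWUTS) < 0)
    		printf("unshare(CLONE_NEWUTS): %s\n", strerror(errno));
    	else
    		printf("new UTS namespace created\n");
    	return 0;
    }
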
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ef071ca73fc3..479d840db286 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -2974,6 +2974,31 @@ bool flush_delayed_work(struct delayed_work *dwork) | |||
2974 | } | 2974 | } |
2975 | EXPORT_SYMBOL(flush_delayed_work); | 2975 | EXPORT_SYMBOL(flush_delayed_work); |
2976 | 2976 | ||
2977 | static bool __cancel_work(struct work_struct *work, bool is_dwork) | ||
2978 | { | ||
2979 | unsigned long flags; | ||
2980 | int ret; | ||
2981 | |||
2982 | do { | ||
2983 | ret = try_to_grab_pending(work, is_dwork, &flags); | ||
2984 | } while (unlikely(ret == -EAGAIN)); | ||
2985 | |||
2986 | if (unlikely(ret < 0)) | ||
2987 | return false; | ||
2988 | |||
2989 | set_work_pool_and_clear_pending(work, get_work_pool_id(work)); | ||
2990 | local_irq_restore(flags); | ||
2991 | return ret; | ||
2992 | } | ||
2993 | |||
2994 | /* | ||
2995 | * See cancel_delayed_work() | ||
2996 | */ | ||
2997 | bool cancel_work(struct work_struct *work) | ||
2998 | { | ||
2999 | return __cancel_work(work, false); | ||
3000 | } | ||
3001 | |||
2977 | /** | 3002 | /** |
2978 | * cancel_delayed_work - cancel a delayed work | 3003 | * cancel_delayed_work - cancel a delayed work |
2979 | * @dwork: delayed_work to cancel | 3004 | * @dwork: delayed_work to cancel |
@@ -2992,20 +3017,7 @@ EXPORT_SYMBOL(flush_delayed_work); | |||
2992 | */ | 3017 | */ |
2993 | bool cancel_delayed_work(struct delayed_work *dwork) | 3018 | bool cancel_delayed_work(struct delayed_work *dwork) |
2994 | { | 3019 | { |
2995 | unsigned long flags; | 3020 | return __cancel_work(&dwork->work, true); |
2996 | int ret; | ||
2997 | |||
2998 | do { | ||
2999 | ret = try_to_grab_pending(&dwork->work, true, &flags); | ||
3000 | } while (unlikely(ret == -EAGAIN)); | ||
3001 | |||
3002 | if (unlikely(ret < 0)) | ||
3003 | return false; | ||
3004 | |||
3005 | set_work_pool_and_clear_pending(&dwork->work, | ||
3006 | get_work_pool_id(&dwork->work)); | ||
3007 | local_irq_restore(flags); | ||
3008 | return ret; | ||
3009 | } | 3021 | } |
3010 | EXPORT_SYMBOL(cancel_delayed_work); | 3022 | EXPORT_SYMBOL(cancel_delayed_work); |
3011 | 3023 | ||
@@ -4249,7 +4261,7 @@ void print_worker_info(const char *log_lvl, struct task_struct *task) | |||
4249 | * This function is called without any synchronization and @task | 4261 | * This function is called without any synchronization and @task |
4250 | * could be in any state. Be careful with dereferences. | 4262 | * could be in any state. Be careful with dereferences. |
4251 | */ | 4263 | */ |
4252 | worker = probe_kthread_data(task); | 4264 | worker = kthread_probe_data(task); |
4253 | 4265 | ||
4254 | /* | 4266 | /* |
4255 | * Carefully copy the associated workqueue's workfn and name. Keep | 4267 | * Carefully copy the associated workqueue's workfn and name. Keep |