author     Mark Brown <broonie@kernel.org>    2015-10-12 13:09:27 -0400
committer  Mark Brown <broonie@kernel.org>    2015-10-12 13:09:27 -0400
commit     79828b4fa835f73cdaf4bffa48696abdcbea9d02
tree       5e0fa7156acb75ba603022bc807df8f2fedb97a8 /kernel
parent     721b51fcf91898299d96f4b72cb9434cda29dce6
parent     8c1a9d6323abf0fb1e5dad96cf3f1c783505ea5a
Merge remote-tracking branch 'asoc/fix/rt5645' into asoc-fix-rt5645
Diffstat (limited to 'kernel')
131 files changed, 7153 insertions, 6014 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 43c4c920f30a..53abf008ecb3 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -45,16 +45,18 @@ ifneq ($(CONFIG_SMP),y) | |||
45 | obj-y += up.o | 45 | obj-y += up.o |
46 | endif | 46 | endif |
47 | obj-$(CONFIG_UID16) += uid16.o | 47 | obj-$(CONFIG_UID16) += uid16.o |
48 | obj-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += system_keyring.o system_certificates.o | ||
49 | obj-$(CONFIG_MODULES) += module.o | 48 | obj-$(CONFIG_MODULES) += module.o |
50 | obj-$(CONFIG_MODULE_SIG) += module_signing.o | 49 | obj-$(CONFIG_MODULE_SIG) += module_signing.o |
51 | obj-$(CONFIG_KALLSYMS) += kallsyms.o | 50 | obj-$(CONFIG_KALLSYMS) += kallsyms.o |
52 | obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o | 51 | obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o |
52 | obj-$(CONFIG_KEXEC_CORE) += kexec_core.o | ||
53 | obj-$(CONFIG_KEXEC) += kexec.o | 53 | obj-$(CONFIG_KEXEC) += kexec.o |
54 | obj-$(CONFIG_KEXEC_FILE) += kexec_file.o | ||
54 | obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o | 55 | obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o |
55 | obj-$(CONFIG_COMPAT) += compat.o | 56 | obj-$(CONFIG_COMPAT) += compat.o |
56 | obj-$(CONFIG_CGROUPS) += cgroup.o | 57 | obj-$(CONFIG_CGROUPS) += cgroup.o |
57 | obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o | 58 | obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o |
59 | obj-$(CONFIG_CGROUP_PIDS) += cgroup_pids.o | ||
58 | obj-$(CONFIG_CPUSETS) += cpuset.o | 60 | obj-$(CONFIG_CPUSETS) += cpuset.o |
59 | obj-$(CONFIG_UTS_NS) += utsname.o | 61 | obj-$(CONFIG_UTS_NS) += utsname.o |
60 | obj-$(CONFIG_USER_NS) += user_namespace.o | 62 | obj-$(CONFIG_USER_NS) += user_namespace.o |
@@ -64,7 +66,7 @@ obj-$(CONFIG_SMP) += stop_machine.o | |||
64 | obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o | 66 | obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o |
65 | obj-$(CONFIG_AUDIT) += audit.o auditfilter.o | 67 | obj-$(CONFIG_AUDIT) += audit.o auditfilter.o |
66 | obj-$(CONFIG_AUDITSYSCALL) += auditsc.o | 68 | obj-$(CONFIG_AUDITSYSCALL) += auditsc.o |
67 | obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o | 69 | obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o audit_fsnotify.o |
68 | obj-$(CONFIG_AUDIT_TREE) += audit_tree.o | 70 | obj-$(CONFIG_AUDIT_TREE) += audit_tree.o |
69 | obj-$(CONFIG_GCOV_KERNEL) += gcov/ | 71 | obj-$(CONFIG_GCOV_KERNEL) += gcov/ |
70 | obj-$(CONFIG_KPROBES) += kprobes.o | 72 | obj-$(CONFIG_KPROBES) += kprobes.o |
@@ -98,6 +100,9 @@ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o | |||
98 | obj-$(CONFIG_JUMP_LABEL) += jump_label.o | 100 | obj-$(CONFIG_JUMP_LABEL) += jump_label.o |
99 | obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o | 101 | obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o |
100 | obj-$(CONFIG_TORTURE_TEST) += torture.o | 102 | obj-$(CONFIG_TORTURE_TEST) += torture.o |
103 | obj-$(CONFIG_MEMBARRIER) += membarrier.o | ||
104 | |||
105 | obj-$(CONFIG_HAS_IOMEM) += memremap.o | ||
101 | 106 | ||
102 | $(obj)/configs.o: $(obj)/config_data.h | 107 | $(obj)/configs.o: $(obj)/config_data.h |
103 | 108 | ||
@@ -111,99 +116,3 @@ $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE | |||
111 | targets += config_data.h | 116 | targets += config_data.h |
112 | $(obj)/config_data.h: $(obj)/config_data.gz FORCE | 117 | $(obj)/config_data.h: $(obj)/config_data.gz FORCE |
113 | $(call filechk,ikconfiggz) | 118 | $(call filechk,ikconfiggz) |
114 | |||
115 | ############################################################################### | ||
116 | # | ||
117 | # Roll all the X.509 certificates that we can find together and pull them into | ||
118 | # the kernel so that they get loaded into the system trusted keyring during | ||
119 | # boot. | ||
120 | # | ||
121 | # We look in the source root and the build root for all files whose name ends | ||
122 | # in ".x509". Unfortunately, this will generate duplicate filenames, so we | ||
123 | # have make canonicalise the pathnames and then sort them to discard the | ||
124 | # duplicates. | ||
125 | # | ||
126 | ############################################################################### | ||
127 | ifeq ($(CONFIG_SYSTEM_TRUSTED_KEYRING),y) | ||
128 | X509_CERTIFICATES-y := $(wildcard *.x509) $(wildcard $(srctree)/*.x509) | ||
129 | X509_CERTIFICATES-$(CONFIG_MODULE_SIG) += $(objtree)/signing_key.x509 | ||
130 | X509_CERTIFICATES-raw := $(sort $(foreach CERT,$(X509_CERTIFICATES-y), \ | ||
131 | $(or $(realpath $(CERT)),$(CERT)))) | ||
132 | X509_CERTIFICATES := $(subst $(realpath $(objtree))/,,$(X509_CERTIFICATES-raw)) | ||
133 | |||
134 | ifeq ($(X509_CERTIFICATES),) | ||
135 | $(warning *** No X.509 certificates found ***) | ||
136 | endif | ||
137 | |||
138 | ifneq ($(wildcard $(obj)/.x509.list),) | ||
139 | ifneq ($(shell cat $(obj)/.x509.list),$(X509_CERTIFICATES)) | ||
140 | $(warning X.509 certificate list changed to "$(X509_CERTIFICATES)" from "$(shell cat $(obj)/.x509.list)") | ||
141 | $(shell rm $(obj)/.x509.list) | ||
142 | endif | ||
143 | endif | ||
144 | |||
145 | kernel/system_certificates.o: $(obj)/x509_certificate_list | ||
146 | |||
147 | quiet_cmd_x509certs = CERTS $@ | ||
148 | cmd_x509certs = cat $(X509_CERTIFICATES) /dev/null >$@ $(foreach X509,$(X509_CERTIFICATES),; $(kecho) " - Including cert $(X509)") | ||
149 | |||
150 | targets += $(obj)/x509_certificate_list | ||
151 | $(obj)/x509_certificate_list: $(X509_CERTIFICATES) $(obj)/.x509.list | ||
152 | $(call if_changed,x509certs) | ||
153 | |||
154 | targets += $(obj)/.x509.list | ||
155 | $(obj)/.x509.list: | ||
156 | @echo $(X509_CERTIFICATES) >$@ | ||
157 | endif | ||
158 | |||
159 | clean-files := x509_certificate_list .x509.list | ||
160 | |||
161 | ifeq ($(CONFIG_MODULE_SIG),y) | ||
162 | ############################################################################### | ||
163 | # | ||
164 | # If module signing is requested, say by allyesconfig, but a key has not been | ||
165 | # supplied, then one will need to be generated to make sure the build does not | ||
166 | # fail and that the kernel may be used afterwards. | ||
167 | # | ||
168 | ############################################################################### | ||
169 | ifndef CONFIG_MODULE_SIG_HASH | ||
170 | $(error Could not determine digest type to use from kernel config) | ||
171 | endif | ||
172 | |||
173 | signing_key.priv signing_key.x509: x509.genkey | ||
174 | @echo "###" | ||
175 | @echo "### Now generating an X.509 key pair to be used for signing modules." | ||
176 | @echo "###" | ||
177 | @echo "### If this takes a long time, you might wish to run rngd in the" | ||
178 | @echo "### background to keep the supply of entropy topped up. It" | ||
179 | @echo "### needs to be run as root, and uses a hardware random" | ||
180 | @echo "### number generator if one is available." | ||
181 | @echo "###" | ||
182 | openssl req -new -nodes -utf8 -$(CONFIG_MODULE_SIG_HASH) -days 36500 \ | ||
183 | -batch -x509 -config x509.genkey \ | ||
184 | -outform DER -out signing_key.x509 \ | ||
185 | -keyout signing_key.priv 2>&1 | ||
186 | @echo "###" | ||
187 | @echo "### Key pair generated." | ||
188 | @echo "###" | ||
189 | |||
190 | x509.genkey: | ||
191 | @echo Generating X.509 key generation config | ||
192 | @echo >x509.genkey "[ req ]" | ||
193 | @echo >>x509.genkey "default_bits = 4096" | ||
194 | @echo >>x509.genkey "distinguished_name = req_distinguished_name" | ||
195 | @echo >>x509.genkey "prompt = no" | ||
196 | @echo >>x509.genkey "string_mask = utf8only" | ||
197 | @echo >>x509.genkey "x509_extensions = myexts" | ||
198 | @echo >>x509.genkey | ||
199 | @echo >>x509.genkey "[ req_distinguished_name ]" | ||
200 | @echo >>x509.genkey "#O = Unspecified company" | ||
201 | @echo >>x509.genkey "CN = Build time autogenerated kernel key" | ||
202 | @echo >>x509.genkey "#emailAddress = unspecified.user@unspecified.company" | ||
203 | @echo >>x509.genkey | ||
204 | @echo >>x509.genkey "[ myexts ]" | ||
205 | @echo >>x509.genkey "basicConstraints=critical,CA:FALSE" | ||
206 | @echo >>x509.genkey "keyUsage=digitalSignature" | ||
207 | @echo >>x509.genkey "subjectKeyIdentifier=hash" | ||
208 | @echo >>x509.genkey "authorityKeyIdentifier=keyid" | ||
209 | endif | ||
diff --git a/kernel/audit.c b/kernel/audit.c
index f9e6065346db..662c007635fb 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1761,7 +1761,7 @@ void audit_log_name(struct audit_context *context, struct audit_names *n, | |||
1761 | } else | 1761 | } else |
1762 | audit_log_format(ab, " name=(null)"); | 1762 | audit_log_format(ab, " name=(null)"); |
1763 | 1763 | ||
1764 | if (n->ino != (unsigned long)-1) | 1764 | if (n->ino != AUDIT_INO_UNSET) |
1765 | audit_log_format(ab, " inode=%lu" | 1765 | audit_log_format(ab, " inode=%lu" |
1766 | " dev=%02x:%02x mode=%#ho" | 1766 | " dev=%02x:%02x mode=%#ho" |
1767 | " ouid=%u ogid=%u rdev=%02x:%02x", | 1767 | " ouid=%u ogid=%u rdev=%02x:%02x", |
diff --git a/kernel/audit.h b/kernel/audit.h
index d641f9bb3ed0..dadf86a0e59e 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -50,6 +50,7 @@ enum audit_state { | |||
50 | 50 | ||
51 | /* Rule lists */ | 51 | /* Rule lists */ |
52 | struct audit_watch; | 52 | struct audit_watch; |
53 | struct audit_fsnotify_mark; | ||
53 | struct audit_tree; | 54 | struct audit_tree; |
54 | struct audit_chunk; | 55 | struct audit_chunk; |
55 | 56 | ||
@@ -252,6 +253,7 @@ struct audit_net { | |||
252 | extern int selinux_audit_rule_update(void); | 253 | extern int selinux_audit_rule_update(void); |
253 | 254 | ||
254 | extern struct mutex audit_filter_mutex; | 255 | extern struct mutex audit_filter_mutex; |
256 | extern int audit_del_rule(struct audit_entry *); | ||
255 | extern void audit_free_rule_rcu(struct rcu_head *); | 257 | extern void audit_free_rule_rcu(struct rcu_head *); |
256 | extern struct list_head audit_filter_list[]; | 258 | extern struct list_head audit_filter_list[]; |
257 | 259 | ||
@@ -269,6 +271,15 @@ extern int audit_add_watch(struct audit_krule *krule, struct list_head **list); | |||
269 | extern void audit_remove_watch_rule(struct audit_krule *krule); | 271 | extern void audit_remove_watch_rule(struct audit_krule *krule); |
270 | extern char *audit_watch_path(struct audit_watch *watch); | 272 | extern char *audit_watch_path(struct audit_watch *watch); |
271 | extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev); | 273 | extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev); |
274 | |||
275 | extern struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pathname, int len); | ||
276 | extern char *audit_mark_path(struct audit_fsnotify_mark *mark); | ||
277 | extern void audit_remove_mark(struct audit_fsnotify_mark *audit_mark); | ||
278 | extern void audit_remove_mark_rule(struct audit_krule *krule); | ||
279 | extern int audit_mark_compare(struct audit_fsnotify_mark *mark, unsigned long ino, dev_t dev); | ||
280 | extern int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old); | ||
281 | extern int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark); | ||
282 | |||
272 | #else | 283 | #else |
273 | #define audit_put_watch(w) {} | 284 | #define audit_put_watch(w) {} |
274 | #define audit_get_watch(w) {} | 285 | #define audit_get_watch(w) {} |
@@ -278,6 +289,13 @@ extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev | |||
278 | #define audit_watch_path(w) "" | 289 | #define audit_watch_path(w) "" |
279 | #define audit_watch_compare(w, i, d) 0 | 290 | #define audit_watch_compare(w, i, d) 0 |
280 | 291 | ||
292 | #define audit_alloc_mark(k, p, l) (ERR_PTR(-EINVAL)) | ||
293 | #define audit_mark_path(m) "" | ||
294 | #define audit_remove_mark(m) | ||
295 | #define audit_remove_mark_rule(k) | ||
296 | #define audit_mark_compare(m, i, d) 0 | ||
297 | #define audit_exe_compare(t, m) (-EINVAL) | ||
298 | #define audit_dupe_exe(n, o) (-EINVAL) | ||
281 | #endif /* CONFIG_AUDIT_WATCH */ | 299 | #endif /* CONFIG_AUDIT_WATCH */ |
282 | 300 | ||
283 | #ifdef CONFIG_AUDIT_TREE | 301 | #ifdef CONFIG_AUDIT_TREE |
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
new file mode 100644
index 000000000000..27c6046c2c3d
--- /dev/null
+++ b/kernel/audit_fsnotify.c
@@ -0,0 +1,216 @@ | |||
1 | /* audit_fsnotify.c -- tracking inodes | ||
2 | * | ||
3 | * Copyright 2003-2009,2014-2015 Red Hat, Inc. | ||
4 | * Copyright 2005 Hewlett-Packard Development Company, L.P. | ||
5 | * Copyright 2005 IBM Corporation | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License as published by | ||
9 | * the Free Software Foundation; either version 2 of the License, or | ||
10 | * (at your option) any later version. | ||
11 | * | ||
12 | * This program is distributed in the hope that it will be useful, | ||
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
15 | * GNU General Public License for more details. | ||
16 | */ | ||
17 | |||
18 | #include <linux/kernel.h> | ||
19 | #include <linux/audit.h> | ||
20 | #include <linux/kthread.h> | ||
21 | #include <linux/mutex.h> | ||
22 | #include <linux/fs.h> | ||
23 | #include <linux/fsnotify_backend.h> | ||
24 | #include <linux/namei.h> | ||
25 | #include <linux/netlink.h> | ||
26 | #include <linux/sched.h> | ||
27 | #include <linux/slab.h> | ||
28 | #include <linux/security.h> | ||
29 | #include "audit.h" | ||
30 | |||
31 | /* | ||
32 | * this mark lives on the parent directory of the inode in question. | ||
33 | * but dev, ino, and path are about the child | ||
34 | */ | ||
35 | struct audit_fsnotify_mark { | ||
36 | dev_t dev; /* associated superblock device */ | ||
37 | unsigned long ino; /* associated inode number */ | ||
38 | char *path; /* insertion path */ | ||
39 | struct fsnotify_mark mark; /* fsnotify mark on the inode */ | ||
40 | struct audit_krule *rule; | ||
41 | }; | ||
42 | |||
43 | /* fsnotify handle. */ | ||
44 | static struct fsnotify_group *audit_fsnotify_group; | ||
45 | |||
46 | /* fsnotify events we care about. */ | ||
47 | #define AUDIT_FS_EVENTS (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\ | ||
48 | FS_MOVE_SELF | FS_EVENT_ON_CHILD) | ||
49 | |||
50 | static void audit_fsnotify_mark_free(struct audit_fsnotify_mark *audit_mark) | ||
51 | { | ||
52 | kfree(audit_mark->path); | ||
53 | kfree(audit_mark); | ||
54 | } | ||
55 | |||
56 | static void audit_fsnotify_free_mark(struct fsnotify_mark *mark) | ||
57 | { | ||
58 | struct audit_fsnotify_mark *audit_mark; | ||
59 | |||
60 | audit_mark = container_of(mark, struct audit_fsnotify_mark, mark); | ||
61 | audit_fsnotify_mark_free(audit_mark); | ||
62 | } | ||
63 | |||
64 | char *audit_mark_path(struct audit_fsnotify_mark *mark) | ||
65 | { | ||
66 | return mark->path; | ||
67 | } | ||
68 | |||
69 | int audit_mark_compare(struct audit_fsnotify_mark *mark, unsigned long ino, dev_t dev) | ||
70 | { | ||
71 | if (mark->ino == AUDIT_INO_UNSET) | ||
72 | return 0; | ||
73 | return (mark->ino == ino) && (mark->dev == dev); | ||
74 | } | ||
75 | |||
76 | static void audit_update_mark(struct audit_fsnotify_mark *audit_mark, | ||
77 | struct inode *inode) | ||
78 | { | ||
79 | audit_mark->dev = inode ? inode->i_sb->s_dev : AUDIT_DEV_UNSET; | ||
80 | audit_mark->ino = inode ? inode->i_ino : AUDIT_INO_UNSET; | ||
81 | } | ||
82 | |||
83 | struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pathname, int len) | ||
84 | { | ||
85 | struct audit_fsnotify_mark *audit_mark; | ||
86 | struct path path; | ||
87 | struct dentry *dentry; | ||
88 | struct inode *inode; | ||
89 | int ret; | ||
90 | |||
91 | if (pathname[0] != '/' || pathname[len-1] == '/') | ||
92 | return ERR_PTR(-EINVAL); | ||
93 | |||
94 | dentry = kern_path_locked(pathname, &path); | ||
95 | if (IS_ERR(dentry)) | ||
96 | return (void *)dentry; /* returning an error */ | ||
97 | inode = path.dentry->d_inode; | ||
98 | mutex_unlock(&inode->i_mutex); | ||
99 | |||
100 | audit_mark = kzalloc(sizeof(*audit_mark), GFP_KERNEL); | ||
101 | if (unlikely(!audit_mark)) { | ||
102 | audit_mark = ERR_PTR(-ENOMEM); | ||
103 | goto out; | ||
104 | } | ||
105 | |||
106 | fsnotify_init_mark(&audit_mark->mark, audit_fsnotify_free_mark); | ||
107 | audit_mark->mark.mask = AUDIT_FS_EVENTS; | ||
108 | audit_mark->path = pathname; | ||
109 | audit_update_mark(audit_mark, dentry->d_inode); | ||
110 | audit_mark->rule = krule; | ||
111 | |||
112 | ret = fsnotify_add_mark(&audit_mark->mark, audit_fsnotify_group, inode, NULL, true); | ||
113 | if (ret < 0) { | ||
114 | audit_fsnotify_mark_free(audit_mark); | ||
115 | audit_mark = ERR_PTR(ret); | ||
116 | } | ||
117 | out: | ||
118 | dput(dentry); | ||
119 | path_put(&path); | ||
120 | return audit_mark; | ||
121 | } | ||
122 | |||
123 | static void audit_mark_log_rule_change(struct audit_fsnotify_mark *audit_mark, char *op) | ||
124 | { | ||
125 | struct audit_buffer *ab; | ||
126 | struct audit_krule *rule = audit_mark->rule; | ||
127 | |||
128 | if (!audit_enabled) | ||
129 | return; | ||
130 | ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); | ||
131 | if (unlikely(!ab)) | ||
132 | return; | ||
133 | audit_log_format(ab, "auid=%u ses=%u op=", | ||
134 | from_kuid(&init_user_ns, audit_get_loginuid(current)), | ||
135 | audit_get_sessionid(current)); | ||
136 | audit_log_string(ab, op); | ||
137 | audit_log_format(ab, " path="); | ||
138 | audit_log_untrustedstring(ab, audit_mark->path); | ||
139 | audit_log_key(ab, rule->filterkey); | ||
140 | audit_log_format(ab, " list=%d res=1", rule->listnr); | ||
141 | audit_log_end(ab); | ||
142 | } | ||
143 | |||
144 | void audit_remove_mark(struct audit_fsnotify_mark *audit_mark) | ||
145 | { | ||
146 | fsnotify_destroy_mark(&audit_mark->mark, audit_fsnotify_group); | ||
147 | fsnotify_put_mark(&audit_mark->mark); | ||
148 | } | ||
149 | |||
150 | void audit_remove_mark_rule(struct audit_krule *krule) | ||
151 | { | ||
152 | struct audit_fsnotify_mark *mark = krule->exe; | ||
153 | |||
154 | audit_remove_mark(mark); | ||
155 | } | ||
156 | |||
157 | static void audit_autoremove_mark_rule(struct audit_fsnotify_mark *audit_mark) | ||
158 | { | ||
159 | struct audit_krule *rule = audit_mark->rule; | ||
160 | struct audit_entry *entry = container_of(rule, struct audit_entry, rule); | ||
161 | |||
162 | audit_mark_log_rule_change(audit_mark, "autoremove_rule"); | ||
163 | audit_del_rule(entry); | ||
164 | } | ||
165 | |||
166 | /* Update mark data in audit rules based on fsnotify events. */ | ||
167 | static int audit_mark_handle_event(struct fsnotify_group *group, | ||
168 | struct inode *to_tell, | ||
169 | struct fsnotify_mark *inode_mark, | ||
170 | struct fsnotify_mark *vfsmount_mark, | ||
171 | u32 mask, void *data, int data_type, | ||
172 | const unsigned char *dname, u32 cookie) | ||
173 | { | ||
174 | struct audit_fsnotify_mark *audit_mark; | ||
175 | struct inode *inode = NULL; | ||
176 | |||
177 | audit_mark = container_of(inode_mark, struct audit_fsnotify_mark, mark); | ||
178 | |||
179 | BUG_ON(group != audit_fsnotify_group); | ||
180 | |||
181 | switch (data_type) { | ||
182 | case (FSNOTIFY_EVENT_PATH): | ||
183 | inode = ((struct path *)data)->dentry->d_inode; | ||
184 | break; | ||
185 | case (FSNOTIFY_EVENT_INODE): | ||
186 | inode = (struct inode *)data; | ||
187 | break; | ||
188 | default: | ||
189 | BUG(); | ||
190 | return 0; | ||
191 | }; | ||
192 | |||
193 | if (mask & (FS_CREATE|FS_MOVED_TO|FS_DELETE|FS_MOVED_FROM)) { | ||
194 | if (audit_compare_dname_path(dname, audit_mark->path, AUDIT_NAME_FULL)) | ||
195 | return 0; | ||
196 | audit_update_mark(audit_mark, inode); | ||
197 | } else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF)) | ||
198 | audit_autoremove_mark_rule(audit_mark); | ||
199 | |||
200 | return 0; | ||
201 | } | ||
202 | |||
203 | static const struct fsnotify_ops audit_mark_fsnotify_ops = { | ||
204 | .handle_event = audit_mark_handle_event, | ||
205 | }; | ||
206 | |||
207 | static int __init audit_fsnotify_init(void) | ||
208 | { | ||
209 | audit_fsnotify_group = fsnotify_alloc_group(&audit_mark_fsnotify_ops); | ||
210 | if (IS_ERR(audit_fsnotify_group)) { | ||
211 | audit_fsnotify_group = NULL; | ||
212 | audit_panic("cannot create audit fsnotify group"); | ||
213 | } | ||
214 | return 0; | ||
215 | } | ||
216 | device_initcall(audit_fsnotify_init); | ||
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index b0f9877273fc..94ecdabda8e6 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -479,6 +479,8 @@ static void kill_rules(struct audit_tree *tree) | |||
479 | if (rule->tree) { | 479 | if (rule->tree) { |
480 | /* not a half-baked one */ | 480 | /* not a half-baked one */ |
481 | audit_tree_log_remove_rule(rule); | 481 | audit_tree_log_remove_rule(rule); |
482 | if (entry->rule.exe) | ||
483 | audit_remove_mark(entry->rule.exe); | ||
482 | rule->tree = NULL; | 484 | rule->tree = NULL; |
483 | list_del_rcu(&entry->list); | 485 | list_del_rcu(&entry->list); |
484 | list_del(&entry->rule.list); | 486 | list_del(&entry->rule.list); |
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 6e30024d9aac..656c7e93ac0d 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -138,7 +138,7 @@ char *audit_watch_path(struct audit_watch *watch) | |||
138 | 138 | ||
139 | int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev) | 139 | int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev) |
140 | { | 140 | { |
141 | return (watch->ino != (unsigned long)-1) && | 141 | return (watch->ino != AUDIT_INO_UNSET) && |
142 | (watch->ino == ino) && | 142 | (watch->ino == ino) && |
143 | (watch->dev == dev); | 143 | (watch->dev == dev); |
144 | } | 144 | } |
@@ -179,8 +179,8 @@ static struct audit_watch *audit_init_watch(char *path) | |||
179 | INIT_LIST_HEAD(&watch->rules); | 179 | INIT_LIST_HEAD(&watch->rules); |
180 | atomic_set(&watch->count, 1); | 180 | atomic_set(&watch->count, 1); |
181 | watch->path = path; | 181 | watch->path = path; |
182 | watch->dev = (dev_t)-1; | 182 | watch->dev = AUDIT_DEV_UNSET; |
183 | watch->ino = (unsigned long)-1; | 183 | watch->ino = AUDIT_INO_UNSET; |
184 | 184 | ||
185 | return watch; | 185 | return watch; |
186 | } | 186 | } |
@@ -203,7 +203,6 @@ int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op) | |||
203 | if (IS_ERR(watch)) | 203 | if (IS_ERR(watch)) |
204 | return PTR_ERR(watch); | 204 | return PTR_ERR(watch); |
205 | 205 | ||
206 | audit_get_watch(watch); | ||
207 | krule->watch = watch; | 206 | krule->watch = watch; |
208 | 207 | ||
209 | return 0; | 208 | return 0; |
@@ -313,6 +312,8 @@ static void audit_update_watch(struct audit_parent *parent, | |||
313 | list_replace(&oentry->rule.list, | 312 | list_replace(&oentry->rule.list, |
314 | &nentry->rule.list); | 313 | &nentry->rule.list); |
315 | } | 314 | } |
315 | if (oentry->rule.exe) | ||
316 | audit_remove_mark(oentry->rule.exe); | ||
316 | 317 | ||
317 | audit_watch_log_rule_change(r, owatch, "updated_rules"); | 318 | audit_watch_log_rule_change(r, owatch, "updated_rules"); |
318 | 319 | ||
@@ -343,6 +344,8 @@ static void audit_remove_parent_watches(struct audit_parent *parent) | |||
343 | list_for_each_entry_safe(r, nextr, &w->rules, rlist) { | 344 | list_for_each_entry_safe(r, nextr, &w->rules, rlist) { |
344 | e = container_of(r, struct audit_entry, rule); | 345 | e = container_of(r, struct audit_entry, rule); |
345 | audit_watch_log_rule_change(r, w, "remove_rule"); | 346 | audit_watch_log_rule_change(r, w, "remove_rule"); |
347 | if (e->rule.exe) | ||
348 | audit_remove_mark(e->rule.exe); | ||
346 | list_del(&r->rlist); | 349 | list_del(&r->rlist); |
347 | list_del(&r->list); | 350 | list_del(&r->list); |
348 | list_del_rcu(&e->list); | 351 | list_del_rcu(&e->list); |
@@ -387,19 +390,20 @@ static void audit_add_to_parent(struct audit_krule *krule, | |||
387 | 390 | ||
388 | watch_found = 1; | 391 | watch_found = 1; |
389 | 392 | ||
390 | /* put krule's and initial refs to temporary watch */ | 393 | /* put krule's ref to temporary watch */ |
391 | audit_put_watch(watch); | ||
392 | audit_put_watch(watch); | 394 | audit_put_watch(watch); |
393 | 395 | ||
394 | audit_get_watch(w); | 396 | audit_get_watch(w); |
395 | krule->watch = watch = w; | 397 | krule->watch = watch = w; |
398 | |||
399 | audit_put_parent(parent); | ||
396 | break; | 400 | break; |
397 | } | 401 | } |
398 | 402 | ||
399 | if (!watch_found) { | 403 | if (!watch_found) { |
400 | audit_get_parent(parent); | ||
401 | watch->parent = parent; | 404 | watch->parent = parent; |
402 | 405 | ||
406 | audit_get_watch(watch); | ||
403 | list_add(&watch->wlist, &parent->watches); | 407 | list_add(&watch->wlist, &parent->watches); |
404 | } | 408 | } |
405 | list_add(&krule->rlist, &watch->rules); | 409 | list_add(&krule->rlist, &watch->rules); |
@@ -437,9 +441,6 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list) | |||
437 | 441 | ||
438 | audit_add_to_parent(krule, parent); | 442 | audit_add_to_parent(krule, parent); |
439 | 443 | ||
440 | /* match get in audit_find_parent or audit_init_parent */ | ||
441 | audit_put_parent(parent); | ||
442 | |||
443 | h = audit_hash_ino((u32)watch->ino); | 444 | h = audit_hash_ino((u32)watch->ino); |
444 | *list = &audit_inode_hash[h]; | 445 | *list = &audit_inode_hash[h]; |
445 | error: | 446 | error: |
@@ -496,7 +497,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group, | |||
496 | if (mask & (FS_CREATE|FS_MOVED_TO) && inode) | 497 | if (mask & (FS_CREATE|FS_MOVED_TO) && inode) |
497 | audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0); | 498 | audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0); |
498 | else if (mask & (FS_DELETE|FS_MOVED_FROM)) | 499 | else if (mask & (FS_DELETE|FS_MOVED_FROM)) |
499 | audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1); | 500 | audit_update_watch(parent, dname, AUDIT_DEV_UNSET, AUDIT_INO_UNSET, 1); |
500 | else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF)) | 501 | else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF)) |
501 | audit_remove_parent_watches(parent); | 502 | audit_remove_parent_watches(parent); |
502 | 503 | ||
@@ -517,3 +518,36 @@ static int __init audit_watch_init(void) | |||
517 | return 0; | 518 | return 0; |
518 | } | 519 | } |
519 | device_initcall(audit_watch_init); | 520 | device_initcall(audit_watch_init); |
521 | |||
522 | int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old) | ||
523 | { | ||
524 | struct audit_fsnotify_mark *audit_mark; | ||
525 | char *pathname; | ||
526 | |||
527 | pathname = kstrdup(audit_mark_path(old->exe), GFP_KERNEL); | ||
528 | if (!pathname) | ||
529 | return -ENOMEM; | ||
530 | |||
531 | audit_mark = audit_alloc_mark(new, pathname, strlen(pathname)); | ||
532 | if (IS_ERR(audit_mark)) { | ||
533 | kfree(pathname); | ||
534 | return PTR_ERR(audit_mark); | ||
535 | } | ||
536 | new->exe = audit_mark; | ||
537 | |||
538 | return 0; | ||
539 | } | ||
540 | |||
541 | int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark) | ||
542 | { | ||
543 | struct file *exe_file; | ||
544 | unsigned long ino; | ||
545 | dev_t dev; | ||
546 | |||
547 | rcu_read_lock(); | ||
548 | exe_file = rcu_dereference(tsk->mm->exe_file); | ||
549 | ino = exe_file->f_inode->i_ino; | ||
550 | dev = exe_file->f_inode->i_sb->s_dev; | ||
551 | rcu_read_unlock(); | ||
552 | return audit_mark_compare(mark, ino, dev); | ||
553 | } | ||
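The audit_dupe_exe() and audit_exe_compare() helpers added at the end of audit_watch.c are the consumers of the new AUDIT_EXE rule field: a syscall rule can now be keyed on the path of the executable behind tsk->mm->exe_file, with the path itself tracked by the audit_fsnotify mark introduced earlier. Below is a minimal userspace sketch of loading such a rule. It assumes libaudit's audit_open(), audit_rule_fieldpair_data() and audit_add_rule_data() helpers and an audit userspace new enough to translate "exe=" into AUDIT_EXE; none of that is part of this commit, and the path used is purely illustrative.

	/* Hypothetical sketch (not from this commit): install an always,exit
	 * audit rule matching syscalls issued by /usr/bin/passwd, using the
	 * new AUDIT_EXE field. A real rule would normally also select the
	 * syscalls of interest (e.g. via audit_rule_syscallbyname_data()). */
	#include <libaudit.h>
	#include <stdlib.h>

	int add_exe_rule(void)
	{
		struct audit_rule_data *rule;
		int fd, rc;

		fd = audit_open();		/* NETLINK_AUDIT socket */
		if (fd < 0)
			return fd;

		rule = calloc(1, sizeof(*rule));
		if (!rule) {
			audit_close(fd);
			return -1;
		}

		/* Encodes AUDIT_EXE with an equality op, which is what
		 * audit_data_to_entry() unpacks on the kernel side. */
		rc = audit_rule_fieldpair_data(&rule, "exe=/usr/bin/passwd",
					       AUDIT_FILTER_EXIT);
		if (rc == 0)
			rc = audit_add_rule_data(fd, rule, AUDIT_FILTER_EXIT,
						 AUDIT_ALWAYS);

		free(rule);
		audit_close(fd);
		return rc;
	}

On the kernel side this lands in audit_data_to_entry() in the auditfilter.c hunks below, which unpack the path string and pin it to the rule with audit_alloc_mark().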
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 72e1660a79a3..7714d93edb85 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -405,6 +405,12 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) | |||
405 | if (f->val > AUDIT_MAX_FIELD_COMPARE) | 405 | if (f->val > AUDIT_MAX_FIELD_COMPARE) |
406 | return -EINVAL; | 406 | return -EINVAL; |
407 | break; | 407 | break; |
408 | case AUDIT_EXE: | ||
409 | if (f->op != Audit_equal) | ||
410 | return -EINVAL; | ||
411 | if (entry->rule.listnr != AUDIT_FILTER_EXIT) | ||
412 | return -EINVAL; | ||
413 | break; | ||
408 | }; | 414 | }; |
409 | return 0; | 415 | return 0; |
410 | } | 416 | } |
@@ -419,6 +425,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, | |||
419 | size_t remain = datasz - sizeof(struct audit_rule_data); | 425 | size_t remain = datasz - sizeof(struct audit_rule_data); |
420 | int i; | 426 | int i; |
421 | char *str; | 427 | char *str; |
428 | struct audit_fsnotify_mark *audit_mark; | ||
422 | 429 | ||
423 | entry = audit_to_entry_common(data); | 430 | entry = audit_to_entry_common(data); |
424 | if (IS_ERR(entry)) | 431 | if (IS_ERR(entry)) |
@@ -539,6 +546,24 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, | |||
539 | entry->rule.buflen += f->val; | 546 | entry->rule.buflen += f->val; |
540 | entry->rule.filterkey = str; | 547 | entry->rule.filterkey = str; |
541 | break; | 548 | break; |
549 | case AUDIT_EXE: | ||
550 | if (entry->rule.exe || f->val > PATH_MAX) | ||
551 | goto exit_free; | ||
552 | str = audit_unpack_string(&bufp, &remain, f->val); | ||
553 | if (IS_ERR(str)) { | ||
554 | err = PTR_ERR(str); | ||
555 | goto exit_free; | ||
556 | } | ||
557 | entry->rule.buflen += f->val; | ||
558 | |||
559 | audit_mark = audit_alloc_mark(&entry->rule, str, f->val); | ||
560 | if (IS_ERR(audit_mark)) { | ||
561 | kfree(str); | ||
562 | err = PTR_ERR(audit_mark); | ||
563 | goto exit_free; | ||
564 | } | ||
565 | entry->rule.exe = audit_mark; | ||
566 | break; | ||
542 | } | 567 | } |
543 | } | 568 | } |
544 | 569 | ||
@@ -549,10 +574,10 @@ exit_nofree: | |||
549 | return entry; | 574 | return entry; |
550 | 575 | ||
551 | exit_free: | 576 | exit_free: |
552 | if (entry->rule.watch) | ||
553 | audit_put_watch(entry->rule.watch); /* matches initial get */ | ||
554 | if (entry->rule.tree) | 577 | if (entry->rule.tree) |
555 | audit_put_tree(entry->rule.tree); /* that's the temporary one */ | 578 | audit_put_tree(entry->rule.tree); /* that's the temporary one */ |
579 | if (entry->rule.exe) | ||
580 | audit_remove_mark(entry->rule.exe); /* that's the template one */ | ||
556 | audit_free_rule(entry); | 581 | audit_free_rule(entry); |
557 | return ERR_PTR(err); | 582 | return ERR_PTR(err); |
558 | } | 583 | } |
@@ -617,6 +642,10 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) | |||
617 | data->buflen += data->values[i] = | 642 | data->buflen += data->values[i] = |
618 | audit_pack_string(&bufp, krule->filterkey); | 643 | audit_pack_string(&bufp, krule->filterkey); |
619 | break; | 644 | break; |
645 | case AUDIT_EXE: | ||
646 | data->buflen += data->values[i] = | ||
647 | audit_pack_string(&bufp, audit_mark_path(krule->exe)); | ||
648 | break; | ||
620 | case AUDIT_LOGINUID_SET: | 649 | case AUDIT_LOGINUID_SET: |
621 | if (krule->pflags & AUDIT_LOGINUID_LEGACY && !f->val) { | 650 | if (krule->pflags & AUDIT_LOGINUID_LEGACY && !f->val) { |
622 | data->fields[i] = AUDIT_LOGINUID; | 651 | data->fields[i] = AUDIT_LOGINUID; |
@@ -680,6 +709,12 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) | |||
680 | if (strcmp(a->filterkey, b->filterkey)) | 709 | if (strcmp(a->filterkey, b->filterkey)) |
681 | return 1; | 710 | return 1; |
682 | break; | 711 | break; |
712 | case AUDIT_EXE: | ||
713 | /* both paths exist based on above type compare */ | ||
714 | if (strcmp(audit_mark_path(a->exe), | ||
715 | audit_mark_path(b->exe))) | ||
716 | return 1; | ||
717 | break; | ||
683 | case AUDIT_UID: | 718 | case AUDIT_UID: |
684 | case AUDIT_EUID: | 719 | case AUDIT_EUID: |
685 | case AUDIT_SUID: | 720 | case AUDIT_SUID: |
@@ -801,8 +836,14 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old) | |||
801 | err = -ENOMEM; | 836 | err = -ENOMEM; |
802 | else | 837 | else |
803 | new->filterkey = fk; | 838 | new->filterkey = fk; |
839 | break; | ||
840 | case AUDIT_EXE: | ||
841 | err = audit_dupe_exe(new, old); | ||
842 | break; | ||
804 | } | 843 | } |
805 | if (err) { | 844 | if (err) { |
845 | if (new->exe) | ||
846 | audit_remove_mark(new->exe); | ||
806 | audit_free_rule(entry); | 847 | audit_free_rule(entry); |
807 | return ERR_PTR(err); | 848 | return ERR_PTR(err); |
808 | } | 849 | } |
@@ -863,7 +904,7 @@ static inline int audit_add_rule(struct audit_entry *entry) | |||
863 | struct audit_watch *watch = entry->rule.watch; | 904 | struct audit_watch *watch = entry->rule.watch; |
864 | struct audit_tree *tree = entry->rule.tree; | 905 | struct audit_tree *tree = entry->rule.tree; |
865 | struct list_head *list; | 906 | struct list_head *list; |
866 | int err; | 907 | int err = 0; |
867 | #ifdef CONFIG_AUDITSYSCALL | 908 | #ifdef CONFIG_AUDITSYSCALL |
868 | int dont_count = 0; | 909 | int dont_count = 0; |
869 | 910 | ||
@@ -881,7 +922,7 @@ static inline int audit_add_rule(struct audit_entry *entry) | |||
881 | /* normally audit_add_tree_rule() will free it on failure */ | 922 | /* normally audit_add_tree_rule() will free it on failure */ |
882 | if (tree) | 923 | if (tree) |
883 | audit_put_tree(tree); | 924 | audit_put_tree(tree); |
884 | goto error; | 925 | return err; |
885 | } | 926 | } |
886 | 927 | ||
887 | if (watch) { | 928 | if (watch) { |
@@ -895,14 +936,14 @@ static inline int audit_add_rule(struct audit_entry *entry) | |||
895 | */ | 936 | */ |
896 | if (tree) | 937 | if (tree) |
897 | audit_put_tree(tree); | 938 | audit_put_tree(tree); |
898 | goto error; | 939 | return err; |
899 | } | 940 | } |
900 | } | 941 | } |
901 | if (tree) { | 942 | if (tree) { |
902 | err = audit_add_tree_rule(&entry->rule); | 943 | err = audit_add_tree_rule(&entry->rule); |
903 | if (err) { | 944 | if (err) { |
904 | mutex_unlock(&audit_filter_mutex); | 945 | mutex_unlock(&audit_filter_mutex); |
905 | goto error; | 946 | return err; |
906 | } | 947 | } |
907 | } | 948 | } |
908 | 949 | ||
@@ -933,19 +974,13 @@ static inline int audit_add_rule(struct audit_entry *entry) | |||
933 | #endif | 974 | #endif |
934 | mutex_unlock(&audit_filter_mutex); | 975 | mutex_unlock(&audit_filter_mutex); |
935 | 976 | ||
936 | return 0; | ||
937 | |||
938 | error: | ||
939 | if (watch) | ||
940 | audit_put_watch(watch); /* tmp watch, matches initial get */ | ||
941 | return err; | 977 | return err; |
942 | } | 978 | } |
943 | 979 | ||
944 | /* Remove an existing rule from filterlist. */ | 980 | /* Remove an existing rule from filterlist. */ |
945 | static inline int audit_del_rule(struct audit_entry *entry) | 981 | int audit_del_rule(struct audit_entry *entry) |
946 | { | 982 | { |
947 | struct audit_entry *e; | 983 | struct audit_entry *e; |
948 | struct audit_watch *watch = entry->rule.watch; | ||
949 | struct audit_tree *tree = entry->rule.tree; | 984 | struct audit_tree *tree = entry->rule.tree; |
950 | struct list_head *list; | 985 | struct list_head *list; |
951 | int ret = 0; | 986 | int ret = 0; |
@@ -961,7 +996,6 @@ static inline int audit_del_rule(struct audit_entry *entry) | |||
961 | mutex_lock(&audit_filter_mutex); | 996 | mutex_lock(&audit_filter_mutex); |
962 | e = audit_find_rule(entry, &list); | 997 | e = audit_find_rule(entry, &list); |
963 | if (!e) { | 998 | if (!e) { |
964 | mutex_unlock(&audit_filter_mutex); | ||
965 | ret = -ENOENT; | 999 | ret = -ENOENT; |
966 | goto out; | 1000 | goto out; |
967 | } | 1001 | } |
@@ -972,9 +1006,8 @@ static inline int audit_del_rule(struct audit_entry *entry) | |||
972 | if (e->rule.tree) | 1006 | if (e->rule.tree) |
973 | audit_remove_tree_rule(&e->rule); | 1007 | audit_remove_tree_rule(&e->rule); |
974 | 1008 | ||
975 | list_del_rcu(&e->list); | 1009 | if (e->rule.exe) |
976 | list_del(&e->rule.list); | 1010 | audit_remove_mark_rule(&e->rule); |
977 | call_rcu(&e->rcu, audit_free_rule_rcu); | ||
978 | 1011 | ||
979 | #ifdef CONFIG_AUDITSYSCALL | 1012 | #ifdef CONFIG_AUDITSYSCALL |
980 | if (!dont_count) | 1013 | if (!dont_count) |
@@ -983,11 +1016,14 @@ static inline int audit_del_rule(struct audit_entry *entry) | |||
983 | if (!audit_match_signal(entry)) | 1016 | if (!audit_match_signal(entry)) |
984 | audit_signals--; | 1017 | audit_signals--; |
985 | #endif | 1018 | #endif |
986 | mutex_unlock(&audit_filter_mutex); | 1019 | |
1020 | list_del_rcu(&e->list); | ||
1021 | list_del(&e->rule.list); | ||
1022 | call_rcu(&e->rcu, audit_free_rule_rcu); | ||
987 | 1023 | ||
988 | out: | 1024 | out: |
989 | if (watch) | 1025 | mutex_unlock(&audit_filter_mutex); |
990 | audit_put_watch(watch); /* match initial get */ | 1026 | |
991 | if (tree) | 1027 | if (tree) |
992 | audit_put_tree(tree); /* that's the temporary one */ | 1028 | audit_put_tree(tree); /* that's the temporary one */ |
993 | 1029 | ||
@@ -1077,8 +1113,11 @@ int audit_rule_change(int type, __u32 portid, int seq, void *data, | |||
1077 | WARN_ON(1); | 1113 | WARN_ON(1); |
1078 | } | 1114 | } |
1079 | 1115 | ||
1080 | if (err || type == AUDIT_DEL_RULE) | 1116 | if (err || type == AUDIT_DEL_RULE) { |
1117 | if (entry->rule.exe) | ||
1118 | audit_remove_mark(entry->rule.exe); | ||
1081 | audit_free_rule(entry); | 1119 | audit_free_rule(entry); |
1120 | } | ||
1082 | 1121 | ||
1083 | return err; | 1122 | return err; |
1084 | } | 1123 | } |
@@ -1370,6 +1409,8 @@ static int update_lsm_rule(struct audit_krule *r) | |||
1370 | return 0; | 1409 | return 0; |
1371 | 1410 | ||
1372 | nentry = audit_dupe_rule(r); | 1411 | nentry = audit_dupe_rule(r); |
1412 | if (entry->rule.exe) | ||
1413 | audit_remove_mark(entry->rule.exe); | ||
1373 | if (IS_ERR(nentry)) { | 1414 | if (IS_ERR(nentry)) { |
1374 | /* save the first error encountered for the | 1415 | /* save the first error encountered for the |
1375 | * return value */ | 1416 | * return value */ |
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index e85bdfd15fed..b86cc04959de 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -180,7 +180,7 @@ static int audit_match_filetype(struct audit_context *ctx, int val) | |||
180 | return 0; | 180 | return 0; |
181 | 181 | ||
182 | list_for_each_entry(n, &ctx->names_list, list) { | 182 | list_for_each_entry(n, &ctx->names_list, list) { |
183 | if ((n->ino != -1) && | 183 | if ((n->ino != AUDIT_INO_UNSET) && |
184 | ((n->mode & S_IFMT) == mode)) | 184 | ((n->mode & S_IFMT) == mode)) |
185 | return 1; | 185 | return 1; |
186 | } | 186 | } |
@@ -466,6 +466,9 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
466 | result = audit_comparator(ctx->ppid, f->op, f->val); | 466 | result = audit_comparator(ctx->ppid, f->op, f->val); |
467 | } | 467 | } |
468 | break; | 468 | break; |
469 | case AUDIT_EXE: | ||
470 | result = audit_exe_compare(tsk, rule->exe); | ||
471 | break; | ||
469 | case AUDIT_UID: | 472 | case AUDIT_UID: |
470 | result = audit_uid_comparator(cred->uid, f->op, f->uid); | 473 | result = audit_uid_comparator(cred->uid, f->op, f->uid); |
471 | break; | 474 | break; |
@@ -1680,7 +1683,7 @@ static struct audit_names *audit_alloc_name(struct audit_context *context, | |||
1680 | aname->should_free = true; | 1683 | aname->should_free = true; |
1681 | } | 1684 | } |
1682 | 1685 | ||
1683 | aname->ino = (unsigned long)-1; | 1686 | aname->ino = AUDIT_INO_UNSET; |
1684 | aname->type = type; | 1687 | aname->type = type; |
1685 | list_add_tail(&aname->list, &context->names_list); | 1688 | list_add_tail(&aname->list, &context->names_list); |
1686 | 1689 | ||
@@ -1922,7 +1925,7 @@ void __audit_inode_child(const struct inode *parent, | |||
1922 | if (inode) | 1925 | if (inode) |
1923 | audit_copy_inode(found_child, dentry, inode); | 1926 | audit_copy_inode(found_child, dentry, inode); |
1924 | else | 1927 | else |
1925 | found_child->ino = (unsigned long)-1; | 1928 | found_child->ino = AUDIT_INO_UNSET; |
1926 | } | 1929 | } |
1927 | EXPORT_SYMBOL_GPL(__audit_inode_child); | 1930 | EXPORT_SYMBOL_GPL(__audit_inode_child); |
1928 | 1931 | ||
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index cb31229a6fa4..29ace107f236 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -150,15 +150,15 @@ static int __init register_array_map(void) | |||
150 | } | 150 | } |
151 | late_initcall(register_array_map); | 151 | late_initcall(register_array_map); |
152 | 152 | ||
153 | static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr) | 153 | static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr) |
154 | { | 154 | { |
155 | /* only bpf_prog file descriptors can be stored in prog_array map */ | 155 | /* only file descriptors can be stored in this type of map */ |
156 | if (attr->value_size != sizeof(u32)) | 156 | if (attr->value_size != sizeof(u32)) |
157 | return ERR_PTR(-EINVAL); | 157 | return ERR_PTR(-EINVAL); |
158 | return array_map_alloc(attr); | 158 | return array_map_alloc(attr); |
159 | } | 159 | } |
160 | 160 | ||
161 | static void prog_array_map_free(struct bpf_map *map) | 161 | static void fd_array_map_free(struct bpf_map *map) |
162 | { | 162 | { |
163 | struct bpf_array *array = container_of(map, struct bpf_array, map); | 163 | struct bpf_array *array = container_of(map, struct bpf_array, map); |
164 | int i; | 164 | int i; |
@@ -167,21 +167,21 @@ static void prog_array_map_free(struct bpf_map *map) | |||
167 | 167 | ||
168 | /* make sure it's empty */ | 168 | /* make sure it's empty */ |
169 | for (i = 0; i < array->map.max_entries; i++) | 169 | for (i = 0; i < array->map.max_entries; i++) |
170 | BUG_ON(array->prog[i] != NULL); | 170 | BUG_ON(array->ptrs[i] != NULL); |
171 | kvfree(array); | 171 | kvfree(array); |
172 | } | 172 | } |
173 | 173 | ||
174 | static void *prog_array_map_lookup_elem(struct bpf_map *map, void *key) | 174 | static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key) |
175 | { | 175 | { |
176 | return NULL; | 176 | return NULL; |
177 | } | 177 | } |
178 | 178 | ||
179 | /* only called from syscall */ | 179 | /* only called from syscall */ |
180 | static int prog_array_map_update_elem(struct bpf_map *map, void *key, | 180 | static int fd_array_map_update_elem(struct bpf_map *map, void *key, |
181 | void *value, u64 map_flags) | 181 | void *value, u64 map_flags) |
182 | { | 182 | { |
183 | struct bpf_array *array = container_of(map, struct bpf_array, map); | 183 | struct bpf_array *array = container_of(map, struct bpf_array, map); |
184 | struct bpf_prog *prog, *old_prog; | 184 | void *new_ptr, *old_ptr; |
185 | u32 index = *(u32 *)key, ufd; | 185 | u32 index = *(u32 *)key, ufd; |
186 | 186 | ||
187 | if (map_flags != BPF_ANY) | 187 | if (map_flags != BPF_ANY) |
@@ -191,57 +191,75 @@ static int prog_array_map_update_elem(struct bpf_map *map, void *key, | |||
191 | return -E2BIG; | 191 | return -E2BIG; |
192 | 192 | ||
193 | ufd = *(u32 *)value; | 193 | ufd = *(u32 *)value; |
194 | prog = bpf_prog_get(ufd); | 194 | new_ptr = map->ops->map_fd_get_ptr(map, ufd); |
195 | if (IS_ERR(prog)) | 195 | if (IS_ERR(new_ptr)) |
196 | return PTR_ERR(prog); | 196 | return PTR_ERR(new_ptr); |
197 | |||
198 | if (!bpf_prog_array_compatible(array, prog)) { | ||
199 | bpf_prog_put(prog); | ||
200 | return -EINVAL; | ||
201 | } | ||
202 | 197 | ||
203 | old_prog = xchg(array->prog + index, prog); | 198 | old_ptr = xchg(array->ptrs + index, new_ptr); |
204 | if (old_prog) | 199 | if (old_ptr) |
205 | bpf_prog_put_rcu(old_prog); | 200 | map->ops->map_fd_put_ptr(old_ptr); |
206 | 201 | ||
207 | return 0; | 202 | return 0; |
208 | } | 203 | } |
209 | 204 | ||
210 | static int prog_array_map_delete_elem(struct bpf_map *map, void *key) | 205 | static int fd_array_map_delete_elem(struct bpf_map *map, void *key) |
211 | { | 206 | { |
212 | struct bpf_array *array = container_of(map, struct bpf_array, map); | 207 | struct bpf_array *array = container_of(map, struct bpf_array, map); |
213 | struct bpf_prog *old_prog; | 208 | void *old_ptr; |
214 | u32 index = *(u32 *)key; | 209 | u32 index = *(u32 *)key; |
215 | 210 | ||
216 | if (index >= array->map.max_entries) | 211 | if (index >= array->map.max_entries) |
217 | return -E2BIG; | 212 | return -E2BIG; |
218 | 213 | ||
219 | old_prog = xchg(array->prog + index, NULL); | 214 | old_ptr = xchg(array->ptrs + index, NULL); |
220 | if (old_prog) { | 215 | if (old_ptr) { |
221 | bpf_prog_put_rcu(old_prog); | 216 | map->ops->map_fd_put_ptr(old_ptr); |
222 | return 0; | 217 | return 0; |
223 | } else { | 218 | } else { |
224 | return -ENOENT; | 219 | return -ENOENT; |
225 | } | 220 | } |
226 | } | 221 | } |
227 | 222 | ||
223 | static void *prog_fd_array_get_ptr(struct bpf_map *map, int fd) | ||
224 | { | ||
225 | struct bpf_array *array = container_of(map, struct bpf_array, map); | ||
226 | struct bpf_prog *prog = bpf_prog_get(fd); | ||
227 | if (IS_ERR(prog)) | ||
228 | return prog; | ||
229 | |||
230 | if (!bpf_prog_array_compatible(array, prog)) { | ||
231 | bpf_prog_put(prog); | ||
232 | return ERR_PTR(-EINVAL); | ||
233 | } | ||
234 | return prog; | ||
235 | } | ||
236 | |||
237 | static void prog_fd_array_put_ptr(void *ptr) | ||
238 | { | ||
239 | struct bpf_prog *prog = ptr; | ||
240 | |||
241 | bpf_prog_put_rcu(prog); | ||
242 | } | ||
243 | |||
228 | /* decrement refcnt of all bpf_progs that are stored in this map */ | 244 | /* decrement refcnt of all bpf_progs that are stored in this map */ |
229 | void bpf_prog_array_map_clear(struct bpf_map *map) | 245 | void bpf_fd_array_map_clear(struct bpf_map *map) |
230 | { | 246 | { |
231 | struct bpf_array *array = container_of(map, struct bpf_array, map); | 247 | struct bpf_array *array = container_of(map, struct bpf_array, map); |
232 | int i; | 248 | int i; |
233 | 249 | ||
234 | for (i = 0; i < array->map.max_entries; i++) | 250 | for (i = 0; i < array->map.max_entries; i++) |
235 | prog_array_map_delete_elem(map, &i); | 251 | fd_array_map_delete_elem(map, &i); |
236 | } | 252 | } |
237 | 253 | ||
238 | static const struct bpf_map_ops prog_array_ops = { | 254 | static const struct bpf_map_ops prog_array_ops = { |
239 | .map_alloc = prog_array_map_alloc, | 255 | .map_alloc = fd_array_map_alloc, |
240 | .map_free = prog_array_map_free, | 256 | .map_free = fd_array_map_free, |
241 | .map_get_next_key = array_map_get_next_key, | 257 | .map_get_next_key = array_map_get_next_key, |
242 | .map_lookup_elem = prog_array_map_lookup_elem, | 258 | .map_lookup_elem = fd_array_map_lookup_elem, |
243 | .map_update_elem = prog_array_map_update_elem, | 259 | .map_update_elem = fd_array_map_update_elem, |
244 | .map_delete_elem = prog_array_map_delete_elem, | 260 | .map_delete_elem = fd_array_map_delete_elem, |
261 | .map_fd_get_ptr = prog_fd_array_get_ptr, | ||
262 | .map_fd_put_ptr = prog_fd_array_put_ptr, | ||
245 | }; | 263 | }; |
246 | 264 | ||
247 | static struct bpf_map_type_list prog_array_type __read_mostly = { | 265 | static struct bpf_map_type_list prog_array_type __read_mostly = { |
@@ -255,3 +273,60 @@ static int __init register_prog_array_map(void) | |||
255 | return 0; | 273 | return 0; |
256 | } | 274 | } |
257 | late_initcall(register_prog_array_map); | 275 | late_initcall(register_prog_array_map); |
276 | |||
277 | static void perf_event_array_map_free(struct bpf_map *map) | ||
278 | { | ||
279 | bpf_fd_array_map_clear(map); | ||
280 | fd_array_map_free(map); | ||
281 | } | ||
282 | |||
283 | static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd) | ||
284 | { | ||
285 | struct perf_event *event; | ||
286 | const struct perf_event_attr *attr; | ||
287 | |||
288 | event = perf_event_get(fd); | ||
289 | if (IS_ERR(event)) | ||
290 | return event; | ||
291 | |||
292 | attr = perf_event_attrs(event); | ||
293 | if (IS_ERR(attr)) | ||
294 | return (void *)attr; | ||
295 | |||
296 | if (attr->type != PERF_TYPE_RAW && | ||
297 | attr->type != PERF_TYPE_HARDWARE) { | ||
298 | perf_event_release_kernel(event); | ||
299 | return ERR_PTR(-EINVAL); | ||
300 | } | ||
301 | return event; | ||
302 | } | ||
303 | |||
304 | static void perf_event_fd_array_put_ptr(void *ptr) | ||
305 | { | ||
306 | struct perf_event *event = ptr; | ||
307 | |||
308 | perf_event_release_kernel(event); | ||
309 | } | ||
310 | |||
311 | static const struct bpf_map_ops perf_event_array_ops = { | ||
312 | .map_alloc = fd_array_map_alloc, | ||
313 | .map_free = perf_event_array_map_free, | ||
314 | .map_get_next_key = array_map_get_next_key, | ||
315 | .map_lookup_elem = fd_array_map_lookup_elem, | ||
316 | .map_update_elem = fd_array_map_update_elem, | ||
317 | .map_delete_elem = fd_array_map_delete_elem, | ||
318 | .map_fd_get_ptr = perf_event_fd_array_get_ptr, | ||
319 | .map_fd_put_ptr = perf_event_fd_array_put_ptr, | ||
320 | }; | ||
321 | |||
322 | static struct bpf_map_type_list perf_event_array_type __read_mostly = { | ||
323 | .ops = &perf_event_array_ops, | ||
324 | .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, | ||
325 | }; | ||
326 | |||
327 | static int __init register_perf_event_array_map(void) | ||
328 | { | ||
329 | bpf_register_map_type(&perf_event_array_type); | ||
330 | return 0; | ||
331 | } | ||
332 | late_initcall(register_perf_event_array_map); | ||
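The arraymap.c changes above generalize the old prog_array-only code into a generic fd array: the new map_fd_get_ptr()/map_fd_put_ptr() ops turn a user-supplied file descriptor into a kernel object pointer, and BPF_MAP_TYPE_PERF_EVENT_ARRAY reuses the same update/delete paths to hold perf events. Below is a minimal userspace sketch of feeding such a map through the bpf(2) syscall; the map size and the perf event fd are purely illustrative, and the sketch assumes uapi headers from a kernel that already has this patch.

	/* Hypothetical sketch (not from this commit): create a perf event
	 * array map and store an already-opened perf event fd in slot 0.
	 * The value is a u32 file descriptor, exactly what
	 * fd_array_map_update_elem() expects; the kernel converts it via
	 * perf_event_fd_array_get_ptr(). */
	#include <linux/bpf.h>
	#include <string.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static int sys_bpf(int cmd, union bpf_attr *attr)
	{
		return syscall(__NR_bpf, cmd, attr, sizeof(*attr));
	}

	int store_perf_event_fd(int perf_fd)
	{
		union bpf_attr attr;
		__u32 key = 0, value = perf_fd;
		int map_fd, err;

		memset(&attr, 0, sizeof(attr));
		attr.map_type = BPF_MAP_TYPE_PERF_EVENT_ARRAY;
		attr.key_size = sizeof(__u32);
		attr.value_size = sizeof(__u32);	/* fd slots only */
		attr.max_entries = 4;			/* illustrative size */
		map_fd = sys_bpf(BPF_MAP_CREATE, &attr);
		if (map_fd < 0)
			return map_fd;

		memset(&attr, 0, sizeof(attr));
		attr.map_fd = map_fd;
		attr.key = (__u64)(unsigned long)&key;
		attr.value = (__u64)(unsigned long)&value;
		attr.flags = BPF_ANY;
		err = sys_bpf(BPF_MAP_UPDATE_ELEM, &attr);
		if (err) {
			close(map_fd);
			return err;
		}
		return map_fd;
	}

On the program side such a map is then indexed by bpf_perf_event_read(), which is the pairing the new func_limit[] table in the verifier.c hunks below enforces.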
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index c5bedc82bc1c..67c380cfa9ca 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -177,6 +177,7 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | |||
177 | { | 177 | { |
178 | return 0; | 178 | return 0; |
179 | } | 179 | } |
180 | EXPORT_SYMBOL_GPL(__bpf_call_base); | ||
180 | 181 | ||
181 | /** | 182 | /** |
182 | * __bpf_prog_run - run eBPF program on a given context | 183 | * __bpf_prog_run - run eBPF program on a given context |
@@ -449,11 +450,15 @@ select_insn: | |||
449 | 450 | ||
450 | tail_call_cnt++; | 451 | tail_call_cnt++; |
451 | 452 | ||
452 | prog = READ_ONCE(array->prog[index]); | 453 | prog = READ_ONCE(array->ptrs[index]); |
453 | if (unlikely(!prog)) | 454 | if (unlikely(!prog)) |
454 | goto out; | 455 | goto out; |
455 | 456 | ||
456 | ARG1 = BPF_R1; | 457 | /* ARG1 at this point is guaranteed to point to CTX from |
458 | * the verifier side due to the fact that the tail call is | ||
459 | * handeled like a helper, that is, bpf_tail_call_proto, | ||
460 | * where arg1_type is ARG_PTR_TO_CTX. | ||
461 | */ | ||
457 | insn = prog->insnsi; | 462 | insn = prog->insnsi; |
458 | goto select_insn; | 463 | goto select_insn; |
459 | out: | 464 | out: |
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index a1b14d197a4f..35bac8e8b071 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -72,7 +72,7 @@ static int bpf_map_release(struct inode *inode, struct file *filp) | |||
72 | /* prog_array stores refcnt-ed bpf_prog pointers | 72 | /* prog_array stores refcnt-ed bpf_prog pointers |
73 | * release them all when user space closes prog_array_fd | 73 | * release them all when user space closes prog_array_fd |
74 | */ | 74 | */ |
75 | bpf_prog_array_map_clear(map); | 75 | bpf_fd_array_map_clear(map); |
76 | 76 | ||
77 | bpf_map_put(map); | 77 | bpf_map_put(map); |
78 | return 0; | 78 | return 0; |
@@ -155,14 +155,15 @@ static int map_lookup_elem(union bpf_attr *attr) | |||
155 | void __user *ukey = u64_to_ptr(attr->key); | 155 | void __user *ukey = u64_to_ptr(attr->key); |
156 | void __user *uvalue = u64_to_ptr(attr->value); | 156 | void __user *uvalue = u64_to_ptr(attr->value); |
157 | int ufd = attr->map_fd; | 157 | int ufd = attr->map_fd; |
158 | struct fd f = fdget(ufd); | ||
159 | struct bpf_map *map; | 158 | struct bpf_map *map; |
160 | void *key, *value, *ptr; | 159 | void *key, *value, *ptr; |
160 | struct fd f; | ||
161 | int err; | 161 | int err; |
162 | 162 | ||
163 | if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM)) | 163 | if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM)) |
164 | return -EINVAL; | 164 | return -EINVAL; |
165 | 165 | ||
166 | f = fdget(ufd); | ||
166 | map = bpf_map_get(f); | 167 | map = bpf_map_get(f); |
167 | if (IS_ERR(map)) | 168 | if (IS_ERR(map)) |
168 | return PTR_ERR(map); | 169 | return PTR_ERR(map); |
@@ -213,14 +214,15 @@ static int map_update_elem(union bpf_attr *attr) | |||
213 | void __user *ukey = u64_to_ptr(attr->key); | 214 | void __user *ukey = u64_to_ptr(attr->key); |
214 | void __user *uvalue = u64_to_ptr(attr->value); | 215 | void __user *uvalue = u64_to_ptr(attr->value); |
215 | int ufd = attr->map_fd; | 216 | int ufd = attr->map_fd; |
216 | struct fd f = fdget(ufd); | ||
217 | struct bpf_map *map; | 217 | struct bpf_map *map; |
218 | void *key, *value; | 218 | void *key, *value; |
219 | struct fd f; | ||
219 | int err; | 220 | int err; |
220 | 221 | ||
221 | if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM)) | 222 | if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM)) |
222 | return -EINVAL; | 223 | return -EINVAL; |
223 | 224 | ||
225 | f = fdget(ufd); | ||
224 | map = bpf_map_get(f); | 226 | map = bpf_map_get(f); |
225 | if (IS_ERR(map)) | 227 | if (IS_ERR(map)) |
226 | return PTR_ERR(map); | 228 | return PTR_ERR(map); |
@@ -265,14 +267,15 @@ static int map_delete_elem(union bpf_attr *attr) | |||
265 | { | 267 | { |
266 | void __user *ukey = u64_to_ptr(attr->key); | 268 | void __user *ukey = u64_to_ptr(attr->key); |
267 | int ufd = attr->map_fd; | 269 | int ufd = attr->map_fd; |
268 | struct fd f = fdget(ufd); | ||
269 | struct bpf_map *map; | 270 | struct bpf_map *map; |
271 | struct fd f; | ||
270 | void *key; | 272 | void *key; |
271 | int err; | 273 | int err; |
272 | 274 | ||
273 | if (CHECK_ATTR(BPF_MAP_DELETE_ELEM)) | 275 | if (CHECK_ATTR(BPF_MAP_DELETE_ELEM)) |
274 | return -EINVAL; | 276 | return -EINVAL; |
275 | 277 | ||
278 | f = fdget(ufd); | ||
276 | map = bpf_map_get(f); | 279 | map = bpf_map_get(f); |
277 | if (IS_ERR(map)) | 280 | if (IS_ERR(map)) |
278 | return PTR_ERR(map); | 281 | return PTR_ERR(map); |
@@ -305,14 +308,15 @@ static int map_get_next_key(union bpf_attr *attr) | |||
305 | void __user *ukey = u64_to_ptr(attr->key); | 308 | void __user *ukey = u64_to_ptr(attr->key); |
306 | void __user *unext_key = u64_to_ptr(attr->next_key); | 309 | void __user *unext_key = u64_to_ptr(attr->next_key); |
307 | int ufd = attr->map_fd; | 310 | int ufd = attr->map_fd; |
308 | struct fd f = fdget(ufd); | ||
309 | struct bpf_map *map; | 311 | struct bpf_map *map; |
310 | void *key, *next_key; | 312 | void *key, *next_key; |
313 | struct fd f; | ||
311 | int err; | 314 | int err; |
312 | 315 | ||
313 | if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY)) | 316 | if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY)) |
314 | return -EINVAL; | 317 | return -EINVAL; |
315 | 318 | ||
319 | f = fdget(ufd); | ||
316 | map = bpf_map_get(f); | 320 | map = bpf_map_get(f); |
317 | if (IS_ERR(map)) | 321 | if (IS_ERR(map)) |
318 | return PTR_ERR(map); | 322 | return PTR_ERR(map); |
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 039d866fd36a..b074b23000d6 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -238,6 +238,14 @@ static const char * const reg_type_str[] = { | |||
238 | [CONST_IMM] = "imm", | 238 | [CONST_IMM] = "imm", |
239 | }; | 239 | }; |
240 | 240 | ||
241 | static const struct { | ||
242 | int map_type; | ||
243 | int func_id; | ||
244 | } func_limit[] = { | ||
245 | {BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call}, | ||
246 | {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read}, | ||
247 | }; | ||
248 | |||
241 | static void print_verifier_state(struct verifier_env *env) | 249 | static void print_verifier_state(struct verifier_env *env) |
242 | { | 250 | { |
243 | enum bpf_reg_type t; | 251 | enum bpf_reg_type t; |
@@ -275,7 +283,7 @@ static const char *const bpf_class_string[] = { | |||
275 | [BPF_ALU64] = "alu64", | 283 | [BPF_ALU64] = "alu64", |
276 | }; | 284 | }; |
277 | 285 | ||
278 | static const char *const bpf_alu_string[] = { | 286 | static const char *const bpf_alu_string[16] = { |
279 | [BPF_ADD >> 4] = "+=", | 287 | [BPF_ADD >> 4] = "+=", |
280 | [BPF_SUB >> 4] = "-=", | 288 | [BPF_SUB >> 4] = "-=", |
281 | [BPF_MUL >> 4] = "*=", | 289 | [BPF_MUL >> 4] = "*=", |
@@ -299,7 +307,7 @@ static const char *const bpf_ldst_string[] = { | |||
299 | [BPF_DW >> 3] = "u64", | 307 | [BPF_DW >> 3] = "u64", |
300 | }; | 308 | }; |
301 | 309 | ||
302 | static const char *const bpf_jmp_string[] = { | 310 | static const char *const bpf_jmp_string[16] = { |
303 | [BPF_JA >> 4] = "jmp", | 311 | [BPF_JA >> 4] = "jmp", |
304 | [BPF_JEQ >> 4] = "==", | 312 | [BPF_JEQ >> 4] = "==", |
305 | [BPF_JGT >> 4] = ">", | 313 | [BPF_JGT >> 4] = ">", |
@@ -648,6 +656,9 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, | |||
648 | struct verifier_state *state = &env->cur_state; | 656 | struct verifier_state *state = &env->cur_state; |
649 | int size, err = 0; | 657 | int size, err = 0; |
650 | 658 | ||
659 | if (state->regs[regno].type == PTR_TO_STACK) | ||
660 | off += state->regs[regno].imm; | ||
661 | |||
651 | size = bpf_size_to_bytes(bpf_size); | 662 | size = bpf_size_to_bytes(bpf_size); |
652 | if (size < 0) | 663 | if (size < 0) |
653 | return size; | 664 | return size; |
@@ -667,7 +678,8 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, | |||
667 | if (!err && t == BPF_READ && value_regno >= 0) | 678 | if (!err && t == BPF_READ && value_regno >= 0) |
668 | mark_reg_unknown_value(state->regs, value_regno); | 679 | mark_reg_unknown_value(state->regs, value_regno); |
669 | 680 | ||
670 | } else if (state->regs[regno].type == FRAME_PTR) { | 681 | } else if (state->regs[regno].type == FRAME_PTR || |
682 | state->regs[regno].type == PTR_TO_STACK) { | ||
671 | if (off >= 0 || off < -MAX_BPF_STACK) { | 683 | if (off >= 0 || off < -MAX_BPF_STACK) { |
672 | verbose("invalid stack off=%d size=%d\n", off, size); | 684 | verbose("invalid stack off=%d size=%d\n", off, size); |
673 | return -EACCES; | 685 | return -EACCES; |
@@ -833,6 +845,28 @@ static int check_func_arg(struct verifier_env *env, u32 regno, | |||
833 | return err; | 845 | return err; |
834 | } | 846 | } |
835 | 847 | ||
848 | static int check_map_func_compatibility(struct bpf_map *map, int func_id) | ||
849 | { | ||
850 | bool bool_map, bool_func; | ||
851 | int i; | ||
852 | |||
853 | if (!map) | ||
854 | return 0; | ||
855 | |||
856 | for (i = 0; i < ARRAY_SIZE(func_limit); i++) { | ||
857 | bool_map = (map->map_type == func_limit[i].map_type); | ||
858 | bool_func = (func_id == func_limit[i].func_id); | ||
859 | /* only when map & func pair match can it continue. | ||
860 | * don't allow any other map type to be passed into | ||
861 | * the special func; | ||
862 | */ | ||
863 | if (bool_map != bool_func) | ||
864 | return -EINVAL; | ||
865 | } | ||
866 | |||
867 | return 0; | ||
868 | } | ||
869 | |||
836 | static int check_call(struct verifier_env *env, int func_id) | 870 | static int check_call(struct verifier_env *env, int func_id) |
837 | { | 871 | { |
838 | struct verifier_state *state = &env->cur_state; | 872 | struct verifier_state *state = &env->cur_state; |
@@ -908,21 +942,9 @@ static int check_call(struct verifier_env *env, int func_id) | |||
908 | return -EINVAL; | 942 | return -EINVAL; |
909 | } | 943 | } |
910 | 944 | ||
911 | if (map && map->map_type == BPF_MAP_TYPE_PROG_ARRAY && | 945 | err = check_map_func_compatibility(map, func_id); |
912 | func_id != BPF_FUNC_tail_call) | 946 | if (err) |
913 | /* prog_array map type needs extra care: | 947 | return err; |
914 | * only allow to pass it into bpf_tail_call() for now. | ||
915 | * bpf_map_delete_elem() can be allowed in the future, | ||
916 | * while bpf_map_update_elem() must only be done via syscall | ||
917 | */ | ||
918 | return -EINVAL; | ||
919 | |||
920 | if (func_id == BPF_FUNC_tail_call && | ||
921 | map->map_type != BPF_MAP_TYPE_PROG_ARRAY) | ||
922 | /* don't allow any other map type to be passed into | ||
923 | * bpf_tail_call() | ||
924 | */ | ||
925 | return -EINVAL; | ||
926 | 948 | ||
927 | return 0; | 949 | return 0; |
928 | } | 950 | } |
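check_map_func_compatibility() generalises the two hand-written PROG_ARRAY/tail_call checks it replaces into a table walk: for every func_limit[] entry, either both the map type and the helper id match or neither does, so a restricted map can only reach its paired helper and vice versa. A small userspace sketch of the same pairing rule, using made-up enum values rather than the real BPF_MAP_TYPE_*/BPF_FUNC_* constants.

#include <stdbool.h>
#include <stdio.h>

/* Stand-in values; the real enums live in uapi/linux/bpf.h. */
enum { MAP_PROG_ARRAY, MAP_PERF_EVENT_ARRAY, MAP_HASH };
enum { FUNC_tail_call, FUNC_perf_event_read, FUNC_map_lookup };

static const struct { int map_type; int func_id; } func_limit[] = {
    { MAP_PROG_ARRAY,       FUNC_tail_call },
    { MAP_PERF_EVENT_ARRAY, FUNC_perf_event_read },
};

/*
 * Same shape as check_map_func_compatibility(): for each restricted pair,
 * either both sides match or neither does; a mismatch on exactly one side
 * is rejected.
 */
static int compatible(int map_type, int func_id)
{
    for (size_t i = 0; i < sizeof(func_limit) / sizeof(func_limit[0]); i++) {
        bool m = (map_type == func_limit[i].map_type);
        bool f = (func_id == func_limit[i].func_id);

        if (m != f)
            return -1;
    }
    return 0;
}

int main(void)
{
    printf("%d\n", compatible(MAP_PROG_ARRAY, FUNC_tail_call));   /* 0: allowed */
    printf("%d\n", compatible(MAP_HASH,       FUNC_tail_call));   /* -1: wrong map */
    printf("%d\n", compatible(MAP_PROG_ARRAY, FUNC_map_lookup));  /* -1: wrong helper */
    printf("%d\n", compatible(MAP_HASH,       FUNC_map_lookup));  /* 0: unrestricted */
    return 0;
}

Adding another restricted map/helper pair then takes only one more table entry, which is the point of the refactor.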
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f89d9292eee6..2cf0f79f1fc9 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
@@ -107,8 +107,8 @@ static DEFINE_SPINLOCK(release_agent_path_lock); | |||
107 | struct percpu_rw_semaphore cgroup_threadgroup_rwsem; | 107 | struct percpu_rw_semaphore cgroup_threadgroup_rwsem; |
108 | 108 | ||
109 | #define cgroup_assert_mutex_or_rcu_locked() \ | 109 | #define cgroup_assert_mutex_or_rcu_locked() \ |
110 | rcu_lockdep_assert(rcu_read_lock_held() || \ | 110 | RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ |
111 | lockdep_is_held(&cgroup_mutex), \ | 111 | !lockdep_is_held(&cgroup_mutex), \ |
112 | "cgroup_mutex or RCU read lock required"); | 112 | "cgroup_mutex or RCU read lock required"); |
113 | 113 | ||
114 | /* | 114 | /* |
@@ -145,6 +145,7 @@ static const char *cgroup_subsys_name[] = { | |||
145 | * part of that cgroup. | 145 | * part of that cgroup. |
146 | */ | 146 | */ |
147 | struct cgroup_root cgrp_dfl_root; | 147 | struct cgroup_root cgrp_dfl_root; |
148 | EXPORT_SYMBOL_GPL(cgrp_dfl_root); | ||
148 | 149 | ||
149 | /* | 150 | /* |
150 | * The default hierarchy always exists but is hidden until mounted for the | 151 | * The default hierarchy always exists but is hidden until mounted for the |
@@ -186,6 +187,9 @@ static u64 css_serial_nr_next = 1; | |||
186 | static unsigned long have_fork_callback __read_mostly; | 187 | static unsigned long have_fork_callback __read_mostly; |
187 | static unsigned long have_exit_callback __read_mostly; | 188 | static unsigned long have_exit_callback __read_mostly; |
188 | 189 | ||
190 | /* Ditto for the can_fork callback. */ | ||
191 | static unsigned long have_canfork_callback __read_mostly; | ||
192 | |||
189 | static struct cftype cgroup_dfl_base_files[]; | 193 | static struct cftype cgroup_dfl_base_files[]; |
190 | static struct cftype cgroup_legacy_base_files[]; | 194 | static struct cftype cgroup_legacy_base_files[]; |
191 | 195 | ||
@@ -207,7 +211,7 @@ static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, | |||
207 | 211 | ||
208 | idr_preload(gfp_mask); | 212 | idr_preload(gfp_mask); |
209 | spin_lock_bh(&cgroup_idr_lock); | 213 | spin_lock_bh(&cgroup_idr_lock); |
210 | ret = idr_alloc(idr, ptr, start, end, gfp_mask); | 214 | ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_WAIT); |
211 | spin_unlock_bh(&cgroup_idr_lock); | 215 | spin_unlock_bh(&cgroup_idr_lock); |
212 | idr_preload_end(); | 216 | idr_preload_end(); |
213 | return ret; | 217 | return ret; |
@@ -1027,10 +1031,13 @@ static const struct file_operations proc_cgroupstats_operations; | |||
1027 | static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, | 1031 | static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, |
1028 | char *buf) | 1032 | char *buf) |
1029 | { | 1033 | { |
1034 | struct cgroup_subsys *ss = cft->ss; | ||
1035 | |||
1030 | if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) && | 1036 | if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) && |
1031 | !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) | 1037 | !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) |
1032 | snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s", | 1038 | snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s", |
1033 | cft->ss->name, cft->name); | 1039 | cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name, |
1040 | cft->name); | ||
1034 | else | 1041 | else |
1035 | strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX); | 1042 | strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX); |
1036 | return buf; | 1043 | return buf; |
@@ -1332,9 +1339,10 @@ static int cgroup_show_options(struct seq_file *seq, | |||
1332 | struct cgroup_subsys *ss; | 1339 | struct cgroup_subsys *ss; |
1333 | int ssid; | 1340 | int ssid; |
1334 | 1341 | ||
1335 | for_each_subsys(ss, ssid) | 1342 | if (root != &cgrp_dfl_root) |
1336 | if (root->subsys_mask & (1 << ssid)) | 1343 | for_each_subsys(ss, ssid) |
1337 | seq_printf(seq, ",%s", ss->name); | 1344 | if (root->subsys_mask & (1 << ssid)) |
1345 | seq_show_option(seq, ss->legacy_name, NULL); | ||
1338 | if (root->flags & CGRP_ROOT_NOPREFIX) | 1346 | if (root->flags & CGRP_ROOT_NOPREFIX) |
1339 | seq_puts(seq, ",noprefix"); | 1347 | seq_puts(seq, ",noprefix"); |
1340 | if (root->flags & CGRP_ROOT_XATTR) | 1348 | if (root->flags & CGRP_ROOT_XATTR) |
@@ -1342,13 +1350,14 @@ static int cgroup_show_options(struct seq_file *seq, | |||
1342 | 1350 | ||
1343 | spin_lock(&release_agent_path_lock); | 1351 | spin_lock(&release_agent_path_lock); |
1344 | if (strlen(root->release_agent_path)) | 1352 | if (strlen(root->release_agent_path)) |
1345 | seq_printf(seq, ",release_agent=%s", root->release_agent_path); | 1353 | seq_show_option(seq, "release_agent", |
1354 | root->release_agent_path); | ||
1346 | spin_unlock(&release_agent_path_lock); | 1355 | spin_unlock(&release_agent_path_lock); |
1347 | 1356 | ||
1348 | if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags)) | 1357 | if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags)) |
1349 | seq_puts(seq, ",clone_children"); | 1358 | seq_puts(seq, ",clone_children"); |
1350 | if (strlen(root->name)) | 1359 | if (strlen(root->name)) |
1351 | seq_printf(seq, ",name=%s", root->name); | 1360 | seq_show_option(seq, "name", root->name); |
1352 | return 0; | 1361 | return 0; |
1353 | } | 1362 | } |
1354 | 1363 | ||
@@ -1447,7 +1456,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | |||
1447 | } | 1456 | } |
1448 | 1457 | ||
1449 | for_each_subsys(ss, i) { | 1458 | for_each_subsys(ss, i) { |
1450 | if (strcmp(token, ss->name)) | 1459 | if (strcmp(token, ss->legacy_name)) |
1451 | continue; | 1460 | continue; |
1452 | if (ss->disabled) | 1461 | if (ss->disabled) |
1453 | continue; | 1462 | continue; |
@@ -1666,7 +1675,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) | |||
1666 | 1675 | ||
1667 | lockdep_assert_held(&cgroup_mutex); | 1676 | lockdep_assert_held(&cgroup_mutex); |
1668 | 1677 | ||
1669 | ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_NOWAIT); | 1678 | ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_KERNEL); |
1670 | if (ret < 0) | 1679 | if (ret < 0) |
1671 | goto out; | 1680 | goto out; |
1672 | root_cgrp->id = ret; | 1681 | root_cgrp->id = ret; |
@@ -4579,7 +4588,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, | |||
4579 | if (err) | 4588 | if (err) |
4580 | goto err_free_css; | 4589 | goto err_free_css; |
4581 | 4590 | ||
4582 | err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_NOWAIT); | 4591 | err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL); |
4583 | if (err < 0) | 4592 | if (err < 0) |
4584 | goto err_free_percpu_ref; | 4593 | goto err_free_percpu_ref; |
4585 | css->id = err; | 4594 | css->id = err; |
@@ -4656,7 +4665,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, | |||
4656 | * Temporarily set the pointer to NULL, so idr_find() won't return | 4665 | * Temporarily set the pointer to NULL, so idr_find() won't return |
4657 | * a half-baked cgroup. | 4666 | * a half-baked cgroup. |
4658 | */ | 4667 | */ |
4659 | cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_NOWAIT); | 4668 | cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL); |
4660 | if (cgrp->id < 0) { | 4669 | if (cgrp->id < 0) { |
4661 | ret = -ENOMEM; | 4670 | ret = -ENOMEM; |
4662 | goto out_cancel_ref; | 4671 | goto out_cancel_ref; |
@@ -4955,6 +4964,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) | |||
4955 | 4964 | ||
4956 | have_fork_callback |= (bool)ss->fork << ss->id; | 4965 | have_fork_callback |= (bool)ss->fork << ss->id; |
4957 | have_exit_callback |= (bool)ss->exit << ss->id; | 4966 | have_exit_callback |= (bool)ss->exit << ss->id; |
4967 | have_canfork_callback |= (bool)ss->can_fork << ss->id; | ||
4958 | 4968 | ||
4959 | /* At system boot, before all subsystems have been | 4969 | /* At system boot, before all subsystems have been |
4960 | * registered, no tasks have been forked, so we don't | 4970 | * registered, no tasks have been forked, so we don't |
@@ -4993,6 +5003,8 @@ int __init cgroup_init_early(void) | |||
4993 | 5003 | ||
4994 | ss->id = i; | 5004 | ss->id = i; |
4995 | ss->name = cgroup_subsys_name[i]; | 5005 | ss->name = cgroup_subsys_name[i]; |
5006 | if (!ss->legacy_name) | ||
5007 | ss->legacy_name = cgroup_subsys_name[i]; | ||
4996 | 5008 | ||
4997 | if (ss->early_init) | 5009 | if (ss->early_init) |
4998 | cgroup_init_subsys(ss, true); | 5010 | cgroup_init_subsys(ss, true); |
@@ -5136,9 +5148,11 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, | |||
5136 | continue; | 5148 | continue; |
5137 | 5149 | ||
5138 | seq_printf(m, "%d:", root->hierarchy_id); | 5150 | seq_printf(m, "%d:", root->hierarchy_id); |
5139 | for_each_subsys(ss, ssid) | 5151 | if (root != &cgrp_dfl_root) |
5140 | if (root->subsys_mask & (1 << ssid)) | 5152 | for_each_subsys(ss, ssid) |
5141 | seq_printf(m, "%s%s", count++ ? "," : "", ss->name); | 5153 | if (root->subsys_mask & (1 << ssid)) |
5154 | seq_printf(m, "%s%s", count++ ? "," : "", | ||
5155 | ss->legacy_name); | ||
5142 | if (strlen(root->name)) | 5156 | if (strlen(root->name)) |
5143 | seq_printf(m, "%sname=%s", count ? "," : "", | 5157 | seq_printf(m, "%sname=%s", count ? "," : "", |
5144 | root->name); | 5158 | root->name); |
@@ -5178,7 +5192,7 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v) | |||
5178 | 5192 | ||
5179 | for_each_subsys(ss, i) | 5193 | for_each_subsys(ss, i) |
5180 | seq_printf(m, "%s\t%d\t%d\t%d\n", | 5194 | seq_printf(m, "%s\t%d\t%d\t%d\n", |
5181 | ss->name, ss->root->hierarchy_id, | 5195 | ss->legacy_name, ss->root->hierarchy_id, |
5182 | atomic_read(&ss->root->nr_cgrps), !ss->disabled); | 5196 | atomic_read(&ss->root->nr_cgrps), !ss->disabled); |
5183 | 5197 | ||
5184 | mutex_unlock(&cgroup_mutex); | 5198 | mutex_unlock(&cgroup_mutex); |
@@ -5197,6 +5211,19 @@ static const struct file_operations proc_cgroupstats_operations = { | |||
5197 | .release = single_release, | 5211 | .release = single_release, |
5198 | }; | 5212 | }; |
5199 | 5213 | ||
5214 | static void **subsys_canfork_priv_p(void *ss_priv[CGROUP_CANFORK_COUNT], int i) | ||
5215 | { | ||
5216 | if (CGROUP_CANFORK_START <= i && i < CGROUP_CANFORK_END) | ||
5217 | return &ss_priv[i - CGROUP_CANFORK_START]; | ||
5218 | return NULL; | ||
5219 | } | ||
5220 | |||
5221 | static void *subsys_canfork_priv(void *ss_priv[CGROUP_CANFORK_COUNT], int i) | ||
5222 | { | ||
5223 | void **private = subsys_canfork_priv_p(ss_priv, i); | ||
5224 | return private ? *private : NULL; | ||
5225 | } | ||
5226 | |||
5200 | /** | 5227 | /** |
5201 | * cgroup_fork - initialize cgroup related fields during copy_process() | 5228 | * cgroup_fork - initialize cgroup related fields during copy_process() |
5202 | * @child: pointer to task_struct of forking parent process. | 5229 | * @child: pointer to task_struct of forking parent process. |
@@ -5212,6 +5239,57 @@ void cgroup_fork(struct task_struct *child) | |||
5212 | } | 5239 | } |
5213 | 5240 | ||
5214 | /** | 5241 | /** |
5242 | * cgroup_can_fork - called on a new task before the process is exposed | ||
5243 | * @child: the task in question. | ||
5244 | * | ||
5245 | * This calls the subsystem can_fork() callbacks. If the can_fork() callback | ||
5246 | * returns an error, the fork aborts with that error code. This allows for | ||
5247 | * a cgroup subsystem to conditionally allow or deny new forks. | ||
5248 | */ | ||
5249 | int cgroup_can_fork(struct task_struct *child, | ||
5250 | void *ss_priv[CGROUP_CANFORK_COUNT]) | ||
5251 | { | ||
5252 | struct cgroup_subsys *ss; | ||
5253 | int i, j, ret; | ||
5254 | |||
5255 | for_each_subsys_which(ss, i, &have_canfork_callback) { | ||
5256 | ret = ss->can_fork(child, subsys_canfork_priv_p(ss_priv, i)); | ||
5257 | if (ret) | ||
5258 | goto out_revert; | ||
5259 | } | ||
5260 | |||
5261 | return 0; | ||
5262 | |||
5263 | out_revert: | ||
5264 | for_each_subsys(ss, j) { | ||
5265 | if (j >= i) | ||
5266 | break; | ||
5267 | if (ss->cancel_fork) | ||
5268 | ss->cancel_fork(child, subsys_canfork_priv(ss_priv, j)); | ||
5269 | } | ||
5270 | |||
5271 | return ret; | ||
5272 | } | ||
5273 | |||
5274 | /** | ||
5275 | * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork() | ||
5276 | * @child: the task in question | ||
5277 | * | ||
5278 | * This calls the cancel_fork() callbacks if a fork failed *after* | ||
5279 | * cgroup_can_fork() succeeded. | ||
5280 | */ | ||
5281 | void cgroup_cancel_fork(struct task_struct *child, | ||
5282 | void *ss_priv[CGROUP_CANFORK_COUNT]) | ||
5283 | { | ||
5284 | struct cgroup_subsys *ss; | ||
5285 | int i; | ||
5286 | |||
5287 | for_each_subsys(ss, i) | ||
5288 | if (ss->cancel_fork) | ||
5289 | ss->cancel_fork(child, subsys_canfork_priv(ss_priv, i)); | ||
5290 | } | ||
5291 | |||
5292 | /** | ||
5215 | * cgroup_post_fork - called on a new task after adding it to the task list | 5293 | * cgroup_post_fork - called on a new task after adding it to the task list |
5216 | * @child: the task in question | 5294 | * @child: the task in question |
5217 | * | 5295 | * |
@@ -5221,7 +5299,8 @@ void cgroup_fork(struct task_struct *child) | |||
5221 | * cgroup_task_iter_start() - to guarantee that the new task ends up on its | 5299 | * cgroup_task_iter_start() - to guarantee that the new task ends up on its |
5222 | * list. | 5300 | * list. |
5223 | */ | 5301 | */ |
5224 | void cgroup_post_fork(struct task_struct *child) | 5302 | void cgroup_post_fork(struct task_struct *child, |
5303 | void *old_ss_priv[CGROUP_CANFORK_COUNT]) | ||
5225 | { | 5304 | { |
5226 | struct cgroup_subsys *ss; | 5305 | struct cgroup_subsys *ss; |
5227 | int i; | 5306 | int i; |
@@ -5266,7 +5345,7 @@ void cgroup_post_fork(struct task_struct *child) | |||
5266 | * and addition to css_set. | 5345 | * and addition to css_set. |
5267 | */ | 5346 | */ |
5268 | for_each_subsys_which(ss, i, &have_fork_callback) | 5347 | for_each_subsys_which(ss, i, &have_fork_callback) |
5269 | ss->fork(child); | 5348 | ss->fork(child, subsys_canfork_priv(old_ss_priv, i)); |
5270 | } | 5349 | } |
5271 | 5350 | ||
5272 | /** | 5351 | /** |
@@ -5400,12 +5479,14 @@ static int __init cgroup_disable(char *str) | |||
5400 | continue; | 5479 | continue; |
5401 | 5480 | ||
5402 | for_each_subsys(ss, i) { | 5481 | for_each_subsys(ss, i) { |
5403 | if (!strcmp(token, ss->name)) { | 5482 | if (strcmp(token, ss->name) && |
5404 | ss->disabled = 1; | 5483 | strcmp(token, ss->legacy_name)) |
5405 | printk(KERN_INFO "Disabling %s control group" | 5484 | continue; |
5406 | " subsystem\n", ss->name); | 5485 | |
5407 | break; | 5486 | ss->disabled = 1; |
5408 | } | 5487 | printk(KERN_INFO "Disabling %s control group subsystem\n", |
5488 | ss->name); | ||
5489 | break; | ||
5409 | } | 5490 | } |
5410 | } | 5491 | } |
5411 | return 1; | 5492 | return 1; |
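Several hunks in cgroup_show_options() above switch from seq_printf() to seq_show_option() for the controller name, release_agent and name options, so values containing separator characters are escaped instead of being copied verbatim into the mount-option line. A minimal sketch of the idea, assuming a hypothetical show_option() helper; the exact escaping applied by the kernel helper is not reproduced here, only the shape of routing values through one escaping function.

#include <stdio.h>

static void show_option(FILE *out, const char *name, const char *value)
{
    fprintf(out, ",%s", name);
    if (!value)
        return;
    fputc('=', out);
    for (const char *p = value; *p; p++) {
        /* Escape anything that would break the comma/space syntax. */
        if (*p == ',' || *p == ' ' || *p == '\\')
            fprintf(out, "\\%03o", (unsigned char)*p);
        else
            fputc(*p, out);
    }
}

int main(void)
{
    /* A hostile name can no longer smuggle extra options into the line. */
    show_option(stdout, "name", "evil,noprefix");
    show_option(stdout, "release_agent", "/sbin/agent");
    fputc('\n', stdout);
    return 0;
}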
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 92b98cc0ee76..f1b30ad5dc6d 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c | |||
@@ -203,7 +203,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css, | |||
203 | * to do anything as freezer_attach() will put @task into the appropriate | 203 | * to do anything as freezer_attach() will put @task into the appropriate |
204 | * state. | 204 | * state. |
205 | */ | 205 | */ |
206 | static void freezer_fork(struct task_struct *task) | 206 | static void freezer_fork(struct task_struct *task, void *private) |
207 | { | 207 | { |
208 | struct freezer *freezer; | 208 | struct freezer *freezer; |
209 | 209 | ||
diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c new file mode 100644 index 000000000000..806cd7693ac8 --- /dev/null +++ b/kernel/cgroup_pids.c | |||
@@ -0,0 +1,355 @@ | |||
1 | /* | ||
2 | * Process number limiting controller for cgroups. | ||
3 | * | ||
4 | * Used to allow a cgroup hierarchy to stop any new processes from fork()ing | ||
5 | * after a certain limit is reached. | ||
6 | * | ||
7 | * Since it is trivial to hit the task limit without hitting any kmemcg limit | ||
8 | * already in place, PIDs are a fundamental resource. As such, PID exhaustion must be | ||
9 | * preventable in the scope of a cgroup hierarchy by allowing resource limiting | ||
10 | * of the number of tasks in a cgroup. | ||
11 | * | ||
12 | * In order to use the `pids` controller, set the maximum number of tasks in | ||
13 | * pids.max (this is not available in the root cgroup for obvious reasons). The | ||
14 | * number of processes currently in the cgroup is given by pids.current. | ||
15 | * Organisational operations are not blocked by cgroup policies, so it is | ||
16 | * possible to have pids.current > pids.max. However, it is not possible to | ||
17 | * violate a cgroup policy through fork(). fork() will return -EAGAIN if forking | ||
18 | * would cause a cgroup policy to be violated. | ||
19 | * | ||
20 | * To set a cgroup to have no limit, set pids.max to "max". This is the default | ||
21 | * for all new cgroups (N.B. that PID limits are hierarchical, so the most | ||
22 | * stringent limit in the hierarchy is followed). | ||
23 | * | ||
24 | * pids.current tracks all child cgroup hierarchies, so parent/pids.current is | ||
25 | * a superset of parent/child/pids.current. | ||
26 | * | ||
27 | * Copyright (C) 2015 Aleksa Sarai <cyphar@cyphar.com> | ||
28 | * | ||
29 | * This file is subject to the terms and conditions of version 2 of the GNU | ||
30 | * General Public License. See the file COPYING in the main directory of the | ||
31 | * Linux distribution for more details. | ||
32 | */ | ||
33 | |||
34 | #include <linux/kernel.h> | ||
35 | #include <linux/threads.h> | ||
36 | #include <linux/atomic.h> | ||
37 | #include <linux/cgroup.h> | ||
38 | #include <linux/slab.h> | ||
39 | |||
40 | #define PIDS_MAX (PID_MAX_LIMIT + 1ULL) | ||
41 | #define PIDS_MAX_STR "max" | ||
42 | |||
43 | struct pids_cgroup { | ||
44 | struct cgroup_subsys_state css; | ||
45 | |||
46 | /* | ||
47 | * Use 64-bit types so that we can safely represent "max" as | ||
48 | * %PIDS_MAX = (%PID_MAX_LIMIT + 1). | ||
49 | */ | ||
50 | atomic64_t counter; | ||
51 | int64_t limit; | ||
52 | }; | ||
53 | |||
54 | static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css) | ||
55 | { | ||
56 | return container_of(css, struct pids_cgroup, css); | ||
57 | } | ||
58 | |||
59 | static struct pids_cgroup *parent_pids(struct pids_cgroup *pids) | ||
60 | { | ||
61 | return css_pids(pids->css.parent); | ||
62 | } | ||
63 | |||
64 | static struct cgroup_subsys_state * | ||
65 | pids_css_alloc(struct cgroup_subsys_state *parent) | ||
66 | { | ||
67 | struct pids_cgroup *pids; | ||
68 | |||
69 | pids = kzalloc(sizeof(struct pids_cgroup), GFP_KERNEL); | ||
70 | if (!pids) | ||
71 | return ERR_PTR(-ENOMEM); | ||
72 | |||
73 | pids->limit = PIDS_MAX; | ||
74 | atomic64_set(&pids->counter, 0); | ||
75 | return &pids->css; | ||
76 | } | ||
77 | |||
78 | static void pids_css_free(struct cgroup_subsys_state *css) | ||
79 | { | ||
80 | kfree(css_pids(css)); | ||
81 | } | ||
82 | |||
83 | /** | ||
84 | * pids_cancel - uncharge the local pid count | ||
85 | * @pids: the pid cgroup state | ||
86 | * @num: the number of pids to cancel | ||
87 | * | ||
88 | * This function will WARN if the pid count goes under 0, because such a case is | ||
89 | * a bug in the pids controller proper. | ||
90 | */ | ||
91 | static void pids_cancel(struct pids_cgroup *pids, int num) | ||
92 | { | ||
93 | /* | ||
94 | * A negative count (or overflow for that matter) is invalid, | ||
95 | * and indicates a bug in the `pids` controller proper. | ||
96 | */ | ||
97 | WARN_ON_ONCE(atomic64_add_negative(-num, &pids->counter)); | ||
98 | } | ||
99 | |||
100 | /** | ||
101 | * pids_uncharge - hierarchically uncharge the pid count | ||
102 | * @pids: the pid cgroup state | ||
103 | * @num: the number of pids to uncharge | ||
104 | */ | ||
105 | static void pids_uncharge(struct pids_cgroup *pids, int num) | ||
106 | { | ||
107 | struct pids_cgroup *p; | ||
108 | |||
109 | for (p = pids; p; p = parent_pids(p)) | ||
110 | pids_cancel(p, num); | ||
111 | } | ||
112 | |||
113 | /** | ||
114 | * pids_charge - hierarchically charge the pid count | ||
115 | * @pids: the pid cgroup state | ||
116 | * @num: the number of pids to charge | ||
117 | * | ||
118 | * This function does *not* follow the pid limit set. It cannot fail and the new | ||
119 | * pid count may exceed the limit. This is only used for reverting failed | ||
120 | * attaches, where there is no other way out than violating the limit. | ||
121 | */ | ||
122 | static void pids_charge(struct pids_cgroup *pids, int num) | ||
123 | { | ||
124 | struct pids_cgroup *p; | ||
125 | |||
126 | for (p = pids; p; p = parent_pids(p)) | ||
127 | atomic64_add(num, &p->counter); | ||
128 | } | ||
129 | |||
130 | /** | ||
131 | * pids_try_charge - hierarchically try to charge the pid count | ||
132 | * @pids: the pid cgroup state | ||
133 | * @num: the number of pids to charge | ||
134 | * | ||
135 | * This function follows the set limit. It will fail if the charge would cause | ||
136 | * the new value to exceed the hierarchical limit. Returns 0 if the charge | ||
137 | * succeeded, otherwise -EAGAIN. | ||
138 | */ | ||
139 | static int pids_try_charge(struct pids_cgroup *pids, int num) | ||
140 | { | ||
141 | struct pids_cgroup *p, *q; | ||
142 | |||
143 | for (p = pids; p; p = parent_pids(p)) { | ||
144 | int64_t new = atomic64_add_return(num, &p->counter); | ||
145 | |||
146 | /* | ||
147 | * Since new is capped to the maximum number of pid_t, if | ||
148 | * p->limit is %PIDS_MAX then we know that this test will never | ||
149 | * fail. | ||
150 | */ | ||
151 | if (new > p->limit) | ||
152 | goto revert; | ||
153 | } | ||
154 | |||
155 | return 0; | ||
156 | |||
157 | revert: | ||
158 | for (q = pids; q != p; q = parent_pids(q)) | ||
159 | pids_cancel(q, num); | ||
160 | pids_cancel(p, num); | ||
161 | |||
162 | return -EAGAIN; | ||
163 | } | ||
164 | |||
165 | static int pids_can_attach(struct cgroup_subsys_state *css, | ||
166 | struct cgroup_taskset *tset) | ||
167 | { | ||
168 | struct pids_cgroup *pids = css_pids(css); | ||
169 | struct task_struct *task; | ||
170 | |||
171 | cgroup_taskset_for_each(task, tset) { | ||
172 | struct cgroup_subsys_state *old_css; | ||
173 | struct pids_cgroup *old_pids; | ||
174 | |||
175 | /* | ||
176 | * No need to pin @old_css between here and cancel_attach() | ||
177 | * because cgroup core protects it from being freed before | ||
178 | * the migration completes or fails. | ||
179 | */ | ||
180 | old_css = task_css(task, pids_cgrp_id); | ||
181 | old_pids = css_pids(old_css); | ||
182 | |||
183 | pids_charge(pids, 1); | ||
184 | pids_uncharge(old_pids, 1); | ||
185 | } | ||
186 | |||
187 | return 0; | ||
188 | } | ||
189 | |||
190 | static void pids_cancel_attach(struct cgroup_subsys_state *css, | ||
191 | struct cgroup_taskset *tset) | ||
192 | { | ||
193 | struct pids_cgroup *pids = css_pids(css); | ||
194 | struct task_struct *task; | ||
195 | |||
196 | cgroup_taskset_for_each(task, tset) { | ||
197 | struct cgroup_subsys_state *old_css; | ||
198 | struct pids_cgroup *old_pids; | ||
199 | |||
200 | old_css = task_css(task, pids_cgrp_id); | ||
201 | old_pids = css_pids(old_css); | ||
202 | |||
203 | pids_charge(old_pids, 1); | ||
204 | pids_uncharge(pids, 1); | ||
205 | } | ||
206 | } | ||
207 | |||
208 | static int pids_can_fork(struct task_struct *task, void **priv_p) | ||
209 | { | ||
210 | struct cgroup_subsys_state *css; | ||
211 | struct pids_cgroup *pids; | ||
212 | int err; | ||
213 | |||
214 | /* | ||
215 | * Use the "current" task_css for the pids subsystem as the tentative | ||
216 | * css. It is possible we will charge the wrong hierarchy, in which | ||
217 | * case we will forcefully revert/reapply the charge on the right | ||
218 | * hierarchy after it is committed to the task proper. | ||
219 | */ | ||
220 | css = task_get_css(current, pids_cgrp_id); | ||
221 | pids = css_pids(css); | ||
222 | |||
223 | err = pids_try_charge(pids, 1); | ||
224 | if (err) | ||
225 | goto err_css_put; | ||
226 | |||
227 | *priv_p = css; | ||
228 | return 0; | ||
229 | |||
230 | err_css_put: | ||
231 | css_put(css); | ||
232 | return err; | ||
233 | } | ||
234 | |||
235 | static void pids_cancel_fork(struct task_struct *task, void *priv) | ||
236 | { | ||
237 | struct cgroup_subsys_state *css = priv; | ||
238 | struct pids_cgroup *pids = css_pids(css); | ||
239 | |||
240 | pids_uncharge(pids, 1); | ||
241 | css_put(css); | ||
242 | } | ||
243 | |||
244 | static void pids_fork(struct task_struct *task, void *priv) | ||
245 | { | ||
246 | struct cgroup_subsys_state *css; | ||
247 | struct cgroup_subsys_state *old_css = priv; | ||
248 | struct pids_cgroup *pids; | ||
249 | struct pids_cgroup *old_pids = css_pids(old_css); | ||
250 | |||
251 | css = task_get_css(task, pids_cgrp_id); | ||
252 | pids = css_pids(css); | ||
253 | |||
254 | /* | ||
255 | * If the association has changed, we have to revert and reapply the | ||
256 | * charge/uncharge on the wrong hierarchy to the current one. Since | ||
257 | * the association can only change due to an organisation event, it's | ||
258 | * okay for us to ignore the limit in this case. | ||
259 | */ | ||
260 | if (pids != old_pids) { | ||
261 | pids_uncharge(old_pids, 1); | ||
262 | pids_charge(pids, 1); | ||
263 | } | ||
264 | |||
265 | css_put(css); | ||
266 | css_put(old_css); | ||
267 | } | ||
268 | |||
269 | static void pids_exit(struct cgroup_subsys_state *css, | ||
270 | struct cgroup_subsys_state *old_css, | ||
271 | struct task_struct *task) | ||
272 | { | ||
273 | struct pids_cgroup *pids = css_pids(old_css); | ||
274 | |||
275 | pids_uncharge(pids, 1); | ||
276 | } | ||
277 | |||
278 | static ssize_t pids_max_write(struct kernfs_open_file *of, char *buf, | ||
279 | size_t nbytes, loff_t off) | ||
280 | { | ||
281 | struct cgroup_subsys_state *css = of_css(of); | ||
282 | struct pids_cgroup *pids = css_pids(css); | ||
283 | int64_t limit; | ||
284 | int err; | ||
285 | |||
286 | buf = strstrip(buf); | ||
287 | if (!strcmp(buf, PIDS_MAX_STR)) { | ||
288 | limit = PIDS_MAX; | ||
289 | goto set_limit; | ||
290 | } | ||
291 | |||
292 | err = kstrtoll(buf, 0, &limit); | ||
293 | if (err) | ||
294 | return err; | ||
295 | |||
296 | if (limit < 0 || limit >= PIDS_MAX) | ||
297 | return -EINVAL; | ||
298 | |||
299 | set_limit: | ||
300 | /* | ||
301 | * Limit updates don't need to be mutex'd, since it isn't | ||
302 | * critical that any racing fork()s follow the new limit. | ||
303 | */ | ||
304 | pids->limit = limit; | ||
305 | return nbytes; | ||
306 | } | ||
307 | |||
308 | static int pids_max_show(struct seq_file *sf, void *v) | ||
309 | { | ||
310 | struct cgroup_subsys_state *css = seq_css(sf); | ||
311 | struct pids_cgroup *pids = css_pids(css); | ||
312 | int64_t limit = pids->limit; | ||
313 | |||
314 | if (limit >= PIDS_MAX) | ||
315 | seq_printf(sf, "%s\n", PIDS_MAX_STR); | ||
316 | else | ||
317 | seq_printf(sf, "%lld\n", limit); | ||
318 | |||
319 | return 0; | ||
320 | } | ||
321 | |||
322 | static s64 pids_current_read(struct cgroup_subsys_state *css, | ||
323 | struct cftype *cft) | ||
324 | { | ||
325 | struct pids_cgroup *pids = css_pids(css); | ||
326 | |||
327 | return atomic64_read(&pids->counter); | ||
328 | } | ||
329 | |||
330 | static struct cftype pids_files[] = { | ||
331 | { | ||
332 | .name = "max", | ||
333 | .write = pids_max_write, | ||
334 | .seq_show = pids_max_show, | ||
335 | .flags = CFTYPE_NOT_ON_ROOT, | ||
336 | }, | ||
337 | { | ||
338 | .name = "current", | ||
339 | .read_s64 = pids_current_read, | ||
340 | }, | ||
341 | { } /* terminate */ | ||
342 | }; | ||
343 | |||
344 | struct cgroup_subsys pids_cgrp_subsys = { | ||
345 | .css_alloc = pids_css_alloc, | ||
346 | .css_free = pids_css_free, | ||
347 | .can_attach = pids_can_attach, | ||
348 | .cancel_attach = pids_cancel_attach, | ||
349 | .can_fork = pids_can_fork, | ||
350 | .cancel_fork = pids_cancel_fork, | ||
351 | .fork = pids_fork, | ||
352 | .exit = pids_exit, | ||
353 | .legacy_cftypes = pids_files, | ||
354 | .dfl_cftypes = pids_files, | ||
355 | }; | ||
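The header comment of the new controller describes a small control-file interface: write a number or "max" to pids.max, read pids.current, and expect fork() in member tasks to fail with -EAGAIN once the hierarchical limit is reached. A hedged userspace sketch of that usage follows; the /sys/fs/cgroup/pids/demo path is an assumption for illustration and depends on where (and whether) the pids controller is mounted on a given system.

#include <stdio.h>

#define CG "/sys/fs/cgroup/pids/demo"   /* hypothetical cgroup directory */

static int write_file(const char *path, const char *val)
{
    FILE *f = fopen(path, "w");

    if (!f)
        return -1;
    fprintf(f, "%s\n", val);
    return fclose(f);
}

int main(void)
{
    char buf[64];
    FILE *f;

    /* Cap the hierarchy rooted at CG to 64 tasks ("max" would lift the cap). */
    if (write_file(CG "/pids.max", "64"))
        perror("pids.max");

    /* pids.current counts tasks in CG and in every descendant cgroup. */
    f = fopen(CG "/pids.current", "r");
    if (f && fgets(buf, sizeof(buf), f))
        printf("current: %s", buf);
    if (f)
        fclose(f);

    /* Once the limit is hit, fork() in member tasks fails with EAGAIN. */
    return 0;
}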
diff --git a/kernel/cpu.c b/kernel/cpu.c index 6a374544d495..82cf9dff4295 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
@@ -191,21 +191,22 @@ void cpu_hotplug_done(void) | |||
191 | void cpu_hotplug_disable(void) | 191 | void cpu_hotplug_disable(void) |
192 | { | 192 | { |
193 | cpu_maps_update_begin(); | 193 | cpu_maps_update_begin(); |
194 | cpu_hotplug_disabled = 1; | 194 | cpu_hotplug_disabled++; |
195 | cpu_maps_update_done(); | 195 | cpu_maps_update_done(); |
196 | } | 196 | } |
197 | EXPORT_SYMBOL_GPL(cpu_hotplug_disable); | ||
197 | 198 | ||
198 | void cpu_hotplug_enable(void) | 199 | void cpu_hotplug_enable(void) |
199 | { | 200 | { |
200 | cpu_maps_update_begin(); | 201 | cpu_maps_update_begin(); |
201 | cpu_hotplug_disabled = 0; | 202 | WARN_ON(--cpu_hotplug_disabled < 0); |
202 | cpu_maps_update_done(); | 203 | cpu_maps_update_done(); |
203 | } | 204 | } |
204 | 205 | EXPORT_SYMBOL_GPL(cpu_hotplug_enable); | |
205 | #endif /* CONFIG_HOTPLUG_CPU */ | 206 | #endif /* CONFIG_HOTPLUG_CPU */ |
206 | 207 | ||
207 | /* Need to know about CPUs going up/down? */ | 208 | /* Need to know about CPUs going up/down? */ |
208 | int __ref register_cpu_notifier(struct notifier_block *nb) | 209 | int register_cpu_notifier(struct notifier_block *nb) |
209 | { | 210 | { |
210 | int ret; | 211 | int ret; |
211 | cpu_maps_update_begin(); | 212 | cpu_maps_update_begin(); |
@@ -214,7 +215,7 @@ int __ref register_cpu_notifier(struct notifier_block *nb) | |||
214 | return ret; | 215 | return ret; |
215 | } | 216 | } |
216 | 217 | ||
217 | int __ref __register_cpu_notifier(struct notifier_block *nb) | 218 | int __register_cpu_notifier(struct notifier_block *nb) |
218 | { | 219 | { |
219 | return raw_notifier_chain_register(&cpu_chain, nb); | 220 | return raw_notifier_chain_register(&cpu_chain, nb); |
220 | } | 221 | } |
@@ -244,7 +245,7 @@ static void cpu_notify_nofail(unsigned long val, void *v) | |||
244 | EXPORT_SYMBOL(register_cpu_notifier); | 245 | EXPORT_SYMBOL(register_cpu_notifier); |
245 | EXPORT_SYMBOL(__register_cpu_notifier); | 246 | EXPORT_SYMBOL(__register_cpu_notifier); |
246 | 247 | ||
247 | void __ref unregister_cpu_notifier(struct notifier_block *nb) | 248 | void unregister_cpu_notifier(struct notifier_block *nb) |
248 | { | 249 | { |
249 | cpu_maps_update_begin(); | 250 | cpu_maps_update_begin(); |
250 | raw_notifier_chain_unregister(&cpu_chain, nb); | 251 | raw_notifier_chain_unregister(&cpu_chain, nb); |
@@ -252,7 +253,7 @@ void __ref unregister_cpu_notifier(struct notifier_block *nb) | |||
252 | } | 253 | } |
253 | EXPORT_SYMBOL(unregister_cpu_notifier); | 254 | EXPORT_SYMBOL(unregister_cpu_notifier); |
254 | 255 | ||
255 | void __ref __unregister_cpu_notifier(struct notifier_block *nb) | 256 | void __unregister_cpu_notifier(struct notifier_block *nb) |
256 | { | 257 | { |
257 | raw_notifier_chain_unregister(&cpu_chain, nb); | 258 | raw_notifier_chain_unregister(&cpu_chain, nb); |
258 | } | 259 | } |
@@ -329,7 +330,7 @@ struct take_cpu_down_param { | |||
329 | }; | 330 | }; |
330 | 331 | ||
331 | /* Take this CPU down. */ | 332 | /* Take this CPU down. */ |
332 | static int __ref take_cpu_down(void *_param) | 333 | static int take_cpu_down(void *_param) |
333 | { | 334 | { |
334 | struct take_cpu_down_param *param = _param; | 335 | struct take_cpu_down_param *param = _param; |
335 | int err; | 336 | int err; |
@@ -348,7 +349,7 @@ static int __ref take_cpu_down(void *_param) | |||
348 | } | 349 | } |
349 | 350 | ||
350 | /* Requires cpu_add_remove_lock to be held */ | 351 | /* Requires cpu_add_remove_lock to be held */ |
351 | static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) | 352 | static int _cpu_down(unsigned int cpu, int tasks_frozen) |
352 | { | 353 | { |
353 | int err, nr_calls = 0; | 354 | int err, nr_calls = 0; |
354 | void *hcpu = (void *)(long)cpu; | 355 | void *hcpu = (void *)(long)cpu; |
@@ -381,14 +382,14 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) | |||
381 | * will observe it. | 382 | * will observe it. |
382 | * | 383 | * |
383 | * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might | 384 | * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might |
384 | * not imply sync_sched(), so explicitly call both. | 385 | * not imply sync_sched(), so wait for both. |
385 | * | 386 | * |
386 | * Do sync before parking smpboot threads to take care of the rcu boost case. | 387 | * Do sync before parking smpboot threads to take care of the rcu boost case. |
387 | */ | 388 | */ |
388 | #ifdef CONFIG_PREEMPT | 389 | if (IS_ENABLED(CONFIG_PREEMPT)) |
389 | synchronize_sched(); | 390 | synchronize_rcu_mult(call_rcu, call_rcu_sched); |
390 | #endif | 391 | else |
391 | synchronize_rcu(); | 392 | synchronize_rcu(); |
392 | 393 | ||
393 | smpboot_park_threads(cpu); | 394 | smpboot_park_threads(cpu); |
394 | 395 | ||
@@ -401,7 +402,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) | |||
401 | /* | 402 | /* |
402 | * So now all preempt/rcu users must observe !cpu_active(). | 403 | * So now all preempt/rcu users must observe !cpu_active(). |
403 | */ | 404 | */ |
404 | err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); | 405 | err = stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); |
405 | if (err) { | 406 | if (err) { |
406 | /* CPU didn't die: tell everyone. Can't complain. */ | 407 | /* CPU didn't die: tell everyone. Can't complain. */ |
407 | cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); | 408 | cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); |
@@ -442,7 +443,7 @@ out_release: | |||
442 | return err; | 443 | return err; |
443 | } | 444 | } |
444 | 445 | ||
445 | int __ref cpu_down(unsigned int cpu) | 446 | int cpu_down(unsigned int cpu) |
446 | { | 447 | { |
447 | int err; | 448 | int err; |
448 | 449 | ||
@@ -527,18 +528,9 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen) | |||
527 | goto out_notify; | 528 | goto out_notify; |
528 | } | 529 | } |
529 | 530 | ||
530 | /* | ||
531 | * Some architectures have to walk the irq descriptors to | ||
532 | * setup the vector space for the cpu which comes online. | ||
533 | * Prevent irq alloc/free across the bringup. | ||
534 | */ | ||
535 | irq_lock_sparse(); | ||
536 | |||
537 | /* Arch-specific enabling code. */ | 531 | /* Arch-specific enabling code. */ |
538 | ret = __cpu_up(cpu, idle); | 532 | ret = __cpu_up(cpu, idle); |
539 | 533 | ||
540 | irq_unlock_sparse(); | ||
541 | |||
542 | if (ret != 0) | 534 | if (ret != 0) |
543 | goto out_notify; | 535 | goto out_notify; |
544 | BUG_ON(!cpu_online(cpu)); | 536 | BUG_ON(!cpu_online(cpu)); |
@@ -617,13 +609,18 @@ int disable_nonboot_cpus(void) | |||
617 | } | 609 | } |
618 | } | 610 | } |
619 | 611 | ||
620 | if (!error) { | 612 | if (!error) |
621 | BUG_ON(num_online_cpus() > 1); | 613 | BUG_ON(num_online_cpus() > 1); |
622 | /* Make sure the CPUs won't be enabled by someone else */ | 614 | else |
623 | cpu_hotplug_disabled = 1; | ||
624 | } else { | ||
625 | pr_err("Non-boot CPUs are not disabled\n"); | 615 | pr_err("Non-boot CPUs are not disabled\n"); |
626 | } | 616 | |
617 | /* | ||
618 | * Make sure the CPUs won't be enabled by someone else. We need to do | ||
619 | * this even in case of failure as all disable_nonboot_cpus() users are | ||
620 | * supposed to do enable_nonboot_cpus() on the failure path. | ||
621 | */ | ||
622 | cpu_hotplug_disabled++; | ||
623 | |||
627 | cpu_maps_update_done(); | 624 | cpu_maps_update_done(); |
628 | return error; | 625 | return error; |
629 | } | 626 | } |
@@ -636,13 +633,13 @@ void __weak arch_enable_nonboot_cpus_end(void) | |||
636 | { | 633 | { |
637 | } | 634 | } |
638 | 635 | ||
639 | void __ref enable_nonboot_cpus(void) | 636 | void enable_nonboot_cpus(void) |
640 | { | 637 | { |
641 | int cpu, error; | 638 | int cpu, error; |
642 | 639 | ||
643 | /* Allow everyone to use the CPU hotplug again */ | 640 | /* Allow everyone to use the CPU hotplug again */ |
644 | cpu_maps_update_begin(); | 641 | cpu_maps_update_begin(); |
645 | cpu_hotplug_disabled = 0; | 642 | WARN_ON(--cpu_hotplug_disabled < 0); |
646 | if (cpumask_empty(frozen_cpus)) | 643 | if (cpumask_empty(frozen_cpus)) |
647 | goto out; | 644 | goto out; |
648 | 645 | ||
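The cpu.c hunks above turn cpu_hotplug_disabled from a 0/1 flag into a nesting count: each disable bumps it, each enable drops it with a WARN_ON() underflow check, so disable_nonboot_cpus() and cpu_hotplug_disable() callers can no longer re-enable hotplug out from under one another. A tiny sketch of that flag-to-counter change; demo_disable()/demo_enable() are illustrative names and a plain int stands in for the locked kernel variable.

#include <assert.h>
#include <stdio.h>

static int hotplug_disabled;    /* was a 0/1 flag before the patch */

static void demo_disable(void)
{
    hotplug_disabled++;
}

static void demo_enable(void)
{
    if (--hotplug_disabled < 0) {   /* mirrors the WARN_ON() above */
        fprintf(stderr, "unbalanced enable\n");
        hotplug_disabled = 0;
    }
}

int main(void)
{
    demo_disable();     /* e.g. suspend path via disable_nonboot_cpus() */
    demo_disable();     /* e.g. a driver calling cpu_hotplug_disable() */
    demo_enable();
    /* With the old flag, hotplug would already be re-enabled here. */
    assert(hotplug_disabled == 1);
    demo_enable();
    assert(hotplug_disabled == 0);
    printf("balanced\n");
    return 0;
}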
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c index 9656a3c36503..009cc9a17d95 100644 --- a/kernel/cpu_pm.c +++ b/kernel/cpu_pm.c | |||
@@ -180,7 +180,7 @@ EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter); | |||
180 | * low power state that may have caused some blocks in the same power domain | 180 | * low power state that may have caused some blocks in the same power domain |
181 | * to reset. | 181 | * to reset. |
182 | * | 182 | * |
183 | * Must be called after cpu_pm_exit has been called on all cpus in the power | 183 | * Must be called after cpu_cluster_pm_enter has been called for the power |
184 | * domain, and before cpu_pm_exit has been called on any cpu in the power | 184 | * domain, and before cpu_pm_exit has been called on any cpu in the power |
185 | * domain. Notified drivers can include VFP co-processor, interrupt controller | 185 | * domain. Notified drivers can include VFP co-processor, interrupt controller |
186 | * and its PM extensions, local CPU timers context save/restore which | 186 | * and its PM extensions, local CPU timers context save/restore which |
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index ee14e3a35a29..f0acff0f66c9 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -1223,7 +1223,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, | |||
1223 | spin_unlock_irq(&callback_lock); | 1223 | spin_unlock_irq(&callback_lock); |
1224 | 1224 | ||
1225 | /* use trialcs->mems_allowed as a temp variable */ | 1225 | /* use trialcs->mems_allowed as a temp variable */ |
1226 | update_nodemasks_hier(cs, &cs->mems_allowed); | 1226 | update_nodemasks_hier(cs, &trialcs->mems_allowed); |
1227 | done: | 1227 | done: |
1228 | return retval; | 1228 | return retval; |
1229 | } | 1229 | } |
diff --git a/kernel/cred.c b/kernel/cred.c index ec1c07667ec1..71179a09c1d6 100644 --- a/kernel/cred.c +++ b/kernel/cred.c | |||
@@ -20,11 +20,16 @@ | |||
20 | #include <linux/cn_proc.h> | 20 | #include <linux/cn_proc.h> |
21 | 21 | ||
22 | #if 0 | 22 | #if 0 |
23 | #define kdebug(FMT, ...) \ | 23 | #define kdebug(FMT, ...) \ |
24 | printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__) | 24 | printk("[%-5.5s%5u] " FMT "\n", \ |
25 | current->comm, current->pid, ##__VA_ARGS__) | ||
25 | #else | 26 | #else |
26 | #define kdebug(FMT, ...) \ | 27 | #define kdebug(FMT, ...) \ |
27 | no_printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__) | 28 | do { \ |
29 | if (0) \ | ||
30 | no_printk("[%-5.5s%5u] " FMT "\n", \ | ||
31 | current->comm, current->pid, ##__VA_ARGS__); \ | ||
32 | } while (0) | ||
28 | #endif | 33 | #endif |
29 | 34 | ||
30 | static struct kmem_cache *cred_jar; | 35 | static struct kmem_cache *cred_jar; |
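The kdebug() cleanup above keeps the disabled variant compiling its format string and arguments (so they stay type-checked and count as used) while the if (0) guarantees nothing is emitted, and the do { } while (0) wrapper makes the macro behave as a single statement after an unbraced if. A minimal sketch of the same construct; kdebug_off() is an illustrative name and, like the kernel macro, it relies on the ##__VA_ARGS__ gcc/clang extension.

#include <stdio.h>

#define kdebug_off(FMT, ...) \
    do { \
        if (0) \
            printf("[debug] " FMT "\n", ##__VA_ARGS__); \
    } while (0)

int main(void)
{
    int refcount = 3;

    if (refcount)
        kdebug_off("refcount=%d", refcount);    /* compiles, prints nothing */
    else
        printf("dropped\n");

    printf("no debug output above\n");
    return 0;
}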
diff --git a/kernel/events/core.c b/kernel/events/core.c index d3dae3419b99..f548f69c4299 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c | |||
@@ -163,6 +163,7 @@ static atomic_t nr_mmap_events __read_mostly; | |||
163 | static atomic_t nr_comm_events __read_mostly; | 163 | static atomic_t nr_comm_events __read_mostly; |
164 | static atomic_t nr_task_events __read_mostly; | 164 | static atomic_t nr_task_events __read_mostly; |
165 | static atomic_t nr_freq_events __read_mostly; | 165 | static atomic_t nr_freq_events __read_mostly; |
166 | static atomic_t nr_switch_events __read_mostly; | ||
166 | 167 | ||
167 | static LIST_HEAD(pmus); | 168 | static LIST_HEAD(pmus); |
168 | static DEFINE_MUTEX(pmus_lock); | 169 | static DEFINE_MUTEX(pmus_lock); |
@@ -1868,8 +1869,6 @@ event_sched_in(struct perf_event *event, | |||
1868 | 1869 | ||
1869 | perf_pmu_disable(event->pmu); | 1870 | perf_pmu_disable(event->pmu); |
1870 | 1871 | ||
1871 | event->tstamp_running += tstamp - event->tstamp_stopped; | ||
1872 | |||
1873 | perf_set_shadow_time(event, ctx, tstamp); | 1872 | perf_set_shadow_time(event, ctx, tstamp); |
1874 | 1873 | ||
1875 | perf_log_itrace_start(event); | 1874 | perf_log_itrace_start(event); |
@@ -1881,6 +1880,8 @@ event_sched_in(struct perf_event *event, | |||
1881 | goto out; | 1880 | goto out; |
1882 | } | 1881 | } |
1883 | 1882 | ||
1883 | event->tstamp_running += tstamp - event->tstamp_stopped; | ||
1884 | |||
1884 | if (!is_software_event(event)) | 1885 | if (!is_software_event(event)) |
1885 | cpuctx->active_oncpu++; | 1886 | cpuctx->active_oncpu++; |
1886 | if (!ctx->nr_active++) | 1887 | if (!ctx->nr_active++) |
@@ -2619,6 +2620,9 @@ static void perf_pmu_sched_task(struct task_struct *prev, | |||
2619 | local_irq_restore(flags); | 2620 | local_irq_restore(flags); |
2620 | } | 2621 | } |
2621 | 2622 | ||
2623 | static void perf_event_switch(struct task_struct *task, | ||
2624 | struct task_struct *next_prev, bool sched_in); | ||
2625 | |||
2622 | #define for_each_task_context_nr(ctxn) \ | 2626 | #define for_each_task_context_nr(ctxn) \ |
2623 | for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++) | 2627 | for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++) |
2624 | 2628 | ||
@@ -2641,6 +2645,9 @@ void __perf_event_task_sched_out(struct task_struct *task, | |||
2641 | if (__this_cpu_read(perf_sched_cb_usages)) | 2645 | if (__this_cpu_read(perf_sched_cb_usages)) |
2642 | perf_pmu_sched_task(task, next, false); | 2646 | perf_pmu_sched_task(task, next, false); |
2643 | 2647 | ||
2648 | if (atomic_read(&nr_switch_events)) | ||
2649 | perf_event_switch(task, next, false); | ||
2650 | |||
2644 | for_each_task_context_nr(ctxn) | 2651 | for_each_task_context_nr(ctxn) |
2645 | perf_event_context_sched_out(task, ctxn, next); | 2652 | perf_event_context_sched_out(task, ctxn, next); |
2646 | 2653 | ||
@@ -2831,6 +2838,9 @@ void __perf_event_task_sched_in(struct task_struct *prev, | |||
2831 | if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) | 2838 | if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) |
2832 | perf_cgroup_sched_in(prev, task); | 2839 | perf_cgroup_sched_in(prev, task); |
2833 | 2840 | ||
2841 | if (atomic_read(&nr_switch_events)) | ||
2842 | perf_event_switch(task, prev, true); | ||
2843 | |||
2834 | if (__this_cpu_read(perf_sched_cb_usages)) | 2844 | if (__this_cpu_read(perf_sched_cb_usages)) |
2835 | perf_pmu_sched_task(prev, task, true); | 2845 | perf_pmu_sched_task(prev, task, true); |
2836 | } | 2846 | } |
@@ -3212,6 +3222,59 @@ static inline u64 perf_event_count(struct perf_event *event) | |||
3212 | return __perf_event_count(event); | 3222 | return __perf_event_count(event); |
3213 | } | 3223 | } |
3214 | 3224 | ||
3225 | /* | ||
3226 | * NMI-safe method to read a local event, that is an event that | ||
3227 | * is: | ||
3228 | * - either for the current task, or for this CPU | ||
3229 | * - does not have inherit set, for inherited task events | ||
3230 | * will not be local and we cannot read them atomically | ||
3231 | * - must not have a pmu::count method | ||
3232 | */ | ||
3233 | u64 perf_event_read_local(struct perf_event *event) | ||
3234 | { | ||
3235 | unsigned long flags; | ||
3236 | u64 val; | ||
3237 | |||
3238 | /* | ||
3239 | * Disabling interrupts avoids all counter scheduling (context | ||
3240 | * switches, timer based rotation and IPIs). | ||
3241 | */ | ||
3242 | local_irq_save(flags); | ||
3243 | |||
3244 | /* If this is a per-task event, it must be for current */ | ||
3245 | WARN_ON_ONCE((event->attach_state & PERF_ATTACH_TASK) && | ||
3246 | event->hw.target != current); | ||
3247 | |||
3248 | /* If this is a per-CPU event, it must be for this CPU */ | ||
3249 | WARN_ON_ONCE(!(event->attach_state & PERF_ATTACH_TASK) && | ||
3250 | event->cpu != smp_processor_id()); | ||
3251 | |||
3252 | /* | ||
3253 | * It must not be an event with inherit set, we cannot read | ||
3254 | * all child counters from atomic context. | ||
3255 | */ | ||
3256 | WARN_ON_ONCE(event->attr.inherit); | ||
3257 | |||
3258 | /* | ||
3259 | * It must not have a pmu::count method, those are not | ||
3260 | * NMI safe. | ||
3261 | */ | ||
3262 | WARN_ON_ONCE(event->pmu->count); | ||
3263 | |||
3264 | /* | ||
3265 | * If the event is currently on this CPU, it's either a per-task event, | ||
3266 | * or local to this CPU. Furthermore it means it's ACTIVE (otherwise | ||
3267 | * oncpu == -1). | ||
3268 | */ | ||
3269 | if (event->oncpu == smp_processor_id()) | ||
3270 | event->pmu->read(event); | ||
3271 | |||
3272 | val = local64_read(&event->count); | ||
3273 | local_irq_restore(flags); | ||
3274 | |||
3275 | return val; | ||
3276 | } | ||
3277 | |||
3215 | static u64 perf_event_read(struct perf_event *event) | 3278 | static u64 perf_event_read(struct perf_event *event) |
3216 | { | 3279 | { |
3217 | /* | 3280 | /* |
@@ -3454,6 +3517,10 @@ static void unaccount_event(struct perf_event *event) | |||
3454 | atomic_dec(&nr_task_events); | 3517 | atomic_dec(&nr_task_events); |
3455 | if (event->attr.freq) | 3518 | if (event->attr.freq) |
3456 | atomic_dec(&nr_freq_events); | 3519 | atomic_dec(&nr_freq_events); |
3520 | if (event->attr.context_switch) { | ||
3521 | static_key_slow_dec_deferred(&perf_sched_events); | ||
3522 | atomic_dec(&nr_switch_events); | ||
3523 | } | ||
3457 | if (is_cgroup_event(event)) | 3524 | if (is_cgroup_event(event)) |
3458 | static_key_slow_dec_deferred(&perf_sched_events); | 3525 | static_key_slow_dec_deferred(&perf_sched_events); |
3459 | if (has_branch_stack(event)) | 3526 | if (has_branch_stack(event)) |
@@ -3958,28 +4025,21 @@ static void perf_event_for_each(struct perf_event *event, | |||
3958 | perf_event_for_each_child(sibling, func); | 4025 | perf_event_for_each_child(sibling, func); |
3959 | } | 4026 | } |
3960 | 4027 | ||
3961 | static int perf_event_period(struct perf_event *event, u64 __user *arg) | 4028 | struct period_event { |
3962 | { | 4029 | struct perf_event *event; |
3963 | struct perf_event_context *ctx = event->ctx; | ||
3964 | int ret = 0, active; | ||
3965 | u64 value; | 4030 | u64 value; |
4031 | }; | ||
3966 | 4032 | ||
3967 | if (!is_sampling_event(event)) | 4033 | static int __perf_event_period(void *info) |
3968 | return -EINVAL; | 4034 | { |
3969 | 4035 | struct period_event *pe = info; | |
3970 | if (copy_from_user(&value, arg, sizeof(value))) | 4036 | struct perf_event *event = pe->event; |
3971 | return -EFAULT; | 4037 | struct perf_event_context *ctx = event->ctx; |
3972 | 4038 | u64 value = pe->value; | |
3973 | if (!value) | 4039 | bool active; |
3974 | return -EINVAL; | ||
3975 | 4040 | ||
3976 | raw_spin_lock_irq(&ctx->lock); | 4041 | raw_spin_lock(&ctx->lock); |
3977 | if (event->attr.freq) { | 4042 | if (event->attr.freq) { |
3978 | if (value > sysctl_perf_event_sample_rate) { | ||
3979 | ret = -EINVAL; | ||
3980 | goto unlock; | ||
3981 | } | ||
3982 | |||
3983 | event->attr.sample_freq = value; | 4043 | event->attr.sample_freq = value; |
3984 | } else { | 4044 | } else { |
3985 | event->attr.sample_period = value; | 4045 | event->attr.sample_period = value; |
@@ -3998,11 +4058,53 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg) | |||
3998 | event->pmu->start(event, PERF_EF_RELOAD); | 4058 | event->pmu->start(event, PERF_EF_RELOAD); |
3999 | perf_pmu_enable(ctx->pmu); | 4059 | perf_pmu_enable(ctx->pmu); |
4000 | } | 4060 | } |
4061 | raw_spin_unlock(&ctx->lock); | ||
4001 | 4062 | ||
4002 | unlock: | 4063 | return 0; |
4064 | } | ||
4065 | |||
4066 | static int perf_event_period(struct perf_event *event, u64 __user *arg) | ||
4067 | { | ||
4068 | struct period_event pe = { .event = event, }; | ||
4069 | struct perf_event_context *ctx = event->ctx; | ||
4070 | struct task_struct *task; | ||
4071 | u64 value; | ||
4072 | |||
4073 | if (!is_sampling_event(event)) | ||
4074 | return -EINVAL; | ||
4075 | |||
4076 | if (copy_from_user(&value, arg, sizeof(value))) | ||
4077 | return -EFAULT; | ||
4078 | |||
4079 | if (!value) | ||
4080 | return -EINVAL; | ||
4081 | |||
4082 | if (event->attr.freq && value > sysctl_perf_event_sample_rate) | ||
4083 | return -EINVAL; | ||
4084 | |||
4085 | task = ctx->task; | ||
4086 | pe.value = value; | ||
4087 | |||
4088 | if (!task) { | ||
4089 | cpu_function_call(event->cpu, __perf_event_period, &pe); | ||
4090 | return 0; | ||
4091 | } | ||
4092 | |||
4093 | retry: | ||
4094 | if (!task_function_call(task, __perf_event_period, &pe)) | ||
4095 | return 0; | ||
4096 | |||
4097 | raw_spin_lock_irq(&ctx->lock); | ||
4098 | if (ctx->is_active) { | ||
4099 | raw_spin_unlock_irq(&ctx->lock); | ||
4100 | task = ctx->task; | ||
4101 | goto retry; | ||
4102 | } | ||
4103 | |||
4104 | __perf_event_period(&pe); | ||
4003 | raw_spin_unlock_irq(&ctx->lock); | 4105 | raw_spin_unlock_irq(&ctx->lock); |
4004 | 4106 | ||
4005 | return ret; | 4107 | return 0; |
4006 | } | 4108 | } |
4007 | 4109 | ||
4008 | static const struct file_operations perf_fops; | 4110 | static const struct file_operations perf_fops; |
@@ -4740,12 +4842,20 @@ static const struct file_operations perf_fops = { | |||
4740 | * to user-space before waking everybody up. | 4842 | * to user-space before waking everybody up. |
4741 | */ | 4843 | */ |
4742 | 4844 | ||
4845 | static inline struct fasync_struct **perf_event_fasync(struct perf_event *event) | ||
4846 | { | ||
4847 | /* only the parent has fasync state */ | ||
4848 | if (event->parent) | ||
4849 | event = event->parent; | ||
4850 | return &event->fasync; | ||
4851 | } | ||
4852 | |||
4743 | void perf_event_wakeup(struct perf_event *event) | 4853 | void perf_event_wakeup(struct perf_event *event) |
4744 | { | 4854 | { |
4745 | ring_buffer_wakeup(event); | 4855 | ring_buffer_wakeup(event); |
4746 | 4856 | ||
4747 | if (event->pending_kill) { | 4857 | if (event->pending_kill) { |
4748 | kill_fasync(&event->fasync, SIGIO, event->pending_kill); | 4858 | kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill); |
4749 | event->pending_kill = 0; | 4859 | event->pending_kill = 0; |
4750 | } | 4860 | } |
4751 | } | 4861 | } |
@@ -5982,6 +6092,91 @@ void perf_log_lost_samples(struct perf_event *event, u64 lost) | |||
5982 | } | 6092 | } |
5983 | 6093 | ||
5984 | /* | 6094 | /* |
6095 | * context_switch tracking | ||
6096 | */ | ||
6097 | |||
6098 | struct perf_switch_event { | ||
6099 | struct task_struct *task; | ||
6100 | struct task_struct *next_prev; | ||
6101 | |||
6102 | struct { | ||
6103 | struct perf_event_header header; | ||
6104 | u32 next_prev_pid; | ||
6105 | u32 next_prev_tid; | ||
6106 | } event_id; | ||
6107 | }; | ||
6108 | |||
6109 | static int perf_event_switch_match(struct perf_event *event) | ||
6110 | { | ||
6111 | return event->attr.context_switch; | ||
6112 | } | ||
6113 | |||
6114 | static void perf_event_switch_output(struct perf_event *event, void *data) | ||
6115 | { | ||
6116 | struct perf_switch_event *se = data; | ||
6117 | struct perf_output_handle handle; | ||
6118 | struct perf_sample_data sample; | ||
6119 | int ret; | ||
6120 | |||
6121 | if (!perf_event_switch_match(event)) | ||
6122 | return; | ||
6123 | |||
6124 | /* Only CPU-wide events are allowed to see next/prev pid/tid */ | ||
6125 | if (event->ctx->task) { | ||
6126 | se->event_id.header.type = PERF_RECORD_SWITCH; | ||
6127 | se->event_id.header.size = sizeof(se->event_id.header); | ||
6128 | } else { | ||
6129 | se->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE; | ||
6130 | se->event_id.header.size = sizeof(se->event_id); | ||
6131 | se->event_id.next_prev_pid = | ||
6132 | perf_event_pid(event, se->next_prev); | ||
6133 | se->event_id.next_prev_tid = | ||
6134 | perf_event_tid(event, se->next_prev); | ||
6135 | } | ||
6136 | |||
6137 | perf_event_header__init_id(&se->event_id.header, &sample, event); | ||
6138 | |||
6139 | ret = perf_output_begin(&handle, event, se->event_id.header.size); | ||
6140 | if (ret) | ||
6141 | return; | ||
6142 | |||
6143 | if (event->ctx->task) | ||
6144 | perf_output_put(&handle, se->event_id.header); | ||
6145 | else | ||
6146 | perf_output_put(&handle, se->event_id); | ||
6147 | |||
6148 | perf_event__output_id_sample(event, &handle, &sample); | ||
6149 | |||
6150 | perf_output_end(&handle); | ||
6151 | } | ||
6152 | |||
6153 | static void perf_event_switch(struct task_struct *task, | ||
6154 | struct task_struct *next_prev, bool sched_in) | ||
6155 | { | ||
6156 | struct perf_switch_event switch_event; | ||
6157 | |||
6158 | /* N.B. caller checks nr_switch_events != 0 */ | ||
6159 | |||
6160 | switch_event = (struct perf_switch_event){ | ||
6161 | .task = task, | ||
6162 | .next_prev = next_prev, | ||
6163 | .event_id = { | ||
6164 | .header = { | ||
6165 | /* .type */ | ||
6166 | .misc = sched_in ? 0 : PERF_RECORD_MISC_SWITCH_OUT, | ||
6167 | /* .size */ | ||
6168 | }, | ||
6169 | /* .next_prev_pid */ | ||
6170 | /* .next_prev_tid */ | ||
6171 | }, | ||
6172 | }; | ||
6173 | |||
6174 | perf_event_aux(perf_event_switch_output, | ||
6175 | &switch_event, | ||
6176 | NULL); | ||
6177 | } | ||
6178 | |||
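A hedged userspace sketch of how the new records are requested; it assumes the uapi perf_event_attr gains a context_switch bit matching the accounting added in account_event() further down, and uses a software dummy event so no samples are generated:

    #include <linux/perf_event.h>
    #include <sys/syscall.h>
    #include <string.h>
    #include <unistd.h>

    /* glibc has no wrapper for perf_event_open() */
    static int perf_event_open(struct perf_event_attr *attr, pid_t pid,
                               int cpu, int group_fd, unsigned long flags)
    {
            return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
    }

    static int open_switch_events(int cpu)
    {
            struct perf_event_attr attr;

            memset(&attr, 0, sizeof(attr));
            attr.size = sizeof(attr);
            attr.type = PERF_TYPE_SOFTWARE;
            attr.config = PERF_COUNT_SW_DUMMY;   /* records only, nothing to count */
            attr.context_switch = 1;             /* assumed new attr bit */
            attr.sample_id_all = 1;

            /* pid == -1 with a CPU makes this CPU-wide, so the kernel emits
             * PERF_RECORD_SWITCH_CPU_WIDE including next/prev pid/tid; a
             * per-task event only gets the bare PERF_RECORD_SWITCH header. */
            return perf_event_open(&attr, -1, cpu, -1, 0);
    }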
6179 | /* | ||
5985 | * IRQ throttle logging | 6180 | * IRQ throttle logging |
5986 | */ | 6181 | */ |
5987 | 6182 | ||
@@ -6040,8 +6235,6 @@ static void perf_log_itrace_start(struct perf_event *event) | |||
6040 | event->hw.itrace_started) | 6235 | event->hw.itrace_started) |
6041 | return; | 6236 | return; |
6042 | 6237 | ||
6043 | event->hw.itrace_started = 1; | ||
6044 | |||
6045 | rec.header.type = PERF_RECORD_ITRACE_START; | 6238 | rec.header.type = PERF_RECORD_ITRACE_START; |
6046 | rec.header.misc = 0; | 6239 | rec.header.misc = 0; |
6047 | rec.header.size = sizeof(rec); | 6240 | rec.header.size = sizeof(rec); |
@@ -6124,7 +6317,7 @@ static int __perf_event_overflow(struct perf_event *event, | |||
6124 | else | 6317 | else |
6125 | perf_event_output(event, data, regs); | 6318 | perf_event_output(event, data, regs); |
6126 | 6319 | ||
6127 | if (event->fasync && event->pending_kill) { | 6320 | if (*perf_event_fasync(event) && event->pending_kill) { |
6128 | event->pending_wakeup = 1; | 6321 | event->pending_wakeup = 1; |
6129 | irq_work_queue(&event->pending); | 6322 | irq_work_queue(&event->pending); |
6130 | } | 6323 | } |
@@ -6749,8 +6942,8 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) | |||
6749 | if (event->tp_event->prog) | 6942 | if (event->tp_event->prog) |
6750 | return -EEXIST; | 6943 | return -EEXIST; |
6751 | 6944 | ||
6752 | if (!(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) | 6945 | if (!(event->tp_event->flags & TRACE_EVENT_FL_UKPROBE)) |
6753 | /* bpf programs can only be attached to kprobes */ | 6946 | /* bpf programs can only be attached to u/kprobes */ |
6754 | return -EINVAL; | 6947 | return -EINVAL; |
6755 | 6948 | ||
6756 | prog = bpf_prog_get(prog_fd); | 6949 | prog = bpf_prog_get(prog_fd); |
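The widened check relies on a combined flag; presumably (an assumption, the definition is not part of this diff) it sits next to the existing bits as:

    /* include/linux/trace_events.h (assumed definition) */
    #define TRACE_EVENT_FL_UKPROBE  (TRACE_EVENT_FL_KPROBE | TRACE_EVENT_FL_UPROBE)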
@@ -7479,6 +7672,10 @@ static void account_event(struct perf_event *event) | |||
7479 | if (atomic_inc_return(&nr_freq_events) == 1) | 7672 | if (atomic_inc_return(&nr_freq_events) == 1) |
7480 | tick_nohz_full_kick_all(); | 7673 | tick_nohz_full_kick_all(); |
7481 | } | 7674 | } |
7675 | if (event->attr.context_switch) { | ||
7676 | atomic_inc(&nr_switch_events); | ||
7677 | static_key_slow_inc(&perf_sched_events.key); | ||
7678 | } | ||
7482 | if (has_branch_stack(event)) | 7679 | if (has_branch_stack(event)) |
7483 | static_key_slow_inc(&perf_sched_events.key); | 7680 | static_key_slow_inc(&perf_sched_events.key); |
7484 | if (is_cgroup_event(event)) | 7681 | if (is_cgroup_event(event)) |
@@ -8574,6 +8771,31 @@ void perf_event_delayed_put(struct task_struct *task) | |||
8574 | WARN_ON_ONCE(task->perf_event_ctxp[ctxn]); | 8771 | WARN_ON_ONCE(task->perf_event_ctxp[ctxn]); |
8575 | } | 8772 | } |
8576 | 8773 | ||
8774 | struct perf_event *perf_event_get(unsigned int fd) | ||
8775 | { | ||
8776 | int err; | ||
8777 | struct fd f; | ||
8778 | struct perf_event *event; | ||
8779 | |||
8780 | err = perf_fget_light(fd, &f); | ||
8781 | if (err) | ||
8782 | return ERR_PTR(err); | ||
8783 | |||
8784 | event = f.file->private_data; | ||
8785 | atomic_long_inc(&event->refcount); | ||
8786 | fdput(f); | ||
8787 | |||
8788 | return event; | ||
8789 | } | ||
8790 | |||
8791 | const struct perf_event_attr *perf_event_attrs(struct perf_event *event) | ||
8792 | { | ||
8793 | if (!event) | ||
8794 | return ERR_PTR(-EINVAL); | ||
8795 | |||
8796 | return &event->attr; | ||
8797 | } | ||
8798 | |||
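A hedged kernel-side sketch of the intended consumer pattern for these two new helpers (the caller keeps the reference taken by perf_event_get(); how and when it is dropped is elided here):

    static int check_perf_fd(unsigned int fd)
    {
            struct perf_event *event;
            const struct perf_event_attr *attr;

            event = perf_event_get(fd);          /* grabs a reference on success */
            if (IS_ERR(event))
                    return PTR_ERR(event);

            attr = perf_event_attrs(event);
            if (IS_ERR(attr))
                    return PTR_ERR(attr);

            /* example policy: only accept hardware cycle counters */
            if (attr->type != PERF_TYPE_HARDWARE ||
                attr->config != PERF_COUNT_HW_CPU_CYCLES)
                    return -EINVAL;

            return 0;
    }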
8577 | /* | 8799 | /* |
8578 | * inherit an event from parent task to child task: | 8800 | * inherit an event from parent task to child task: |
8579 | */ | 8801 | */ |
@@ -8872,7 +9094,7 @@ static void perf_event_init_cpu(int cpu) | |||
8872 | mutex_unlock(&swhash->hlist_mutex); | 9094 | mutex_unlock(&swhash->hlist_mutex); |
8873 | } | 9095 | } |
8874 | 9096 | ||
8875 | #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC | 9097 | #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE |
8876 | static void __perf_event_exit_context(void *__info) | 9098 | static void __perf_event_exit_context(void *__info) |
8877 | { | 9099 | { |
8878 | struct remove_event re = { .detach_group = true }; | 9100 | struct remove_event re = { .detach_group = true }; |
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index b2be01b1aa9d..182bc30899d5 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c | |||
@@ -437,7 +437,10 @@ static struct page *rb_alloc_aux_page(int node, int order) | |||
437 | 437 | ||
438 | if (page && order) { | 438 | if (page && order) { |
439 | /* | 439 | /* |
440 | * Communicate the allocation size to the driver | 440 | * Communicate the allocation size to the driver: |
441 | * if we managed to secure a high-order allocation, | ||
442 | * set its first page's private to this order; | ||
443 | * !PagePrivate(page) means it's just a normal page. | ||
441 | */ | 444 | */ |
442 | split_page(page, order); | 445 | split_page(page, order); |
443 | SetPagePrivate(page); | 446 | SetPagePrivate(page); |
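On the consumer side, an AUX-capable PMU driver can recover the chunk size from the first page of a high-order allocation, roughly as follows (a sketch, not any specific driver's code):

    static unsigned int aux_chunk_order(struct page *page)
    {
            /* the order was stashed in page_private() of the first sub-page;
             * plain order-0 pages are left !PagePrivate */
            return PagePrivate(page) ? page_private(page) : 0;
    }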
@@ -559,11 +562,13 @@ static void __rb_free_aux(struct ring_buffer *rb) | |||
559 | rb->aux_priv = NULL; | 562 | rb->aux_priv = NULL; |
560 | } | 563 | } |
561 | 564 | ||
562 | for (pg = 0; pg < rb->aux_nr_pages; pg++) | 565 | if (rb->aux_nr_pages) { |
563 | rb_free_aux_page(rb, pg); | 566 | for (pg = 0; pg < rb->aux_nr_pages; pg++) |
567 | rb_free_aux_page(rb, pg); | ||
564 | 568 | ||
565 | kfree(rb->aux_pages); | 569 | kfree(rb->aux_pages); |
566 | rb->aux_nr_pages = 0; | 570 | rb->aux_nr_pages = 0; |
571 | } | ||
567 | } | 572 | } |
568 | 573 | ||
569 | void rb_free_aux(struct ring_buffer *rb) | 574 | void rb_free_aux(struct ring_buffer *rb) |
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index cb346f26a22d..4e5e9798aa0c 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c | |||
@@ -86,15 +86,6 @@ struct uprobe { | |||
86 | struct arch_uprobe arch; | 86 | struct arch_uprobe arch; |
87 | }; | 87 | }; |
88 | 88 | ||
89 | struct return_instance { | ||
90 | struct uprobe *uprobe; | ||
91 | unsigned long func; | ||
92 | unsigned long orig_ret_vaddr; /* original return address */ | ||
93 | bool chained; /* true, if instance is nested */ | ||
94 | |||
95 | struct return_instance *next; /* keep as stack */ | ||
96 | }; | ||
97 | |||
98 | /* | 89 | /* |
99 | * Execute out of line area: anonymous executable mapping installed | 90 | * Execute out of line area: anonymous executable mapping installed |
100 | * by the probed task to execute the copy of the original instruction | 91 | * by the probed task to execute the copy of the original instruction |
@@ -105,17 +96,18 @@ struct return_instance { | |||
105 | * allocated. | 96 | * allocated. |
106 | */ | 97 | */ |
107 | struct xol_area { | 98 | struct xol_area { |
108 | wait_queue_head_t wq; /* if all slots are busy */ | 99 | wait_queue_head_t wq; /* if all slots are busy */ |
109 | atomic_t slot_count; /* number of in-use slots */ | 100 | atomic_t slot_count; /* number of in-use slots */ |
110 | unsigned long *bitmap; /* 0 = free slot */ | 101 | unsigned long *bitmap; /* 0 = free slot */ |
111 | struct page *page; | ||
112 | 102 | ||
103 | struct vm_special_mapping xol_mapping; | ||
104 | struct page *pages[2]; | ||
113 | /* | 105 | /* |
114 | * We keep the vma's vm_start rather than a pointer to the vma | 106 | * We keep the vma's vm_start rather than a pointer to the vma |
115 | * itself. The probed process or a naughty kernel module could make | 107 | * itself. The probed process or a naughty kernel module could make |
116 | * the vma go away, and we must handle that reasonably gracefully. | 108 | * the vma go away, and we must handle that reasonably gracefully. |
117 | */ | 109 | */ |
118 | unsigned long vaddr; /* Page(s) of instruction slots */ | 110 | unsigned long vaddr; /* Page(s) of instruction slots */ |
119 | }; | 111 | }; |
120 | 112 | ||
121 | /* | 113 | /* |
@@ -366,6 +358,18 @@ set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long v | |||
366 | return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)&auprobe->insn); | 358 | return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)&auprobe->insn); |
367 | } | 359 | } |
368 | 360 | ||
361 | static struct uprobe *get_uprobe(struct uprobe *uprobe) | ||
362 | { | ||
363 | atomic_inc(&uprobe->ref); | ||
364 | return uprobe; | ||
365 | } | ||
366 | |||
367 | static void put_uprobe(struct uprobe *uprobe) | ||
368 | { | ||
369 | if (atomic_dec_and_test(&uprobe->ref)) | ||
370 | kfree(uprobe); | ||
371 | } | ||
372 | |||
369 | static int match_uprobe(struct uprobe *l, struct uprobe *r) | 373 | static int match_uprobe(struct uprobe *l, struct uprobe *r) |
370 | { | 374 | { |
371 | if (l->inode < r->inode) | 375 | if (l->inode < r->inode) |
@@ -393,10 +397,8 @@ static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset) | |||
393 | while (n) { | 397 | while (n) { |
394 | uprobe = rb_entry(n, struct uprobe, rb_node); | 398 | uprobe = rb_entry(n, struct uprobe, rb_node); |
395 | match = match_uprobe(&u, uprobe); | 399 | match = match_uprobe(&u, uprobe); |
396 | if (!match) { | 400 | if (!match) |
397 | atomic_inc(&uprobe->ref); | 401 | return get_uprobe(uprobe); |
398 | return uprobe; | ||
399 | } | ||
400 | 402 | ||
401 | if (match < 0) | 403 | if (match < 0) |
402 | n = n->rb_left; | 404 | n = n->rb_left; |
@@ -432,10 +434,8 @@ static struct uprobe *__insert_uprobe(struct uprobe *uprobe) | |||
432 | parent = *p; | 434 | parent = *p; |
433 | u = rb_entry(parent, struct uprobe, rb_node); | 435 | u = rb_entry(parent, struct uprobe, rb_node); |
434 | match = match_uprobe(uprobe, u); | 436 | match = match_uprobe(uprobe, u); |
435 | if (!match) { | 437 | if (!match) |
436 | atomic_inc(&u->ref); | 438 | return get_uprobe(u); |
437 | return u; | ||
438 | } | ||
439 | 439 | ||
440 | if (match < 0) | 440 | if (match < 0) |
441 | p = &parent->rb_left; | 441 | p = &parent->rb_left; |
@@ -472,12 +472,6 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe) | |||
472 | return u; | 472 | return u; |
473 | } | 473 | } |
474 | 474 | ||
475 | static void put_uprobe(struct uprobe *uprobe) | ||
476 | { | ||
477 | if (atomic_dec_and_test(&uprobe->ref)) | ||
478 | kfree(uprobe); | ||
479 | } | ||
480 | |||
481 | static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) | 475 | static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) |
482 | { | 476 | { |
483 | struct uprobe *uprobe, *cur_uprobe; | 477 | struct uprobe *uprobe, *cur_uprobe; |
@@ -1039,14 +1033,14 @@ static void build_probe_list(struct inode *inode, | |||
1039 | if (u->inode != inode || u->offset < min) | 1033 | if (u->inode != inode || u->offset < min) |
1040 | break; | 1034 | break; |
1041 | list_add(&u->pending_list, head); | 1035 | list_add(&u->pending_list, head); |
1042 | atomic_inc(&u->ref); | 1036 | get_uprobe(u); |
1043 | } | 1037 | } |
1044 | for (t = n; (t = rb_next(t)); ) { | 1038 | for (t = n; (t = rb_next(t)); ) { |
1045 | u = rb_entry(t, struct uprobe, rb_node); | 1039 | u = rb_entry(t, struct uprobe, rb_node); |
1046 | if (u->inode != inode || u->offset > max) | 1040 | if (u->inode != inode || u->offset > max) |
1047 | break; | 1041 | break; |
1048 | list_add(&u->pending_list, head); | 1042 | list_add(&u->pending_list, head); |
1049 | atomic_inc(&u->ref); | 1043 | get_uprobe(u); |
1050 | } | 1044 | } |
1051 | } | 1045 | } |
1052 | spin_unlock(&uprobes_treelock); | 1046 | spin_unlock(&uprobes_treelock); |
@@ -1132,11 +1126,14 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon | |||
1132 | /* Slot allocation for XOL */ | 1126 | /* Slot allocation for XOL */ |
1133 | static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) | 1127 | static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) |
1134 | { | 1128 | { |
1135 | int ret = -EALREADY; | 1129 | struct vm_area_struct *vma; |
1130 | int ret; | ||
1136 | 1131 | ||
1137 | down_write(&mm->mmap_sem); | 1132 | down_write(&mm->mmap_sem); |
1138 | if (mm->uprobes_state.xol_area) | 1133 | if (mm->uprobes_state.xol_area) { |
1134 | ret = -EALREADY; | ||
1139 | goto fail; | 1135 | goto fail; |
1136 | } | ||
1140 | 1137 | ||
1141 | if (!area->vaddr) { | 1138 | if (!area->vaddr) { |
1142 | /* Try to map as high as possible, this is only a hint. */ | 1139 | /* Try to map as high as possible, this is only a hint. */ |
@@ -1148,11 +1145,15 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) | |||
1148 | } | 1145 | } |
1149 | } | 1146 | } |
1150 | 1147 | ||
1151 | ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE, | 1148 | vma = _install_special_mapping(mm, area->vaddr, PAGE_SIZE, |
1152 | VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO, &area->page); | 1149 | VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO, |
1153 | if (ret) | 1150 | &area->xol_mapping); |
1151 | if (IS_ERR(vma)) { | ||
1152 | ret = PTR_ERR(vma); | ||
1154 | goto fail; | 1153 | goto fail; |
1154 | } | ||
1155 | 1155 | ||
1156 | ret = 0; | ||
1156 | smp_wmb(); /* pairs with get_xol_area() */ | 1157 | smp_wmb(); /* pairs with get_xol_area() */ |
1157 | mm->uprobes_state.xol_area = area; | 1158 | mm->uprobes_state.xol_area = area; |
1158 | fail: | 1159 | fail: |
@@ -1175,21 +1176,24 @@ static struct xol_area *__create_xol_area(unsigned long vaddr) | |||
1175 | if (!area->bitmap) | 1176 | if (!area->bitmap) |
1176 | goto free_area; | 1177 | goto free_area; |
1177 | 1178 | ||
1178 | area->page = alloc_page(GFP_HIGHUSER); | 1179 | area->xol_mapping.name = "[uprobes]"; |
1179 | if (!area->page) | 1180 | area->xol_mapping.pages = area->pages; |
1181 | area->pages[0] = alloc_page(GFP_HIGHUSER); | ||
1182 | if (!area->pages[0]) | ||
1180 | goto free_bitmap; | 1183 | goto free_bitmap; |
1184 | area->pages[1] = NULL; | ||
1181 | 1185 | ||
1182 | area->vaddr = vaddr; | 1186 | area->vaddr = vaddr; |
1183 | init_waitqueue_head(&area->wq); | 1187 | init_waitqueue_head(&area->wq); |
1184 | /* Reserve the 1st slot for get_trampoline_vaddr() */ | 1188 | /* Reserve the 1st slot for get_trampoline_vaddr() */ |
1185 | set_bit(0, area->bitmap); | 1189 | set_bit(0, area->bitmap); |
1186 | atomic_set(&area->slot_count, 1); | 1190 | atomic_set(&area->slot_count, 1); |
1187 | copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE); | 1191 | copy_to_page(area->pages[0], 0, &insn, UPROBE_SWBP_INSN_SIZE); |
1188 | 1192 | ||
1189 | if (!xol_add_vma(mm, area)) | 1193 | if (!xol_add_vma(mm, area)) |
1190 | return area; | 1194 | return area; |
1191 | 1195 | ||
1192 | __free_page(area->page); | 1196 | __free_page(area->pages[0]); |
1193 | free_bitmap: | 1197 | free_bitmap: |
1194 | kfree(area->bitmap); | 1198 | kfree(area->bitmap); |
1195 | free_area: | 1199 | free_area: |
@@ -1227,7 +1231,7 @@ void uprobe_clear_state(struct mm_struct *mm) | |||
1227 | if (!area) | 1231 | if (!area) |
1228 | return; | 1232 | return; |
1229 | 1233 | ||
1230 | put_page(area->page); | 1234 | put_page(area->pages[0]); |
1231 | kfree(area->bitmap); | 1235 | kfree(area->bitmap); |
1232 | kfree(area); | 1236 | kfree(area); |
1233 | } | 1237 | } |
@@ -1296,7 +1300,7 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe) | |||
1296 | if (unlikely(!xol_vaddr)) | 1300 | if (unlikely(!xol_vaddr)) |
1297 | return 0; | 1301 | return 0; |
1298 | 1302 | ||
1299 | arch_uprobe_copy_ixol(area->page, xol_vaddr, | 1303 | arch_uprobe_copy_ixol(area->pages[0], xol_vaddr, |
1300 | &uprobe->arch.ixol, sizeof(uprobe->arch.ixol)); | 1304 | &uprobe->arch.ixol, sizeof(uprobe->arch.ixol)); |
1301 | 1305 | ||
1302 | return xol_vaddr; | 1306 | return xol_vaddr; |
@@ -1333,6 +1337,7 @@ static void xol_free_insn_slot(struct task_struct *tsk) | |||
1333 | 1337 | ||
1334 | clear_bit(slot_nr, area->bitmap); | 1338 | clear_bit(slot_nr, area->bitmap); |
1335 | atomic_dec(&area->slot_count); | 1339 | atomic_dec(&area->slot_count); |
1340 | smp_mb__after_atomic(); /* pairs with prepare_to_wait() */ | ||
1336 | if (waitqueue_active(&area->wq)) | 1341 | if (waitqueue_active(&area->wq)) |
1337 | wake_up(&area->wq); | 1342 | wake_up(&area->wq); |
1338 | 1343 | ||
@@ -1376,6 +1381,14 @@ unsigned long uprobe_get_trap_addr(struct pt_regs *regs) | |||
1376 | return instruction_pointer(regs); | 1381 | return instruction_pointer(regs); |
1377 | } | 1382 | } |
1378 | 1383 | ||
1384 | static struct return_instance *free_ret_instance(struct return_instance *ri) | ||
1385 | { | ||
1386 | struct return_instance *next = ri->next; | ||
1387 | put_uprobe(ri->uprobe); | ||
1388 | kfree(ri); | ||
1389 | return next; | ||
1390 | } | ||
1391 | |||
1379 | /* | 1392 | /* |
1380 | * Called with no locks held. | 1393 | * Called with no locks held. |
1381 | * Called in the context of an exiting or an exec-ing thread. | 1394 | * Called in the context of an exiting or an exec-ing thread. |
@@ -1383,7 +1396,7 @@ unsigned long uprobe_get_trap_addr(struct pt_regs *regs) | |||
1383 | void uprobe_free_utask(struct task_struct *t) | 1396 | void uprobe_free_utask(struct task_struct *t) |
1384 | { | 1397 | { |
1385 | struct uprobe_task *utask = t->utask; | 1398 | struct uprobe_task *utask = t->utask; |
1386 | struct return_instance *ri, *tmp; | 1399 | struct return_instance *ri; |
1387 | 1400 | ||
1388 | if (!utask) | 1401 | if (!utask) |
1389 | return; | 1402 | return; |
@@ -1392,13 +1405,8 @@ void uprobe_free_utask(struct task_struct *t) | |||
1392 | put_uprobe(utask->active_uprobe); | 1405 | put_uprobe(utask->active_uprobe); |
1393 | 1406 | ||
1394 | ri = utask->return_instances; | 1407 | ri = utask->return_instances; |
1395 | while (ri) { | 1408 | while (ri) |
1396 | tmp = ri; | 1409 | ri = free_ret_instance(ri); |
1397 | ri = ri->next; | ||
1398 | |||
1399 | put_uprobe(tmp->uprobe); | ||
1400 | kfree(tmp); | ||
1401 | } | ||
1402 | 1410 | ||
1403 | xol_free_insn_slot(t); | 1411 | xol_free_insn_slot(t); |
1404 | kfree(utask); | 1412 | kfree(utask); |
@@ -1437,7 +1445,7 @@ static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask) | |||
1437 | return -ENOMEM; | 1445 | return -ENOMEM; |
1438 | 1446 | ||
1439 | *n = *o; | 1447 | *n = *o; |
1440 | atomic_inc(&n->uprobe->ref); | 1448 | get_uprobe(n->uprobe); |
1441 | n->next = NULL; | 1449 | n->next = NULL; |
1442 | 1450 | ||
1443 | *p = n; | 1451 | *p = n; |
@@ -1515,12 +1523,25 @@ static unsigned long get_trampoline_vaddr(void) | |||
1515 | return trampoline_vaddr; | 1523 | return trampoline_vaddr; |
1516 | } | 1524 | } |
1517 | 1525 | ||
1526 | static void cleanup_return_instances(struct uprobe_task *utask, bool chained, | ||
1527 | struct pt_regs *regs) | ||
1528 | { | ||
1529 | struct return_instance *ri = utask->return_instances; | ||
1530 | enum rp_check ctx = chained ? RP_CHECK_CHAIN_CALL : RP_CHECK_CALL; | ||
1531 | |||
1532 | while (ri && !arch_uretprobe_is_alive(ri, ctx, regs)) { | ||
1533 | ri = free_ret_instance(ri); | ||
1534 | utask->depth--; | ||
1535 | } | ||
1536 | utask->return_instances = ri; | ||
1537 | } | ||
1538 | |||
1518 | static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) | 1539 | static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) |
1519 | { | 1540 | { |
1520 | struct return_instance *ri; | 1541 | struct return_instance *ri; |
1521 | struct uprobe_task *utask; | 1542 | struct uprobe_task *utask; |
1522 | unsigned long orig_ret_vaddr, trampoline_vaddr; | 1543 | unsigned long orig_ret_vaddr, trampoline_vaddr; |
1523 | bool chained = false; | 1544 | bool chained; |
1524 | 1545 | ||
1525 | if (!get_xol_area()) | 1546 | if (!get_xol_area()) |
1526 | return; | 1547 | return; |
@@ -1536,49 +1557,47 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) | |||
1536 | return; | 1557 | return; |
1537 | } | 1558 | } |
1538 | 1559 | ||
1539 | ri = kzalloc(sizeof(struct return_instance), GFP_KERNEL); | 1560 | ri = kmalloc(sizeof(struct return_instance), GFP_KERNEL); |
1540 | if (!ri) | 1561 | if (!ri) |
1541 | goto fail; | 1562 | return; |
1542 | 1563 | ||
1543 | trampoline_vaddr = get_trampoline_vaddr(); | 1564 | trampoline_vaddr = get_trampoline_vaddr(); |
1544 | orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs); | 1565 | orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs); |
1545 | if (orig_ret_vaddr == -1) | 1566 | if (orig_ret_vaddr == -1) |
1546 | goto fail; | 1567 | goto fail; |
1547 | 1568 | ||
1569 | /* drop the entries invalidated by longjmp() */ | ||
1570 | chained = (orig_ret_vaddr == trampoline_vaddr); | ||
1571 | cleanup_return_instances(utask, chained, regs); | ||
1572 | |||
1548 | /* | 1573 | /* |
1549 | * We don't want to keep trampoline address in stack, rather keep the | 1574 | * We don't want to keep trampoline address in stack, rather keep the |
1550 | * original return address of first caller thru all the consequent | 1575 | * original return address of first caller thru all the consequent |
1551 | * instances. This also makes breakpoint unwrapping easier. | 1576 | * instances. This also makes breakpoint unwrapping easier. |
1552 | */ | 1577 | */ |
1553 | if (orig_ret_vaddr == trampoline_vaddr) { | 1578 | if (chained) { |
1554 | if (!utask->return_instances) { | 1579 | if (!utask->return_instances) { |
1555 | /* | 1580 | /* |
1556 | * This situation is not possible. Likely we have an | 1581 | * This situation is not possible. Likely we have an |
1557 | * attack from user-space. | 1582 | * attack from user-space. |
1558 | */ | 1583 | */ |
1559 | pr_warn("uprobe: unable to set uretprobe pid/tgid=%d/%d\n", | 1584 | uprobe_warn(current, "handle tail call"); |
1560 | current->pid, current->tgid); | ||
1561 | goto fail; | 1585 | goto fail; |
1562 | } | 1586 | } |
1563 | |||
1564 | chained = true; | ||
1565 | orig_ret_vaddr = utask->return_instances->orig_ret_vaddr; | 1587 | orig_ret_vaddr = utask->return_instances->orig_ret_vaddr; |
1566 | } | 1588 | } |
1567 | 1589 | ||
1568 | atomic_inc(&uprobe->ref); | 1590 | ri->uprobe = get_uprobe(uprobe); |
1569 | ri->uprobe = uprobe; | ||
1570 | ri->func = instruction_pointer(regs); | 1591 | ri->func = instruction_pointer(regs); |
1592 | ri->stack = user_stack_pointer(regs); | ||
1571 | ri->orig_ret_vaddr = orig_ret_vaddr; | 1593 | ri->orig_ret_vaddr = orig_ret_vaddr; |
1572 | ri->chained = chained; | 1594 | ri->chained = chained; |
1573 | 1595 | ||
1574 | utask->depth++; | 1596 | utask->depth++; |
1575 | |||
1576 | /* add instance to the stack */ | ||
1577 | ri->next = utask->return_instances; | 1597 | ri->next = utask->return_instances; |
1578 | utask->return_instances = ri; | 1598 | utask->return_instances = ri; |
1579 | 1599 | ||
1580 | return; | 1600 | return; |
1581 | |||
1582 | fail: | 1601 | fail: |
1583 | kfree(ri); | 1602 | kfree(ri); |
1584 | } | 1603 | } |
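The ctx argument threaded through cleanup_return_instances() and arch_uretprobe_is_alive() is a small enum; assuming the usual header placement, it looks like:

    /* include/linux/uprobes.h (assumed): where the liveness check happens */
    enum rp_check {
            RP_CHECK_CALL,          /* at function entry (uretprobe being armed) */
            RP_CHECK_CHAIN_CALL,    /* entry of a chained (tail) call */
            RP_CHECK_RET,           /* at the return trampoline */
    };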
@@ -1766,46 +1785,58 @@ handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs) | |||
1766 | up_read(&uprobe->register_rwsem); | 1785 | up_read(&uprobe->register_rwsem); |
1767 | } | 1786 | } |
1768 | 1787 | ||
1769 | static bool handle_trampoline(struct pt_regs *regs) | 1788 | static struct return_instance *find_next_ret_chain(struct return_instance *ri) |
1770 | { | 1789 | { |
1771 | struct uprobe_task *utask; | ||
1772 | struct return_instance *ri, *tmp; | ||
1773 | bool chained; | 1790 | bool chained; |
1774 | 1791 | ||
1792 | do { | ||
1793 | chained = ri->chained; | ||
1794 | ri = ri->next; /* can't be NULL if chained */ | ||
1795 | } while (chained); | ||
1796 | |||
1797 | return ri; | ||
1798 | } | ||
1799 | |||
1800 | static void handle_trampoline(struct pt_regs *regs) | ||
1801 | { | ||
1802 | struct uprobe_task *utask; | ||
1803 | struct return_instance *ri, *next; | ||
1804 | bool valid; | ||
1805 | |||
1775 | utask = current->utask; | 1806 | utask = current->utask; |
1776 | if (!utask) | 1807 | if (!utask) |
1777 | return false; | 1808 | goto sigill; |
1778 | 1809 | ||
1779 | ri = utask->return_instances; | 1810 | ri = utask->return_instances; |
1780 | if (!ri) | 1811 | if (!ri) |
1781 | return false; | 1812 | goto sigill; |
1782 | |||
1783 | /* | ||
1784 | * TODO: we should throw out return_instance's invalidated by | ||
1785 | * longjmp(), currently we assume that the probed function always | ||
1786 | * returns. | ||
1787 | */ | ||
1788 | instruction_pointer_set(regs, ri->orig_ret_vaddr); | ||
1789 | |||
1790 | for (;;) { | ||
1791 | handle_uretprobe_chain(ri, regs); | ||
1792 | |||
1793 | chained = ri->chained; | ||
1794 | put_uprobe(ri->uprobe); | ||
1795 | |||
1796 | tmp = ri; | ||
1797 | ri = ri->next; | ||
1798 | kfree(tmp); | ||
1799 | utask->depth--; | ||
1800 | 1813 | ||
1801 | if (!chained) | 1814 | do { |
1802 | break; | 1815 | /* |
1803 | BUG_ON(!ri); | 1816 | * We should throw out the frames invalidated by longjmp(). |
1804 | } | 1817 | * If this chain is valid, then the next one should be alive |
1818 | * or NULL; the latter case means that nobody but ri->func | ||
1819 | * could hit this trampoline on return. TODO: sigaltstack(). | ||
1820 | */ | ||
1821 | next = find_next_ret_chain(ri); | ||
1822 | valid = !next || arch_uretprobe_is_alive(next, RP_CHECK_RET, regs); | ||
1823 | |||
1824 | instruction_pointer_set(regs, ri->orig_ret_vaddr); | ||
1825 | do { | ||
1826 | if (valid) | ||
1827 | handle_uretprobe_chain(ri, regs); | ||
1828 | ri = free_ret_instance(ri); | ||
1829 | utask->depth--; | ||
1830 | } while (ri != next); | ||
1831 | } while (!valid); | ||
1805 | 1832 | ||
1806 | utask->return_instances = ri; | 1833 | utask->return_instances = ri; |
1834 | return; | ||
1835 | |||
1836 | sigill: | ||
1837 | uprobe_warn(current, "handle uretprobe, sending SIGILL."); | ||
1838 | force_sig_info(SIGILL, SEND_SIG_FORCED, current); | ||
1807 | 1839 | ||
1808 | return true; | ||
1809 | } | 1840 | } |
1810 | 1841 | ||
1811 | bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs) | 1842 | bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs) |
@@ -1813,6 +1844,12 @@ bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs) | |||
1813 | return false; | 1844 | return false; |
1814 | } | 1845 | } |
1815 | 1846 | ||
1847 | bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx, | ||
1848 | struct pt_regs *regs) | ||
1849 | { | ||
1850 | return true; | ||
1851 | } | ||
1852 | |||
1816 | /* | 1853 | /* |
1817 | * Run handler and ask thread to singlestep. | 1854 | * Run handler and ask thread to singlestep. |
1818 | * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. | 1855 | * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. |
@@ -1824,13 +1861,8 @@ static void handle_swbp(struct pt_regs *regs) | |||
1824 | int uninitialized_var(is_swbp); | 1861 | int uninitialized_var(is_swbp); |
1825 | 1862 | ||
1826 | bp_vaddr = uprobe_get_swbp_addr(regs); | 1863 | bp_vaddr = uprobe_get_swbp_addr(regs); |
1827 | if (bp_vaddr == get_trampoline_vaddr()) { | 1864 | if (bp_vaddr == get_trampoline_vaddr()) |
1828 | if (handle_trampoline(regs)) | 1865 | return handle_trampoline(regs); |
1829 | return; | ||
1830 | |||
1831 | pr_warn("uprobe: unable to handle uretprobe pid/tgid=%d/%d\n", | ||
1832 | current->pid, current->tgid); | ||
1833 | } | ||
1834 | 1866 | ||
1835 | uprobe = find_active_uprobe(bp_vaddr, &is_swbp); | 1867 | uprobe = find_active_uprobe(bp_vaddr, &is_swbp); |
1836 | if (!uprobe) { | 1868 | if (!uprobe) { |
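The weak arch_uretprobe_is_alive() default above never invalidates a frame; an architecture overrides it by comparing the stack pointer saved in prepare_uretprobe() (ri->stack) against the current one. A sketch modelled on the x86 version, shown purely as an illustration:

    bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx,
                                 struct pt_regs *regs)
    {
            if (ctx == RP_CHECK_CALL)   /* sp was just decremented by the call insn */
                    return regs->sp < ret->stack;

            return regs->sp <= ret->stack;
    }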
diff --git a/kernel/exit.c b/kernel/exit.c index 031325e9acf9..ea95ee1b5ef7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -1471,7 +1471,7 @@ static long do_wait(struct wait_opts *wo) | |||
1471 | add_wait_queue(¤t->signal->wait_chldexit, &wo->child_wait); | 1471 | add_wait_queue(¤t->signal->wait_chldexit, &wo->child_wait); |
1472 | repeat: | 1472 | repeat: |
1473 | /* | 1473 | /* |
1474 | * If there is nothing that can match our critiera just get out. | 1474 | * If there is nothing that can match our criteria, just get out. |
1475 | * We will clear ->notask_error to zero if we see any child that | 1475 | * We will clear ->notask_error to zero if we see any child that |
1476 | * might later match our criteria, even if we are not able to reap | 1476 | * might later match our criteria, even if we are not able to reap |
1477 | * it yet. | 1477 | * it yet. |
diff --git a/kernel/extable.c b/kernel/extable.c index c98f926277a8..e820ccee9846 100644 --- a/kernel/extable.c +++ b/kernel/extable.c | |||
@@ -18,7 +18,6 @@ | |||
18 | #include <linux/ftrace.h> | 18 | #include <linux/ftrace.h> |
19 | #include <linux/memory.h> | 19 | #include <linux/memory.h> |
20 | #include <linux/module.h> | 20 | #include <linux/module.h> |
21 | #include <linux/ftrace.h> | ||
22 | #include <linux/mutex.h> | 21 | #include <linux/mutex.h> |
23 | #include <linux/init.h> | 22 | #include <linux/init.h> |
24 | 23 | ||
diff --git a/kernel/fork.c b/kernel/fork.c index 1bfefc6f96a4..7d5f0f118a63 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -287,6 +287,11 @@ static void set_max_threads(unsigned int max_threads_suggested) | |||
287 | max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS); | 287 | max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS); |
288 | } | 288 | } |
289 | 289 | ||
290 | #ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT | ||
291 | /* Initialized by the architecture: */ | ||
292 | int arch_task_struct_size __read_mostly; | ||
293 | #endif | ||
294 | |||
290 | void __init fork_init(void) | 295 | void __init fork_init(void) |
291 | { | 296 | { |
292 | #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR | 297 | #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR |
@@ -295,7 +300,7 @@ void __init fork_init(void) | |||
295 | #endif | 300 | #endif |
296 | /* create a slab on which task_structs can be allocated */ | 301 | /* create a slab on which task_structs can be allocated */ |
297 | task_struct_cachep = | 302 | task_struct_cachep = |
298 | kmem_cache_create("task_struct", sizeof(struct task_struct), | 303 | kmem_cache_create("task_struct", arch_task_struct_size, |
299 | ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL); | 304 | ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL); |
300 | #endif | 305 | #endif |
301 | 306 | ||
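With CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT the architecture is expected to size the task_struct slab itself, typically to append variable-size state behind the struct. A hedged sketch, where xstate_size stands in for whatever size the architecture actually computes at boot:

    void __init arch_task_cache_init(void)
    {
            /* task_struct proper plus dynamically sized FPU/extended state */
            arch_task_struct_size = sizeof(struct task_struct) + xstate_size;
    }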
@@ -449,8 +454,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
449 | tmp->vm_mm = mm; | 454 | tmp->vm_mm = mm; |
450 | if (anon_vma_fork(tmp, mpnt)) | 455 | if (anon_vma_fork(tmp, mpnt)) |
451 | goto fail_nomem_anon_vma_fork; | 456 | goto fail_nomem_anon_vma_fork; |
452 | tmp->vm_flags &= ~VM_LOCKED; | 457 | tmp->vm_flags &= ~(VM_LOCKED|VM_UFFD_MISSING|VM_UFFD_WP); |
453 | tmp->vm_next = tmp->vm_prev = NULL; | 458 | tmp->vm_next = tmp->vm_prev = NULL; |
459 | tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; | ||
454 | file = tmp->vm_file; | 460 | file = tmp->vm_file; |
455 | if (file) { | 461 | if (file) { |
456 | struct inode *inode = file_inode(file); | 462 | struct inode *inode = file_inode(file); |
@@ -1067,6 +1073,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) | |||
1067 | rcu_assign_pointer(tsk->sighand, sig); | 1073 | rcu_assign_pointer(tsk->sighand, sig); |
1068 | if (!sig) | 1074 | if (!sig) |
1069 | return -ENOMEM; | 1075 | return -ENOMEM; |
1076 | |||
1070 | atomic_set(&sig->count, 1); | 1077 | atomic_set(&sig->count, 1); |
1071 | memcpy(sig->action, current->sighand->action, sizeof(sig->action)); | 1078 | memcpy(sig->action, current->sighand->action, sizeof(sig->action)); |
1072 | return 0; | 1079 | return 0; |
@@ -1128,6 +1135,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) | |||
1128 | init_sigpending(&sig->shared_pending); | 1135 | init_sigpending(&sig->shared_pending); |
1129 | INIT_LIST_HEAD(&sig->posix_timers); | 1136 | INIT_LIST_HEAD(&sig->posix_timers); |
1130 | seqlock_init(&sig->stats_lock); | 1137 | seqlock_init(&sig->stats_lock); |
1138 | prev_cputime_init(&sig->prev_cputime); | ||
1131 | 1139 | ||
1132 | hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | 1140 | hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); |
1133 | sig->real_timer.function = it_real_fn; | 1141 | sig->real_timer.function = it_real_fn; |
@@ -1239,6 +1247,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1239 | { | 1247 | { |
1240 | int retval; | 1248 | int retval; |
1241 | struct task_struct *p; | 1249 | struct task_struct *p; |
1250 | void *cgrp_ss_priv[CGROUP_CANFORK_COUNT] = {}; | ||
1242 | 1251 | ||
1243 | if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) | 1252 | if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) |
1244 | return ERR_PTR(-EINVAL); | 1253 | return ERR_PTR(-EINVAL); |
@@ -1273,10 +1282,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1273 | 1282 | ||
1274 | /* | 1283 | /* |
1275 | * If the new process will be in a different pid or user namespace | 1284 | * If the new process will be in a different pid or user namespace |
1276 | * do not allow it to share a thread group or signal handlers or | 1285 | * do not allow it to share a thread group with the forking task. |
1277 | * parent with the forking task. | ||
1278 | */ | 1286 | */ |
1279 | if (clone_flags & CLONE_SIGHAND) { | 1287 | if (clone_flags & CLONE_THREAD) { |
1280 | if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) || | 1288 | if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) || |
1281 | (task_active_pid_ns(current) != | 1289 | (task_active_pid_ns(current) != |
1282 | current->nsproxy->pid_ns_for_children)) | 1290 | current->nsproxy->pid_ns_for_children)) |
@@ -1335,9 +1343,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1335 | 1343 | ||
1336 | p->utime = p->stime = p->gtime = 0; | 1344 | p->utime = p->stime = p->gtime = 0; |
1337 | p->utimescaled = p->stimescaled = 0; | 1345 | p->utimescaled = p->stimescaled = 0; |
1338 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE | 1346 | prev_cputime_init(&p->prev_cputime); |
1339 | p->prev_cputime.utime = p->prev_cputime.stime = 0; | 1347 | |
1340 | #endif | ||
1341 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN | 1348 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN |
1342 | seqlock_init(&p->vtime_seqlock); | 1349 | seqlock_init(&p->vtime_seqlock); |
1343 | p->vtime_snap = 0; | 1350 | p->vtime_snap = 0; |
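Both cputime hunks lean on a new initializer; a sketch consistent with its use here (the lock field is an assumption tied to the snapshot struct gaining its own raw spinlock when VIRT_CPU_ACCOUNTING_NATIVE is not in use):

    static inline void prev_cputime_init(struct prev_cputime *prev)
    {
    #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
            prev->utime = prev->stime = 0;
            raw_spin_lock_init(&prev->lock);
    #endif
    }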
@@ -1513,6 +1520,16 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1513 | p->task_works = NULL; | 1520 | p->task_works = NULL; |
1514 | 1521 | ||
1515 | /* | 1522 | /* |
1523 | * Ensure that the cgroup subsystem policies allow the new process to be | ||
1524 | * forked. It should be noted that the new process's css_set can be changed | ||
1525 | * between here and cgroup_post_fork() if an organisation operation is in | ||
1526 | * progress. | ||
1527 | */ | ||
1528 | retval = cgroup_can_fork(p, cgrp_ss_priv); | ||
1529 | if (retval) | ||
1530 | goto bad_fork_free_pid; | ||
1531 | |||
1532 | /* | ||
1516 | * Make it visible to the rest of the system, but dont wake it up yet. | 1533 | * Make it visible to the rest of the system, but dont wake it up yet. |
1517 | * Need tasklist lock for parent etc handling! | 1534 | * Need tasklist lock for parent etc handling! |
1518 | */ | 1535 | */ |
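For orientation, the contract of the three cgroup fork hooks used in this function, as far as it can be inferred from their call sites here (prototypes assumed, not quoted from the headers):

    /*
     * int  cgroup_can_fork(struct task_struct *child, void *ss_priv[]);
     * void cgroup_cancel_fork(struct task_struct *child, void *ss_priv[]);
     * void cgroup_post_fork(struct task_struct *child, void *ss_priv[]);
     *
     * can_fork may veto the fork (e.g. the new pids controller enforcing its
     * limit); cancel_fork unwinds any charge if a later copy_process() step
     * fails; post_fork commits the child to its css_set.
     */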
@@ -1548,7 +1565,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1548 | spin_unlock(¤t->sighand->siglock); | 1565 | spin_unlock(¤t->sighand->siglock); |
1549 | write_unlock_irq(&tasklist_lock); | 1566 | write_unlock_irq(&tasklist_lock); |
1550 | retval = -ERESTARTNOINTR; | 1567 | retval = -ERESTARTNOINTR; |
1551 | goto bad_fork_free_pid; | 1568 | goto bad_fork_cancel_cgroup; |
1552 | } | 1569 | } |
1553 | 1570 | ||
1554 | if (likely(p->pid)) { | 1571 | if (likely(p->pid)) { |
@@ -1590,7 +1607,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1590 | write_unlock_irq(&tasklist_lock); | 1607 | write_unlock_irq(&tasklist_lock); |
1591 | 1608 | ||
1592 | proc_fork_connector(p); | 1609 | proc_fork_connector(p); |
1593 | cgroup_post_fork(p); | 1610 | cgroup_post_fork(p, cgrp_ss_priv); |
1594 | if (clone_flags & CLONE_THREAD) | 1611 | if (clone_flags & CLONE_THREAD) |
1595 | threadgroup_change_end(current); | 1612 | threadgroup_change_end(current); |
1596 | perf_event_fork(p); | 1613 | perf_event_fork(p); |
@@ -1600,6 +1617,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1600 | 1617 | ||
1601 | return p; | 1618 | return p; |
1602 | 1619 | ||
1620 | bad_fork_cancel_cgroup: | ||
1621 | cgroup_cancel_fork(p, cgrp_ss_priv); | ||
1603 | bad_fork_free_pid: | 1622 | bad_fork_free_pid: |
1604 | if (pid != &init_struct_pid) | 1623 | if (pid != &init_struct_pid) |
1605 | free_pid(pid); | 1624 | free_pid(pid); |
@@ -1866,13 +1885,21 @@ static int check_unshare_flags(unsigned long unshare_flags) | |||
1866 | CLONE_NEWUSER|CLONE_NEWPID)) | 1885 | CLONE_NEWUSER|CLONE_NEWPID)) |
1867 | return -EINVAL; | 1886 | return -EINVAL; |
1868 | /* | 1887 | /* |
1869 | * Not implemented, but pretend it works if there is nothing to | 1888 | * Not implemented, but pretend it works if there is nothing |
1870 | * unshare. Note that unsharing CLONE_THREAD or CLONE_SIGHAND | 1889 | * to unshare. Note that unsharing the address space or the |
1871 | * needs to unshare vm. | 1890 | * signal handlers also need to unshare the signal queues (aka |
1891 | * CLONE_THREAD). | ||
1872 | */ | 1892 | */ |
1873 | if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) { | 1893 | if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) { |
1874 | /* FIXME: get_task_mm() increments ->mm_users */ | 1894 | if (!thread_group_empty(current)) |
1875 | if (atomic_read(¤t->mm->mm_users) > 1) | 1895 | return -EINVAL; |
1896 | } | ||
1897 | if (unshare_flags & (CLONE_SIGHAND | CLONE_VM)) { | ||
1898 | if (atomic_read(¤t->sighand->count) > 1) | ||
1899 | return -EINVAL; | ||
1900 | } | ||
1901 | if (unshare_flags & CLONE_VM) { | ||
1902 | if (!current_is_single_threaded()) | ||
1876 | return -EINVAL; | 1903 | return -EINVAL; |
1877 | } | 1904 | } |
1878 | 1905 | ||
@@ -1936,21 +1963,22 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) | |||
1936 | int err; | 1963 | int err; |
1937 | 1964 | ||
1938 | /* | 1965 | /* |
1939 | * If unsharing a user namespace must also unshare the thread. | 1966 | * If unsharing a user namespace must also unshare the thread group |
1967 | * and unshare the filesystem root and working directories. | ||
1940 | */ | 1968 | */ |
1941 | if (unshare_flags & CLONE_NEWUSER) | 1969 | if (unshare_flags & CLONE_NEWUSER) |
1942 | unshare_flags |= CLONE_THREAD | CLONE_FS; | 1970 | unshare_flags |= CLONE_THREAD | CLONE_FS; |
1943 | /* | 1971 | /* |
1944 | * If unsharing a thread from a thread group, must also unshare vm. | ||
1945 | */ | ||
1946 | if (unshare_flags & CLONE_THREAD) | ||
1947 | unshare_flags |= CLONE_VM; | ||
1948 | /* | ||
1949 | * If unsharing vm, must also unshare signal handlers. | 1972 | * If unsharing vm, must also unshare signal handlers. |
1950 | */ | 1973 | */ |
1951 | if (unshare_flags & CLONE_VM) | 1974 | if (unshare_flags & CLONE_VM) |
1952 | unshare_flags |= CLONE_SIGHAND; | 1975 | unshare_flags |= CLONE_SIGHAND; |
1953 | /* | 1976 | /* |
1977 | * If unsharing signal handlers, must also unshare the signal queues. | ||
1978 | */ | ||
1979 | if (unshare_flags & CLONE_SIGHAND) | ||
1980 | unshare_flags |= CLONE_THREAD; | ||
1981 | /* | ||
1954 | * If unsharing namespace, must also unshare filesystem information. | 1982 | * If unsharing namespace, must also unshare filesystem information. |
1955 | */ | 1983 | */ |
1956 | if (unshare_flags & CLONE_NEWNS) | 1984 | if (unshare_flags & CLONE_NEWNS) |
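Taken together, the implication chain in sys_unshare() now reads as summarized below (the final CLONE_NEWNS case completes just past the end of this hunk):

    /*
     *   CLONE_NEWUSER  -> CLONE_THREAD | CLONE_FS
     *   CLONE_VM       -> CLONE_SIGHAND
     *   CLONE_SIGHAND  -> CLONE_THREAD
     *   CLONE_NEWNS    -> CLONE_FS
     *
     * so e.g. unshare(CLONE_NEWUSER) from a multi-threaded caller is rejected
     * by check_unshare_flags() instead of leaving signal queues shared.
     */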
diff --git a/kernel/futex.c b/kernel/futex.c index c4a182f5357e..6e443efc65f4 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
@@ -64,6 +64,7 @@ | |||
64 | #include <linux/hugetlb.h> | 64 | #include <linux/hugetlb.h> |
65 | #include <linux/freezer.h> | 65 | #include <linux/freezer.h> |
66 | #include <linux/bootmem.h> | 66 | #include <linux/bootmem.h> |
67 | #include <linux/fault-inject.h> | ||
67 | 68 | ||
68 | #include <asm/futex.h> | 69 | #include <asm/futex.h> |
69 | 70 | ||
@@ -258,6 +259,66 @@ static unsigned long __read_mostly futex_hashsize; | |||
258 | 259 | ||
259 | static struct futex_hash_bucket *futex_queues; | 260 | static struct futex_hash_bucket *futex_queues; |
260 | 261 | ||
262 | /* | ||
263 | * Fault injections for futexes. | ||
264 | */ | ||
265 | #ifdef CONFIG_FAIL_FUTEX | ||
266 | |||
267 | static struct { | ||
268 | struct fault_attr attr; | ||
269 | |||
270 | u32 ignore_private; | ||
271 | } fail_futex = { | ||
272 | .attr = FAULT_ATTR_INITIALIZER, | ||
273 | .ignore_private = 0, | ||
274 | }; | ||
275 | |||
276 | static int __init setup_fail_futex(char *str) | ||
277 | { | ||
278 | return setup_fault_attr(&fail_futex.attr, str); | ||
279 | } | ||
280 | __setup("fail_futex=", setup_fail_futex); | ||
281 | |||
282 | static bool should_fail_futex(bool fshared) | ||
283 | { | ||
284 | if (fail_futex.ignore_private && !fshared) | ||
285 | return false; | ||
286 | |||
287 | return should_fail(&fail_futex.attr, 1); | ||
288 | } | ||
289 | |||
290 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS | ||
291 | |||
292 | static int __init fail_futex_debugfs(void) | ||
293 | { | ||
294 | umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; | ||
295 | struct dentry *dir; | ||
296 | |||
297 | dir = fault_create_debugfs_attr("fail_futex", NULL, | ||
298 | &fail_futex.attr); | ||
299 | if (IS_ERR(dir)) | ||
300 | return PTR_ERR(dir); | ||
301 | |||
302 | if (!debugfs_create_bool("ignore-private", mode, dir, | ||
303 | &fail_futex.ignore_private)) { | ||
304 | debugfs_remove_recursive(dir); | ||
305 | return -ENOMEM; | ||
306 | } | ||
307 | |||
308 | return 0; | ||
309 | } | ||
310 | |||
311 | late_initcall(fail_futex_debugfs); | ||
312 | |||
313 | #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ | ||
314 | |||
315 | #else | ||
316 | static inline bool should_fail_futex(bool fshared) | ||
317 | { | ||
318 | return false; | ||
319 | } | ||
320 | #endif /* CONFIG_FAIL_FUTEX */ | ||
321 | |||
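Hedged usage notes for the new injection point, assuming the standard fault-injection framework conventions (see Documentation/fault-injection/):

    /*
     * boot:     fail_futex=<interval>,<probability>,<space>,<times>
     * runtime:  /sys/kernel/debug/fail_futex/  (probability, interval, times, ...)
     *           /sys/kernel/debug/fail_futex/ignore-private
     *
     * With ignore-private set, only shared futexes see injected failures; the
     * hooks added below then make get_futex_key() and the PI paths return
     * -EFAULT (or -EDEADLK) as if the user access or atomic op had failed.
     */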
261 | static inline void futex_get_mm(union futex_key *key) | 322 | static inline void futex_get_mm(union futex_key *key) |
262 | { | 323 | { |
263 | atomic_inc(&key->private.mm->mm_count); | 324 | atomic_inc(&key->private.mm->mm_count); |
@@ -413,6 +474,9 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) | |||
413 | if (unlikely(!access_ok(rw, uaddr, sizeof(u32)))) | 474 | if (unlikely(!access_ok(rw, uaddr, sizeof(u32)))) |
414 | return -EFAULT; | 475 | return -EFAULT; |
415 | 476 | ||
477 | if (unlikely(should_fail_futex(fshared))) | ||
478 | return -EFAULT; | ||
479 | |||
416 | /* | 480 | /* |
417 | * PROCESS_PRIVATE futexes are fast. | 481 | * PROCESS_PRIVATE futexes are fast. |
418 | * As the mm cannot disappear under us and the 'key' only needs | 482 | * As the mm cannot disappear under us and the 'key' only needs |
@@ -428,6 +492,10 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) | |||
428 | } | 492 | } |
429 | 493 | ||
430 | again: | 494 | again: |
495 | /* Ignore any VERIFY_READ mapping (futex common case) */ | ||
496 | if (unlikely(should_fail_futex(fshared))) | ||
497 | return -EFAULT; | ||
498 | |||
431 | err = get_user_pages_fast(address, 1, 1, &page); | 499 | err = get_user_pages_fast(address, 1, 1, &page); |
432 | /* | 500 | /* |
433 | * If write access is not required (eg. FUTEX_WAIT), try | 501 | * If write access is not required (eg. FUTEX_WAIT), try |
@@ -516,7 +584,7 @@ again: | |||
516 | * A RO anonymous page will never change and thus doesn't make | 584 | * A RO anonymous page will never change and thus doesn't make |
517 | * sense for futex operations. | 585 | * sense for futex operations. |
518 | */ | 586 | */ |
519 | if (ro) { | 587 | if (unlikely(should_fail_futex(fshared)) || ro) { |
520 | err = -EFAULT; | 588 | err = -EFAULT; |
521 | goto out; | 589 | goto out; |
522 | } | 590 | } |
@@ -974,6 +1042,9 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) | |||
974 | { | 1042 | { |
975 | u32 uninitialized_var(curval); | 1043 | u32 uninitialized_var(curval); |
976 | 1044 | ||
1045 | if (unlikely(should_fail_futex(true))) | ||
1046 | return -EFAULT; | ||
1047 | |||
977 | if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) | 1048 | if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) |
978 | return -EFAULT; | 1049 | return -EFAULT; |
979 | 1050 | ||
@@ -1015,12 +1086,18 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, | |||
1015 | if (get_futex_value_locked(&uval, uaddr)) | 1086 | if (get_futex_value_locked(&uval, uaddr)) |
1016 | return -EFAULT; | 1087 | return -EFAULT; |
1017 | 1088 | ||
1089 | if (unlikely(should_fail_futex(true))) | ||
1090 | return -EFAULT; | ||
1091 | |||
1018 | /* | 1092 | /* |
1019 | * Detect deadlocks. | 1093 | * Detect deadlocks. |
1020 | */ | 1094 | */ |
1021 | if ((unlikely((uval & FUTEX_TID_MASK) == vpid))) | 1095 | if ((unlikely((uval & FUTEX_TID_MASK) == vpid))) |
1022 | return -EDEADLK; | 1096 | return -EDEADLK; |
1023 | 1097 | ||
1098 | if ((unlikely(should_fail_futex(true)))) | ||
1099 | return -EDEADLK; | ||
1100 | |||
1024 | /* | 1101 | /* |
1025 | * Lookup existing state first. If it exists, try to attach to | 1102 | * Lookup existing state first. If it exists, try to attach to |
1026 | * its pi_state. | 1103 | * its pi_state. |
@@ -1155,6 +1232,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, | |||
1155 | */ | 1232 | */ |
1156 | newval = FUTEX_WAITERS | task_pid_vnr(new_owner); | 1233 | newval = FUTEX_WAITERS | task_pid_vnr(new_owner); |
1157 | 1234 | ||
1235 | if (unlikely(should_fail_futex(true))) | ||
1236 | ret = -EFAULT; | ||
1237 | |||
1158 | if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) | 1238 | if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) |
1159 | ret = -EFAULT; | 1239 | ret = -EFAULT; |
1160 | else if (curval != uval) | 1240 | else if (curval != uval) |
@@ -1457,6 +1537,9 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, | |||
1457 | if (get_futex_value_locked(&curval, pifutex)) | 1537 | if (get_futex_value_locked(&curval, pifutex)) |
1458 | return -EFAULT; | 1538 | return -EFAULT; |
1459 | 1539 | ||
1540 | if (unlikely(should_fail_futex(true))) | ||
1541 | return -EFAULT; | ||
1542 | |||
1460 | /* | 1543 | /* |
1461 | * Find the top_waiter and determine if there are additional waiters. | 1544 | * Find the top_waiter and determine if there are additional waiters. |
1462 | * If the caller intends to requeue more than 1 waiter to pifutex, | 1545 | * If the caller intends to requeue more than 1 waiter to pifutex, |
@@ -2268,8 +2351,11 @@ static long futex_wait_restart(struct restart_block *restart) | |||
2268 | /* | 2351 | /* |
2269 | * Userspace tried a 0 -> TID atomic transition of the futex value | 2352 | * Userspace tried a 0 -> TID atomic transition of the futex value |
2270 | * and failed. The kernel side here does the whole locking operation: | 2353 | * and failed. The kernel side here does the whole locking operation: |
2271 | * if there are waiters then it will block, it does PI, etc. (Due to | 2354 | * if there are waiters then it will block as a consequence of relying |
2272 | * races the kernel might see a 0 value of the futex too.) | 2355 | * on rt-mutexes, it does PI, etc. (Due to races the kernel might see |
2356 | * a 0 value of the futex too.). | ||
2357 | * | ||
2358 | * Also serves as futex trylock_pi()'ing, and due semantics. | ||
2273 | */ | 2359 | */ |
2274 | static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, | 2360 | static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, |
2275 | ktime_t *time, int trylock) | 2361 | ktime_t *time, int trylock) |
@@ -2300,6 +2386,10 @@ retry_private: | |||
2300 | 2386 | ||
2301 | ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0); | 2387 | ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0); |
2302 | if (unlikely(ret)) { | 2388 | if (unlikely(ret)) { |
2389 | /* | ||
2390 | * Atomic work succeeded and we got the lock, | ||
2391 | * or failed. Either way, we do _not_ block. | ||
2392 | */ | ||
2303 | switch (ret) { | 2393 | switch (ret) { |
2304 | case 1: | 2394 | case 1: |
2305 | /* We got the lock. */ | 2395 | /* We got the lock. */ |
@@ -2530,7 +2620,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, | |||
2530 | * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 | 2620 | * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 |
2531 | * @uaddr: the futex we initially wait on (non-pi) | 2621 | * @uaddr: the futex we initially wait on (non-pi) |
2532 | * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be | 2622 | * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be |
2533 | * the same type, no requeueing from private to shared, etc. | 2623 | * the same type, no requeueing from private to shared, etc. |
2534 | * @val: the expected value of uaddr | 2624 | * @val: the expected value of uaddr |
2535 | * @abs_time: absolute timeout | 2625 | * @abs_time: absolute timeout |
2536 | * @bitset: 32 bit wakeup bitset set by userspace, defaults to all | 2626 | * @bitset: 32 bit wakeup bitset set by userspace, defaults to all |
@@ -3005,6 +3095,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, | |||
3005 | if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || | 3095 | if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || |
3006 | cmd == FUTEX_WAIT_BITSET || | 3096 | cmd == FUTEX_WAIT_BITSET || |
3007 | cmd == FUTEX_WAIT_REQUEUE_PI)) { | 3097 | cmd == FUTEX_WAIT_REQUEUE_PI)) { |
3098 | if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG)))) | ||
3099 | return -EFAULT; | ||
3008 | if (copy_from_user(&ts, utime, sizeof(ts)) != 0) | 3100 | if (copy_from_user(&ts, utime, sizeof(ts)) != 0) |
3009 | return -EFAULT; | 3101 | return -EFAULT; |
3010 | if (!timespec_valid(&ts)) | 3102 | if (!timespec_valid(&ts)) |
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 27f4332c7f84..6e40a9539763 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c | |||
@@ -63,7 +63,7 @@ int irq_set_irq_type(unsigned int irq, unsigned int type) | |||
63 | return -EINVAL; | 63 | return -EINVAL; |
64 | 64 | ||
65 | type &= IRQ_TYPE_SENSE_MASK; | 65 | type &= IRQ_TYPE_SENSE_MASK; |
66 | ret = __irq_set_trigger(desc, irq, type); | 66 | ret = __irq_set_trigger(desc, type); |
67 | irq_put_desc_busunlock(desc, flags); | 67 | irq_put_desc_busunlock(desc, flags); |
68 | return ret; | 68 | return ret; |
69 | } | 69 | } |
@@ -187,7 +187,7 @@ int irq_startup(struct irq_desc *desc, bool resend) | |||
187 | irq_enable(desc); | 187 | irq_enable(desc); |
188 | } | 188 | } |
189 | if (resend) | 189 | if (resend) |
190 | check_irq_resend(desc, desc->irq_data.irq); | 190 | check_irq_resend(desc); |
191 | return ret; | 191 | return ret; |
192 | } | 192 | } |
193 | 193 | ||
@@ -315,7 +315,7 @@ void handle_nested_irq(unsigned int irq) | |||
315 | raw_spin_lock_irq(&desc->lock); | 315 | raw_spin_lock_irq(&desc->lock); |
316 | 316 | ||
317 | desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); | 317 | desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); |
318 | kstat_incr_irqs_this_cpu(irq, desc); | 318 | kstat_incr_irqs_this_cpu(desc); |
319 | 319 | ||
320 | action = desc->action; | 320 | action = desc->action; |
321 | if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) { | 321 | if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) { |
@@ -328,7 +328,7 @@ void handle_nested_irq(unsigned int irq) | |||
328 | 328 | ||
329 | action_ret = action->thread_fn(action->irq, action->dev_id); | 329 | action_ret = action->thread_fn(action->irq, action->dev_id); |
330 | if (!noirqdebug) | 330 | if (!noirqdebug) |
331 | note_interrupt(irq, desc, action_ret); | 331 | note_interrupt(desc, action_ret); |
332 | 332 | ||
333 | raw_spin_lock_irq(&desc->lock); | 333 | raw_spin_lock_irq(&desc->lock); |
334 | irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); | 334 | irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); |
@@ -391,7 +391,7 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) | |||
391 | goto out_unlock; | 391 | goto out_unlock; |
392 | 392 | ||
393 | desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); | 393 | desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); |
394 | kstat_incr_irqs_this_cpu(irq, desc); | 394 | kstat_incr_irqs_this_cpu(desc); |
395 | 395 | ||
396 | if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { | 396 | if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { |
397 | desc->istate |= IRQS_PENDING; | 397 | desc->istate |= IRQS_PENDING; |
@@ -443,7 +443,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) | |||
443 | goto out_unlock; | 443 | goto out_unlock; |
444 | 444 | ||
445 | desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); | 445 | desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); |
446 | kstat_incr_irqs_this_cpu(irq, desc); | 446 | kstat_incr_irqs_this_cpu(desc); |
447 | 447 | ||
448 | /* | 448 | /* |
449 | * If its disabled or no action available | 449 | * If its disabled or no action available |
@@ -515,7 +515,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) | |||
515 | goto out; | 515 | goto out; |
516 | 516 | ||
517 | desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); | 517 | desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); |
518 | kstat_incr_irqs_this_cpu(irq, desc); | 518 | kstat_incr_irqs_this_cpu(desc); |
519 | 519 | ||
520 | /* | 520 | /* |
521 | * If its disabled or no action available | 521 | * If its disabled or no action available |
@@ -583,7 +583,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) | |||
583 | goto out_unlock; | 583 | goto out_unlock; |
584 | } | 584 | } |
585 | 585 | ||
586 | kstat_incr_irqs_this_cpu(irq, desc); | 586 | kstat_incr_irqs_this_cpu(desc); |
587 | 587 | ||
588 | /* Start handling the irq */ | 588 | /* Start handling the irq */ |
589 | desc->irq_data.chip->irq_ack(&desc->irq_data); | 589 | desc->irq_data.chip->irq_ack(&desc->irq_data); |
@@ -646,7 +646,7 @@ void handle_edge_eoi_irq(unsigned int irq, struct irq_desc *desc) | |||
646 | goto out_eoi; | 646 | goto out_eoi; |
647 | } | 647 | } |
648 | 648 | ||
649 | kstat_incr_irqs_this_cpu(irq, desc); | 649 | kstat_incr_irqs_this_cpu(desc); |
650 | 650 | ||
651 | do { | 651 | do { |
652 | if (unlikely(!desc->action)) | 652 | if (unlikely(!desc->action)) |
@@ -675,7 +675,7 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc) | |||
675 | { | 675 | { |
676 | struct irq_chip *chip = irq_desc_get_chip(desc); | 676 | struct irq_chip *chip = irq_desc_get_chip(desc); |
677 | 677 | ||
678 | kstat_incr_irqs_this_cpu(irq, desc); | 678 | kstat_incr_irqs_this_cpu(desc); |
679 | 679 | ||
680 | if (chip->irq_ack) | 680 | if (chip->irq_ack) |
681 | chip->irq_ack(&desc->irq_data); | 681 | chip->irq_ack(&desc->irq_data); |
@@ -705,7 +705,7 @@ void handle_percpu_devid_irq(unsigned int irq, struct irq_desc *desc) | |||
705 | void *dev_id = raw_cpu_ptr(action->percpu_dev_id); | 705 | void *dev_id = raw_cpu_ptr(action->percpu_dev_id); |
706 | irqreturn_t res; | 706 | irqreturn_t res; |
707 | 707 | ||
708 | kstat_incr_irqs_this_cpu(irq, desc); | 708 | kstat_incr_irqs_this_cpu(desc); |
709 | 709 | ||
710 | if (chip->irq_ack) | 710 | if (chip->irq_ack) |
711 | chip->irq_ack(&desc->irq_data); | 711 | chip->irq_ack(&desc->irq_data); |
@@ -985,6 +985,23 @@ int irq_chip_set_affinity_parent(struct irq_data *data, | |||
985 | } | 985 | } |
986 | 986 | ||
987 | /** | 987 | /** |
988 | * irq_chip_set_type_parent - Set IRQ type on the parent interrupt | ||
989 | * @data: Pointer to interrupt specific data | ||
990 | * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h | ||
991 | * | ||
992 | * Conditional, as the underlying parent chip might not implement it. | ||
993 | */ | ||
994 | int irq_chip_set_type_parent(struct irq_data *data, unsigned int type) | ||
995 | { | ||
996 | data = data->parent_data; | ||
997 | |||
998 | if (data->chip->irq_set_type) | ||
999 | return data->chip->irq_set_type(data, type); | ||
1000 | |||
1001 | return -ENOSYS; | ||
1002 | } | ||
1003 | |||
1004 | /** | ||
988 | * irq_chip_retrigger_hierarchy - Retrigger an interrupt in hardware | 1005 | * irq_chip_retrigger_hierarchy - Retrigger an interrupt in hardware |
989 | * @data: Pointer to interrupt specific data | 1006 | * @data: Pointer to interrupt specific data |
990 | * | 1007 | * |
@@ -997,13 +1014,13 @@ int irq_chip_retrigger_hierarchy(struct irq_data *data) | |||
997 | if (data->chip && data->chip->irq_retrigger) | 1014 | if (data->chip && data->chip->irq_retrigger) |
998 | return data->chip->irq_retrigger(data); | 1015 | return data->chip->irq_retrigger(data); |
999 | 1016 | ||
1000 | return -ENOSYS; | 1017 | return 0; |
1001 | } | 1018 | } |
1002 | 1019 | ||
1003 | /** | 1020 | /** |
1004 | * irq_chip_set_vcpu_affinity_parent - Set vcpu affinity on the parent interrupt | 1021 | * irq_chip_set_vcpu_affinity_parent - Set vcpu affinity on the parent interrupt |
1005 | * @data: Pointer to interrupt specific data | 1022 | * @data: Pointer to interrupt specific data |
1006 | * @dest: The vcpu affinity information | 1023 | * @vcpu_info: The vcpu affinity information |
1007 | */ | 1024 | */ |
1008 | int irq_chip_set_vcpu_affinity_parent(struct irq_data *data, void *vcpu_info) | 1025 | int irq_chip_set_vcpu_affinity_parent(struct irq_data *data, void *vcpu_info) |
1009 | { | 1026 | { |
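The hunk above adds irq_chip_set_type_parent(), completing the *_parent helper family used by stacked (hierarchical) irq domains: a child chip with no trigger-type logic of its own simply forwards the request one level up, and gets -ENOSYS back if the parent cannot service it either. Below is a minimal, hedged sketch of how a stacked irqchip driver might wire the helper in alongside the existing parent helpers from this file; the chip itself ("example-stacked") is hypothetical and not part of this patch.

    #include <linux/irq.h>

    static struct irq_chip example_stacked_chip = {
            .name             = "example-stacked",
            .irq_mask         = irq_chip_mask_parent,
            .irq_unmask       = irq_chip_unmask_parent,
            .irq_eoi          = irq_chip_eoi_parent,
            .irq_set_affinity = irq_chip_set_affinity_parent,
            .irq_set_type     = irq_chip_set_type_parent,    /* the new helper */
            .irq_retrigger    = irq_chip_retrigger_hierarchy,
    };

Note that the helper deliberately reports -ENOSYS rather than claiming success when the parent chip provides no irq_set_type callback.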
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index 15b370daf234..abd286afbd27 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c | |||
@@ -553,6 +553,9 @@ static int irq_gc_suspend(void) | |||
553 | if (data) | 553 | if (data) |
554 | ct->chip.irq_suspend(data); | 554 | ct->chip.irq_suspend(data); |
555 | } | 555 | } |
556 | |||
557 | if (gc->suspend) | ||
558 | gc->suspend(gc); | ||
556 | } | 559 | } |
557 | return 0; | 560 | return 0; |
558 | } | 561 | } |
@@ -564,6 +567,9 @@ static void irq_gc_resume(void) | |||
564 | list_for_each_entry(gc, &gc_list, list) { | 567 | list_for_each_entry(gc, &gc_list, list) { |
565 | struct irq_chip_type *ct = gc->chip_types; | 568 | struct irq_chip_type *ct = gc->chip_types; |
566 | 569 | ||
570 | if (gc->resume) | ||
571 | gc->resume(gc); | ||
572 | |||
567 | if (ct->chip.irq_resume) { | 573 | if (ct->chip.irq_resume) { |
568 | struct irq_data *data = irq_gc_get_irq_data(gc); | 574 | struct irq_data *data = irq_gc_get_irq_data(gc); |
569 | 575 | ||
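These two hunks give a generic irq chip whole-chip suspend/resume hooks in addition to the existing per-irq_chip_type callbacks, which helps when some state (a wake mask, for instance) is not covered by the core's mask_cache handling. A hedged sketch of a driver using the new gc->suspend/gc->resume members follows; the register offset EXAMPLE_WAKE_REG and the use of gc->private as scratch space are illustrative assumptions only.

    static void example_gc_suspend(struct irq_chip_generic *gc)
    {
            /* save state that the core's mask_cache does not track */
            gc->private = (void *)(unsigned long)irq_reg_readl(gc, EXAMPLE_WAKE_REG);
    }

    static void example_gc_resume(struct irq_chip_generic *gc)
    {
            irq_reg_writel(gc, (unsigned long)gc->private, EXAMPLE_WAKE_REG);
    }

    /* at setup time, after irq_alloc_generic_chip()/irq_setup_generic_chip(): */
    gc->suspend = example_gc_suspend;
    gc->resume  = example_gc_resume;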
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 635480270858..b6eeea8a80c5 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c | |||
@@ -30,7 +30,7 @@ | |||
30 | void handle_bad_irq(unsigned int irq, struct irq_desc *desc) | 30 | void handle_bad_irq(unsigned int irq, struct irq_desc *desc) |
31 | { | 31 | { |
32 | print_irq_desc(irq, desc); | 32 | print_irq_desc(irq, desc); |
33 | kstat_incr_irqs_this_cpu(irq, desc); | 33 | kstat_incr_irqs_this_cpu(desc); |
34 | ack_bad_irq(irq); | 34 | ack_bad_irq(irq); |
35 | } | 35 | } |
36 | 36 | ||
@@ -176,7 +176,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) | |||
176 | add_interrupt_randomness(irq, flags); | 176 | add_interrupt_randomness(irq, flags); |
177 | 177 | ||
178 | if (!noirqdebug) | 178 | if (!noirqdebug) |
179 | note_interrupt(irq, desc, retval); | 179 | note_interrupt(desc, retval); |
180 | return retval; | 180 | return retval; |
181 | } | 181 | } |
182 | 182 | ||
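As in chip.c, the redundant irq argument disappears wherever a struct irq_desc is already in hand; when the number is still wanted for reporting, it is recovered with irq_desc_get_irq(). A minimal sketch of what a caller inside kernel/irq looks like after this change; account_and_note() is a hypothetical wrapper used only for illustration.

    static void account_and_note(struct irq_desc *desc, irqreturn_t ret)
    {
            kstat_incr_irqs_this_cpu(desc);          /* desc-only, no irq argument */
            if (!noirqdebug)
                    note_interrupt(desc, ret);
            pr_debug("irq %u returned %d\n", irq_desc_get_irq(desc), ret);
    }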
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 61008b8433ab..eee4b385cffb 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h | |||
@@ -59,10 +59,9 @@ enum { | |||
59 | #include "debug.h" | 59 | #include "debug.h" |
60 | #include "settings.h" | 60 | #include "settings.h" |
61 | 61 | ||
62 | extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, | 62 | extern int __irq_set_trigger(struct irq_desc *desc, unsigned long flags); |
63 | unsigned long flags); | 63 | extern void __disable_irq(struct irq_desc *desc); |
64 | extern void __disable_irq(struct irq_desc *desc, unsigned int irq); | 64 | extern void __enable_irq(struct irq_desc *desc); |
65 | extern void __enable_irq(struct irq_desc *desc, unsigned int irq); | ||
66 | 65 | ||
67 | extern int irq_startup(struct irq_desc *desc, bool resend); | 66 | extern int irq_startup(struct irq_desc *desc, bool resend); |
68 | extern void irq_shutdown(struct irq_desc *desc); | 67 | extern void irq_shutdown(struct irq_desc *desc); |
@@ -86,7 +85,7 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *act | |||
86 | irqreturn_t handle_irq_event(struct irq_desc *desc); | 85 | irqreturn_t handle_irq_event(struct irq_desc *desc); |
87 | 86 | ||
88 | /* Resending of interrupts :*/ | 87 | /* Resending of interrupts :*/ |
89 | void check_irq_resend(struct irq_desc *desc, unsigned int irq); | 88 | void check_irq_resend(struct irq_desc *desc); |
90 | bool irq_wait_for_poll(struct irq_desc *desc); | 89 | bool irq_wait_for_poll(struct irq_desc *desc); |
91 | void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action); | 90 | void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action); |
92 | 91 | ||
@@ -187,7 +186,7 @@ static inline bool irqd_has_set(struct irq_data *d, unsigned int mask) | |||
187 | return __irqd_to_state(d) & mask; | 186 | return __irqd_to_state(d) & mask; |
188 | } | 187 | } |
189 | 188 | ||
190 | static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *desc) | 189 | static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc) |
191 | { | 190 | { |
192 | __this_cpu_inc(*desc->kstat_irqs); | 191 | __this_cpu_inc(*desc->kstat_irqs); |
193 | __this_cpu_inc(kstat.irqs_sum); | 192 | __this_cpu_inc(kstat.irqs_sum); |
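The inline above only bumps two per-CPU counters, which is why dropping its irq parameter costs nothing. For reference, a minimal sketch of the consumer side, mirroring what kstat_irqs() does when /proc/interrupts or /proc/stat is generated; total_irq_hits() is a hypothetical helper, not kernel API.

    #include <linux/kernel_stat.h>

    static unsigned int total_irq_hits(unsigned int irq)
    {
            unsigned int cpu, sum = 0;

            for_each_possible_cpu(cpu)
                    sum += kstat_irqs_cpu(irq, cpu);
            return sum;
    }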
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 4afc457613dd..0a2a4b697bcb 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c | |||
@@ -582,7 +582,7 @@ int irq_set_percpu_devid(unsigned int irq) | |||
582 | 582 | ||
583 | void kstat_incr_irq_this_cpu(unsigned int irq) | 583 | void kstat_incr_irq_this_cpu(unsigned int irq) |
584 | { | 584 | { |
585 | kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); | 585 | kstat_incr_irqs_this_cpu(irq_to_desc(irq)); |
586 | } | 586 | } |
587 | 587 | ||
588 | /** | 588 | /** |
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 8c3577fef78c..79baaf8a7813 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c | |||
@@ -187,10 +187,12 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, | |||
187 | EXPORT_SYMBOL_GPL(irq_domain_add_legacy); | 187 | EXPORT_SYMBOL_GPL(irq_domain_add_legacy); |
188 | 188 | ||
189 | /** | 189 | /** |
190 | * irq_find_host() - Locates a domain for a given device node | 190 | * irq_find_matching_host() - Locates a domain for a given device node |
191 | * @node: device-tree node of the interrupt controller | 191 | * @node: device-tree node of the interrupt controller |
192 | * @bus_token: domain-specific data | ||
192 | */ | 193 | */ |
193 | struct irq_domain *irq_find_host(struct device_node *node) | 194 | struct irq_domain *irq_find_matching_host(struct device_node *node, |
195 | enum irq_domain_bus_token bus_token) | ||
194 | { | 196 | { |
195 | struct irq_domain *h, *found = NULL; | 197 | struct irq_domain *h, *found = NULL; |
196 | int rc; | 198 | int rc; |
@@ -199,13 +201,19 @@ struct irq_domain *irq_find_host(struct device_node *node) | |||
199 | * it might potentially be set to match all interrupts in | 201 | * it might potentially be set to match all interrupts in |
200 | * the absence of a device node. This isn't a problem so far | 202 | * the absence of a device node. This isn't a problem so far |
201 | * yet though... | 203 | * yet though... |
204 | * | ||
205 | * bus_token == DOMAIN_BUS_ANY matches any domain, any other | ||
206 | * values must generate an exact match for the domain to be | ||
207 | * selected. | ||
202 | */ | 208 | */ |
203 | mutex_lock(&irq_domain_mutex); | 209 | mutex_lock(&irq_domain_mutex); |
204 | list_for_each_entry(h, &irq_domain_list, link) { | 210 | list_for_each_entry(h, &irq_domain_list, link) { |
205 | if (h->ops->match) | 211 | if (h->ops->match) |
206 | rc = h->ops->match(h, node); | 212 | rc = h->ops->match(h, node, bus_token); |
207 | else | 213 | else |
208 | rc = (h->of_node != NULL) && (h->of_node == node); | 214 | rc = ((h->of_node != NULL) && (h->of_node == node) && |
215 | ((bus_token == DOMAIN_BUS_ANY) || | ||
216 | (h->bus_token == bus_token))); | ||
209 | 217 | ||
210 | if (rc) { | 218 | if (rc) { |
211 | found = h; | 219 | found = h; |
@@ -215,7 +223,7 @@ struct irq_domain *irq_find_host(struct device_node *node) | |||
215 | mutex_unlock(&irq_domain_mutex); | 223 | mutex_unlock(&irq_domain_mutex); |
216 | return found; | 224 | return found; |
217 | } | 225 | } |
218 | EXPORT_SYMBOL_GPL(irq_find_host); | 226 | EXPORT_SYMBOL_GPL(irq_find_matching_host); |
219 | 227 | ||
220 | /** | 228 | /** |
221 | * irq_set_default_host() - Set a "default" irq domain | 229 | * irq_set_default_host() - Set a "default" irq domain |
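irq_find_host() becomes irq_find_matching_host() so that several irq domains can hang off the same device-tree node (for instance a wired-interrupt domain and an MSI domain of one controller) and be told apart by a bus token. A hedged lookup sketch; DOMAIN_BUS_WIRED and DOMAIN_BUS_ANY are the enum irq_domain_bus_token values this series is expected to provide in include/linux/irqdomain.h, and np is a device_node obtained elsewhere.

    #include <linux/irqdomain.h>

    static struct irq_domain *find_wired_domain(struct device_node *np)
    {
            struct irq_domain *d;

            d = irq_find_matching_host(np, DOMAIN_BUS_WIRED);
            if (!d)
                    /* DOMAIN_BUS_ANY preserves the old irq_find_host() semantics */
                    d = irq_find_matching_host(np, DOMAIN_BUS_ANY);
            return d;
    }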
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index f9744853b656..ad1b064f94fe 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
@@ -115,6 +115,14 @@ EXPORT_SYMBOL(synchronize_irq); | |||
115 | #ifdef CONFIG_SMP | 115 | #ifdef CONFIG_SMP |
116 | cpumask_var_t irq_default_affinity; | 116 | cpumask_var_t irq_default_affinity; |
117 | 117 | ||
118 | static int __irq_can_set_affinity(struct irq_desc *desc) | ||
119 | { | ||
120 | if (!desc || !irqd_can_balance(&desc->irq_data) || | ||
121 | !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity) | ||
122 | return 0; | ||
123 | return 1; | ||
124 | } | ||
125 | |||
118 | /** | 126 | /** |
119 | * irq_can_set_affinity - Check if the affinity of a given irq can be set | 127 | * irq_can_set_affinity - Check if the affinity of a given irq can be set |
120 | * @irq: Interrupt to check | 128 | * @irq: Interrupt to check |
@@ -122,13 +130,7 @@ cpumask_var_t irq_default_affinity; | |||
122 | */ | 130 | */ |
123 | int irq_can_set_affinity(unsigned int irq) | 131 | int irq_can_set_affinity(unsigned int irq) |
124 | { | 132 | { |
125 | struct irq_desc *desc = irq_to_desc(irq); | 133 | return __irq_can_set_affinity(irq_to_desc(irq)); |
126 | |||
127 | if (!desc || !irqd_can_balance(&desc->irq_data) || | ||
128 | !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity) | ||
129 | return 0; | ||
130 | |||
131 | return 1; | ||
132 | } | 134 | } |
133 | 135 | ||
134 | /** | 136 | /** |
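Factoring the descriptor checks into __irq_can_set_affinity() is purely internal; the driver-facing interface is unchanged. For context, a short hedged sketch of that interface; pin_vector(), vec_irq and target_cpu are placeholders.

    #include <linux/interrupt.h>
    #include <linux/irq.h>

    static void pin_vector(unsigned int vec_irq, int target_cpu)
    {
            if (irq_can_set_affinity(vec_irq))
                    irq_set_affinity_hint(vec_irq, cpumask_of(target_cpu));
    }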
@@ -359,14 +361,13 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier); | |||
359 | /* | 361 | /* |
360 | * Generic version of the affinity autoselector. | 362 | * Generic version of the affinity autoselector. |
361 | */ | 363 | */ |
362 | static int | 364 | static int setup_affinity(struct irq_desc *desc, struct cpumask *mask) |
363 | setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) | ||
364 | { | 365 | { |
365 | struct cpumask *set = irq_default_affinity; | 366 | struct cpumask *set = irq_default_affinity; |
366 | int node = irq_desc_get_node(desc); | 367 | int node = irq_desc_get_node(desc); |
367 | 368 | ||
368 | /* Excludes PER_CPU and NO_BALANCE interrupts */ | 369 | /* Excludes PER_CPU and NO_BALANCE interrupts */ |
369 | if (!irq_can_set_affinity(irq)) | 370 | if (!__irq_can_set_affinity(desc)) |
370 | return 0; | 371 | return 0; |
371 | 372 | ||
372 | /* | 373 | /* |
@@ -393,10 +394,10 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) | |||
393 | return 0; | 394 | return 0; |
394 | } | 395 | } |
395 | #else | 396 | #else |
396 | static inline int | 397 | /* Wrapper for ALPHA specific affinity selector magic */ |
397 | setup_affinity(unsigned int irq, struct irq_desc *d, struct cpumask *mask) | 398 | static inline int setup_affinity(struct irq_desc *d, struct cpumask *mask) |
398 | { | 399 | { |
399 | return irq_select_affinity(irq); | 400 | return irq_select_affinity(irq_desc_get_irq(d)); |
400 | } | 401 | } |
401 | #endif | 402 | #endif |
402 | 403 | ||
@@ -410,20 +411,20 @@ int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask) | |||
410 | int ret; | 411 | int ret; |
411 | 412 | ||
412 | raw_spin_lock_irqsave(&desc->lock, flags); | 413 | raw_spin_lock_irqsave(&desc->lock, flags); |
413 | ret = setup_affinity(irq, desc, mask); | 414 | ret = setup_affinity(desc, mask); |
414 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 415 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
415 | return ret; | 416 | return ret; |
416 | } | 417 | } |
417 | 418 | ||
418 | #else | 419 | #else |
419 | static inline int | 420 | static inline int |
420 | setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) | 421 | setup_affinity(struct irq_desc *desc, struct cpumask *mask) |
421 | { | 422 | { |
422 | return 0; | 423 | return 0; |
423 | } | 424 | } |
424 | #endif | 425 | #endif |
425 | 426 | ||
426 | void __disable_irq(struct irq_desc *desc, unsigned int irq) | 427 | void __disable_irq(struct irq_desc *desc) |
427 | { | 428 | { |
428 | if (!desc->depth++) | 429 | if (!desc->depth++) |
429 | irq_disable(desc); | 430 | irq_disable(desc); |
@@ -436,7 +437,7 @@ static int __disable_irq_nosync(unsigned int irq) | |||
436 | 437 | ||
437 | if (!desc) | 438 | if (!desc) |
438 | return -EINVAL; | 439 | return -EINVAL; |
439 | __disable_irq(desc, irq); | 440 | __disable_irq(desc); |
440 | irq_put_desc_busunlock(desc, flags); | 441 | irq_put_desc_busunlock(desc, flags); |
441 | return 0; | 442 | return 0; |
442 | } | 443 | } |
@@ -503,12 +504,13 @@ bool disable_hardirq(unsigned int irq) | |||
503 | } | 504 | } |
504 | EXPORT_SYMBOL_GPL(disable_hardirq); | 505 | EXPORT_SYMBOL_GPL(disable_hardirq); |
505 | 506 | ||
506 | void __enable_irq(struct irq_desc *desc, unsigned int irq) | 507 | void __enable_irq(struct irq_desc *desc) |
507 | { | 508 | { |
508 | switch (desc->depth) { | 509 | switch (desc->depth) { |
509 | case 0: | 510 | case 0: |
510 | err_out: | 511 | err_out: |
511 | WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq); | 512 | WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", |
513 | irq_desc_get_irq(desc)); | ||
512 | break; | 514 | break; |
513 | case 1: { | 515 | case 1: { |
514 | if (desc->istate & IRQS_SUSPENDED) | 516 | if (desc->istate & IRQS_SUSPENDED) |
@@ -516,7 +518,7 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq) | |||
516 | /* Prevent probing on this irq: */ | 518 | /* Prevent probing on this irq: */ |
517 | irq_settings_set_noprobe(desc); | 519 | irq_settings_set_noprobe(desc); |
518 | irq_enable(desc); | 520 | irq_enable(desc); |
519 | check_irq_resend(desc, irq); | 521 | check_irq_resend(desc); |
520 | /* fall-through */ | 522 | /* fall-through */ |
521 | } | 523 | } |
522 | default: | 524 | default: |
@@ -546,7 +548,7 @@ void enable_irq(unsigned int irq) | |||
546 | KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq)) | 548 | KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq)) |
547 | goto out; | 549 | goto out; |
548 | 550 | ||
549 | __enable_irq(desc, irq); | 551 | __enable_irq(desc); |
550 | out: | 552 | out: |
551 | irq_put_desc_busunlock(desc, flags); | 553 | irq_put_desc_busunlock(desc, flags); |
552 | } | 554 | } |
@@ -637,8 +639,7 @@ int can_request_irq(unsigned int irq, unsigned long irqflags) | |||
637 | return canrequest; | 639 | return canrequest; |
638 | } | 640 | } |
639 | 641 | ||
640 | int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, | 642 | int __irq_set_trigger(struct irq_desc *desc, unsigned long flags) |
641 | unsigned long flags) | ||
642 | { | 643 | { |
643 | struct irq_chip *chip = desc->irq_data.chip; | 644 | struct irq_chip *chip = desc->irq_data.chip; |
644 | int ret, unmask = 0; | 645 | int ret, unmask = 0; |
@@ -648,7 +649,8 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, | |||
648 | * IRQF_TRIGGER_* but the PIC does not support multiple | 649 | * IRQF_TRIGGER_* but the PIC does not support multiple |
649 | * flow-types? | 650 | * flow-types? |
650 | */ | 651 | */ |
651 | pr_debug("No set_type function for IRQ %d (%s)\n", irq, | 652 | pr_debug("No set_type function for IRQ %d (%s)\n", |
653 | irq_desc_get_irq(desc), | ||
652 | chip ? (chip->name ? : "unknown") : "unknown"); | 654 | chip ? (chip->name ? : "unknown") : "unknown"); |
653 | return 0; | 655 | return 0; |
654 | } | 656 | } |
@@ -685,7 +687,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, | |||
685 | break; | 687 | break; |
686 | default: | 688 | default: |
687 | pr_err("Setting trigger mode %lu for irq %u failed (%pF)\n", | 689 | pr_err("Setting trigger mode %lu for irq %u failed (%pF)\n", |
688 | flags, irq, chip->irq_set_type); | 690 | flags, irq_desc_get_irq(desc), chip->irq_set_type); |
689 | } | 691 | } |
690 | if (unmask) | 692 | if (unmask) |
691 | unmask_irq(desc); | 693 | unmask_irq(desc); |
@@ -1221,8 +1223,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
1221 | 1223 | ||
1222 | /* Setup the type (level, edge polarity) if configured: */ | 1224 | /* Setup the type (level, edge polarity) if configured: */ |
1223 | if (new->flags & IRQF_TRIGGER_MASK) { | 1225 | if (new->flags & IRQF_TRIGGER_MASK) { |
1224 | ret = __irq_set_trigger(desc, irq, | 1226 | ret = __irq_set_trigger(desc, |
1225 | new->flags & IRQF_TRIGGER_MASK); | 1227 | new->flags & IRQF_TRIGGER_MASK); |
1226 | 1228 | ||
1227 | if (ret) | 1229 | if (ret) |
1228 | goto out_mask; | 1230 | goto out_mask; |
@@ -1253,7 +1255,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
1253 | } | 1255 | } |
1254 | 1256 | ||
1255 | /* Set default affinity mask once everything is setup */ | 1257 | /* Set default affinity mask once everything is setup */ |
1256 | setup_affinity(irq, desc, mask); | 1258 | setup_affinity(desc, mask); |
1257 | 1259 | ||
1258 | } else if (new->flags & IRQF_TRIGGER_MASK) { | 1260 | } else if (new->flags & IRQF_TRIGGER_MASK) { |
1259 | unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK; | 1261 | unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK; |
@@ -1280,7 +1282,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
1280 | */ | 1282 | */ |
1281 | if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) { | 1283 | if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) { |
1282 | desc->istate &= ~IRQS_SPURIOUS_DISABLED; | 1284 | desc->istate &= ~IRQS_SPURIOUS_DISABLED; |
1283 | __enable_irq(desc, irq); | 1285 | __enable_irq(desc); |
1284 | } | 1286 | } |
1285 | 1287 | ||
1286 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 1288 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
@@ -1650,7 +1652,7 @@ void enable_percpu_irq(unsigned int irq, unsigned int type) | |||
1650 | if (type != IRQ_TYPE_NONE) { | 1652 | if (type != IRQ_TYPE_NONE) { |
1651 | int ret; | 1653 | int ret; |
1652 | 1654 | ||
1653 | ret = __irq_set_trigger(desc, irq, type); | 1655 | ret = __irq_set_trigger(desc, type); |
1654 | 1656 | ||
1655 | if (ret) { | 1657 | if (ret) { |
1656 | WARN(1, "failed to set type for IRQ%d\n", irq); | 1658 | WARN(1, "failed to set type for IRQ%d\n", irq); |
@@ -1875,6 +1877,7 @@ int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which, | |||
1875 | irq_put_desc_busunlock(desc, flags); | 1877 | irq_put_desc_busunlock(desc, flags); |
1876 | return err; | 1878 | return err; |
1877 | } | 1879 | } |
1880 | EXPORT_SYMBOL_GPL(irq_get_irqchip_state); | ||
1878 | 1881 | ||
1879 | /** | 1882 | /** |
1880 | * irq_set_irqchip_state - set the state of a forwarded interrupt. | 1883 | * irq_set_irqchip_state - set the state of a forwarded interrupt. |
@@ -1920,3 +1923,4 @@ int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, | |||
1920 | irq_put_desc_busunlock(desc, flags); | 1923 | irq_put_desc_busunlock(desc, flags); |
1921 | return err; | 1924 | return err; |
1922 | } | 1925 | } |
1926 | EXPORT_SYMBOL_GPL(irq_set_irqchip_state); | ||
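Besides the parameter clean-ups, the functional change in manage.c is the export of irq_get_irqchip_state() and irq_set_irqchip_state(), so that modular code (the ARM vGIC forwarding work is the expected consumer) can save and replay the hardware state of a forwarded interrupt. A hedged usage sketch; linux_irq is a placeholder for an interrupt number the module already owns.

    #include <linux/interrupt.h>
    #include <linux/irq.h>

    static int clear_forwarded_pending(unsigned int linux_irq)
    {
            bool pending;
            int err;

            err = irq_get_irqchip_state(linux_irq, IRQCHIP_STATE_PENDING, &pending);
            if (!err && pending)
                    err = irq_set_irqchip_state(linux_irq, IRQCHIP_STATE_PENDING, false);
            return err;
    }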
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 7bf1f1bbb7fa..7e6512b9dc1f 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c | |||
@@ -18,6 +18,23 @@ | |||
18 | /* Temporary solution for building, will be removed later */ | 18 | /* Temporary solution for building, will be removed later */ |
19 | #include <linux/pci.h> | 19 | #include <linux/pci.h> |
20 | 20 | ||
21 | struct msi_desc *alloc_msi_entry(struct device *dev) | ||
22 | { | ||
23 | struct msi_desc *desc = kzalloc(sizeof(*desc), GFP_KERNEL); | ||
24 | if (!desc) | ||
25 | return NULL; | ||
26 | |||
27 | INIT_LIST_HEAD(&desc->list); | ||
28 | desc->dev = dev; | ||
29 | |||
30 | return desc; | ||
31 | } | ||
32 | |||
33 | void free_msi_entry(struct msi_desc *entry) | ||
34 | { | ||
35 | kfree(entry); | ||
36 | } | ||
37 | |||
21 | void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg) | 38 | void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg) |
22 | { | 39 | { |
23 | *msg = entry->msg; | 40 | *msg = entry->msg; |
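alloc_msi_entry()/free_msi_entry() give the generic MSI layer one bus-agnostic way to create and destroy descriptors instead of each stack open-coding the kzalloc. A hedged sketch of the intended calling pattern; example_setup_entry() is hypothetical, and the bus-specific fields of the msi_desc still have to be filled in by the caller.

    static struct msi_desc *example_setup_entry(struct device *dev)
    {
            struct msi_desc *desc = alloc_msi_entry(dev);   /* dev owns the MSIs */

            if (!desc)
                    return NULL;
            /* ... fill in bus-specific fields, link desc into the device's MSI list ... */
            /* on teardown or error: free_msi_entry(desc); */
            return desc;
    }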
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index d22786a6dbde..21c62617a35a 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c | |||
@@ -68,7 +68,7 @@ void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) | |||
68 | desc->cond_suspend_depth--; | 68 | desc->cond_suspend_depth--; |
69 | } | 69 | } |
70 | 70 | ||
71 | static bool suspend_device_irq(struct irq_desc *desc, int irq) | 71 | static bool suspend_device_irq(struct irq_desc *desc) |
72 | { | 72 | { |
73 | if (!desc->action || desc->no_suspend_depth) | 73 | if (!desc->action || desc->no_suspend_depth) |
74 | return false; | 74 | return false; |
@@ -85,7 +85,7 @@ static bool suspend_device_irq(struct irq_desc *desc, int irq) | |||
85 | } | 85 | } |
86 | 86 | ||
87 | desc->istate |= IRQS_SUSPENDED; | 87 | desc->istate |= IRQS_SUSPENDED; |
88 | __disable_irq(desc, irq); | 88 | __disable_irq(desc); |
89 | 89 | ||
90 | /* | 90 | /* |
91 | * Hardware which has no wakeup source configuration facility | 91 | * Hardware which has no wakeup source configuration facility |
@@ -126,7 +126,7 @@ void suspend_device_irqs(void) | |||
126 | if (irq_settings_is_nested_thread(desc)) | 126 | if (irq_settings_is_nested_thread(desc)) |
127 | continue; | 127 | continue; |
128 | raw_spin_lock_irqsave(&desc->lock, flags); | 128 | raw_spin_lock_irqsave(&desc->lock, flags); |
129 | sync = suspend_device_irq(desc, irq); | 129 | sync = suspend_device_irq(desc); |
130 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 130 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
131 | 131 | ||
132 | if (sync) | 132 | if (sync) |
@@ -135,7 +135,7 @@ void suspend_device_irqs(void) | |||
135 | } | 135 | } |
136 | EXPORT_SYMBOL_GPL(suspend_device_irqs); | 136 | EXPORT_SYMBOL_GPL(suspend_device_irqs); |
137 | 137 | ||
138 | static void resume_irq(struct irq_desc *desc, int irq) | 138 | static void resume_irq(struct irq_desc *desc) |
139 | { | 139 | { |
140 | irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED); | 140 | irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED); |
141 | 141 | ||
@@ -150,7 +150,7 @@ static void resume_irq(struct irq_desc *desc, int irq) | |||
150 | desc->depth++; | 150 | desc->depth++; |
151 | resume: | 151 | resume: |
152 | desc->istate &= ~IRQS_SUSPENDED; | 152 | desc->istate &= ~IRQS_SUSPENDED; |
153 | __enable_irq(desc, irq); | 153 | __enable_irq(desc); |
154 | } | 154 | } |
155 | 155 | ||
156 | static void resume_irqs(bool want_early) | 156 | static void resume_irqs(bool want_early) |
@@ -169,7 +169,7 @@ static void resume_irqs(bool want_early) | |||
169 | continue; | 169 | continue; |
170 | 170 | ||
171 | raw_spin_lock_irqsave(&desc->lock, flags); | 171 | raw_spin_lock_irqsave(&desc->lock, flags); |
172 | resume_irq(desc, irq); | 172 | resume_irq(desc); |
173 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 173 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
174 | } | 174 | } |
175 | } | 175 | } |
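The pm.c changes are mechanical, but they sit on the ordering the system suspend core already relies on; restated roughly below (this is existing behaviour, not something introduced by this patch).

    /*
     * suspend_device_irqs();   per descriptor: set IRQS_SUSPENDED, __disable_irq(desc)
     * ... the platform enters and leaves the sleep state ...
     * resume_device_irqs();    per descriptor: clear IRQS_SUSPENDED, __enable_irq(desc),
     *                          with pending edges replayed via check_irq_resend(desc)
     */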
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 9065107f083e..dd95f44f99b2 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c | |||
@@ -53,7 +53,7 @@ static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0); | |||
53 | * | 53 | * |
54 | * Is called with interrupts disabled and desc->lock held. | 54 | * Is called with interrupts disabled and desc->lock held. |
55 | */ | 55 | */ |
56 | void check_irq_resend(struct irq_desc *desc, unsigned int irq) | 56 | void check_irq_resend(struct irq_desc *desc) |
57 | { | 57 | { |
58 | /* | 58 | /* |
59 | * We do not resend level type interrupts. Level type | 59 | * We do not resend level type interrupts. Level type |
@@ -74,14 +74,24 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq) | |||
74 | if (!desc->irq_data.chip->irq_retrigger || | 74 | if (!desc->irq_data.chip->irq_retrigger || |
75 | !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { | 75 | !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { |
76 | #ifdef CONFIG_HARDIRQS_SW_RESEND | 76 | #ifdef CONFIG_HARDIRQS_SW_RESEND |
77 | unsigned int irq = irq_desc_get_irq(desc); | ||
78 | |||
77 | /* | 79 | /* |
78 | * If the interrupt has a parent irq and runs | 80 | * If the interrupt is running in the thread |
79 | * in the thread context of the parent irq, | 81 | * context of the parent irq we need to be |
80 | * retrigger the parent. | 82 | * careful, because we cannot trigger it |
83 | * directly. | ||
81 | */ | 84 | */ |
82 | if (desc->parent_irq && | 85 | if (irq_settings_is_nested_thread(desc)) { |
83 | irq_settings_is_nested_thread(desc)) | 86 | /* |
87 | * If the parent_irq is valid, we | ||
88 | * retrigger the parent, otherwise we | ||
89 | * do nothing. | ||
90 | */ | ||
91 | if (!desc->parent_irq) | ||
92 | return; | ||
84 | irq = desc->parent_irq; | 93 | irq = desc->parent_irq; |
94 | } | ||
85 | /* Set it pending and activate the softirq: */ | 95 | /* Set it pending and activate the softirq: */ |
86 | set_bit(irq, irqs_resend); | 96 | set_bit(irq, irqs_resend); |
87 | tasklet_schedule(&resend_tasklet); | 97 | tasklet_schedule(&resend_tasklet); |
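The rewritten branch makes the nested-thread case explicit: such an interrupt cannot be re-injected directly, so the software resend either targets the registered parent interrupt or is dropped when no parent exists. For context, a hedged sketch of the driver-side setup that creates exactly this situation (an MFD-style child interrupt demultiplexed from a threaded parent); child and parent are placeholder irq numbers.

    #include <linux/irq.h>

    static void example_setup_child(unsigned int child, unsigned int parent)
    {
            irq_set_chip_and_handler(child, &dummy_irq_chip, handle_simple_irq);
            irq_set_nested_thread(child, true);  /* handler runs in the parent's irq thread */
            irq_set_parent(child, parent);       /* check_irq_resend() can retrigger the parent */
    }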
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index e2514b0e439e..32144175458d 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c | |||
@@ -60,7 +60,7 @@ bool irq_wait_for_poll(struct irq_desc *desc) | |||
60 | /* | 60 | /* |
61 | * Recovery handler for misrouted interrupts. | 61 | * Recovery handler for misrouted interrupts. |
62 | */ | 62 | */ |
63 | static int try_one_irq(int irq, struct irq_desc *desc, bool force) | 63 | static int try_one_irq(struct irq_desc *desc, bool force) |
64 | { | 64 | { |
65 | irqreturn_t ret = IRQ_NONE; | 65 | irqreturn_t ret = IRQ_NONE; |
66 | struct irqaction *action; | 66 | struct irqaction *action; |
@@ -133,7 +133,7 @@ static int misrouted_irq(int irq) | |||
133 | if (i == irq) /* Already tried */ | 133 | if (i == irq) /* Already tried */ |
134 | continue; | 134 | continue; |
135 | 135 | ||
136 | if (try_one_irq(i, desc, false)) | 136 | if (try_one_irq(desc, false)) |
137 | ok = 1; | 137 | ok = 1; |
138 | } | 138 | } |
139 | out: | 139 | out: |
@@ -164,7 +164,7 @@ static void poll_spurious_irqs(unsigned long dummy) | |||
164 | continue; | 164 | continue; |
165 | 165 | ||
166 | local_irq_disable(); | 166 | local_irq_disable(); |
167 | try_one_irq(i, desc, true); | 167 | try_one_irq(desc, true); |
168 | local_irq_enable(); | 168 | local_irq_enable(); |
169 | } | 169 | } |
170 | out: | 170 | out: |
@@ -188,10 +188,9 @@ static inline int bad_action_ret(irqreturn_t action_ret) | |||
188 | * (The other 100-of-100,000 interrupts may have been a correctly | 188 | * (The other 100-of-100,000 interrupts may have been a correctly |
189 | * functioning device sharing an IRQ with the failing one) | 189 | * functioning device sharing an IRQ with the failing one) |
190 | */ | 190 | */ |
191 | static void | 191 | static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret) |
192 | __report_bad_irq(unsigned int irq, struct irq_desc *desc, | ||
193 | irqreturn_t action_ret) | ||
194 | { | 192 | { |
193 | unsigned int irq = irq_desc_get_irq(desc); | ||
195 | struct irqaction *action; | 194 | struct irqaction *action; |
196 | unsigned long flags; | 195 | unsigned long flags; |
197 | 196 | ||
@@ -224,14 +223,13 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc, | |||
224 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 223 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
225 | } | 224 | } |
226 | 225 | ||
227 | static void | 226 | static void report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret) |
228 | report_bad_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret) | ||
229 | { | 227 | { |
230 | static int count = 100; | 228 | static int count = 100; |
231 | 229 | ||
232 | if (count > 0) { | 230 | if (count > 0) { |
233 | count--; | 231 | count--; |
234 | __report_bad_irq(irq, desc, action_ret); | 232 | __report_bad_irq(desc, action_ret); |
235 | } | 233 | } |
236 | } | 234 | } |
237 | 235 | ||
@@ -272,15 +270,16 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc, | |||
272 | 270 | ||
273 | #define SPURIOUS_DEFERRED 0x80000000 | 271 | #define SPURIOUS_DEFERRED 0x80000000 |
274 | 272 | ||
275 | void note_interrupt(unsigned int irq, struct irq_desc *desc, | 273 | void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret) |
276 | irqreturn_t action_ret) | ||
277 | { | 274 | { |
275 | unsigned int irq; | ||
276 | |||
278 | if (desc->istate & IRQS_POLL_INPROGRESS || | 277 | if (desc->istate & IRQS_POLL_INPROGRESS || |
279 | irq_settings_is_polled(desc)) | 278 | irq_settings_is_polled(desc)) |
280 | return; | 279 | return; |
281 | 280 | ||
282 | if (bad_action_ret(action_ret)) { | 281 | if (bad_action_ret(action_ret)) { |
283 | report_bad_irq(irq, desc, action_ret); | 282 | report_bad_irq(desc, action_ret); |
284 | return; | 283 | return; |
285 | } | 284 | } |
286 | 285 | ||
@@ -398,6 +397,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, | |||
398 | desc->last_unhandled = jiffies; | 397 | desc->last_unhandled = jiffies; |
399 | } | 398 | } |
400 | 399 | ||
400 | irq = irq_desc_get_irq(desc); | ||
401 | if (unlikely(try_misrouted_irq(irq, desc, action_ret))) { | 401 | if (unlikely(try_misrouted_irq(irq, desc, action_ret))) { |
402 | int ok = misrouted_irq(irq); | 402 | int ok = misrouted_irq(irq); |
403 | if (action_ret == IRQ_NONE) | 403 | if (action_ret == IRQ_NONE) |
@@ -413,7 +413,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, | |||
413 | /* | 413 | /* |
414 | * The interrupt is stuck | 414 | * The interrupt is stuck |
415 | */ | 415 | */ |
416 | __report_bad_irq(irq, desc, action_ret); | 416 | __report_bad_irq(desc, action_ret); |
417 | /* | 417 | /* |
418 | * Now kill the IRQ | 418 | * Now kill the IRQ |
419 | */ | 419 | */ |
diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 52ebaca1b9fc..f7dd15d537f9 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c | |||
@@ -54,7 +54,7 @@ jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop) | |||
54 | sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL); | 54 | sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL); |
55 | } | 55 | } |
56 | 56 | ||
57 | static void jump_label_update(struct static_key *key, int enable); | 57 | static void jump_label_update(struct static_key *key); |
58 | 58 | ||
59 | void static_key_slow_inc(struct static_key *key) | 59 | void static_key_slow_inc(struct static_key *key) |
60 | { | 60 | { |
@@ -63,13 +63,8 @@ void static_key_slow_inc(struct static_key *key) | |||
63 | return; | 63 | return; |
64 | 64 | ||
65 | jump_label_lock(); | 65 | jump_label_lock(); |
66 | if (atomic_read(&key->enabled) == 0) { | 66 | if (atomic_inc_return(&key->enabled) == 1) |
67 | if (!jump_label_get_branch_default(key)) | 67 | jump_label_update(key); |
68 | jump_label_update(key, JUMP_LABEL_ENABLE); | ||
69 | else | ||
70 | jump_label_update(key, JUMP_LABEL_DISABLE); | ||
71 | } | ||
72 | atomic_inc(&key->enabled); | ||
73 | jump_label_unlock(); | 68 | jump_label_unlock(); |
74 | } | 69 | } |
75 | EXPORT_SYMBOL_GPL(static_key_slow_inc); | 70 | EXPORT_SYMBOL_GPL(static_key_slow_inc); |
@@ -87,10 +82,7 @@ static void __static_key_slow_dec(struct static_key *key, | |||
87 | atomic_inc(&key->enabled); | 82 | atomic_inc(&key->enabled); |
88 | schedule_delayed_work(work, rate_limit); | 83 | schedule_delayed_work(work, rate_limit); |
89 | } else { | 84 | } else { |
90 | if (!jump_label_get_branch_default(key)) | 85 | jump_label_update(key); |
91 | jump_label_update(key, JUMP_LABEL_DISABLE); | ||
92 | else | ||
93 | jump_label_update(key, JUMP_LABEL_ENABLE); | ||
94 | } | 86 | } |
95 | jump_label_unlock(); | 87 | jump_label_unlock(); |
96 | } | 88 | } |
@@ -149,7 +141,7 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start, | |||
149 | return 0; | 141 | return 0; |
150 | } | 142 | } |
151 | 143 | ||
152 | /* | 144 | /* |
153 | * Update code which is definitely not currently executing. | 145 | * Update code which is definitely not currently executing. |
154 | * Architectures which need heavyweight synchronization to modify | 146 | * Architectures which need heavyweight synchronization to modify |
155 | * running code can override this to make the non-live update case | 147 | * running code can override this to make the non-live update case |
@@ -158,37 +150,54 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start, | |||
158 | void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry *entry, | 150 | void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry *entry, |
159 | enum jump_label_type type) | 151 | enum jump_label_type type) |
160 | { | 152 | { |
161 | arch_jump_label_transform(entry, type); | 153 | arch_jump_label_transform(entry, type); |
154 | } | ||
155 | |||
156 | static inline struct jump_entry *static_key_entries(struct static_key *key) | ||
157 | { | ||
158 | return (struct jump_entry *)((unsigned long)key->entries & ~JUMP_TYPE_MASK); | ||
159 | } | ||
160 | |||
161 | static inline bool static_key_type(struct static_key *key) | ||
162 | { | ||
163 | return (unsigned long)key->entries & JUMP_TYPE_MASK; | ||
164 | } | ||
165 | |||
166 | static inline struct static_key *jump_entry_key(struct jump_entry *entry) | ||
167 | { | ||
168 | return (struct static_key *)((unsigned long)entry->key & ~1UL); | ||
169 | } | ||
170 | |||
171 | static bool jump_entry_branch(struct jump_entry *entry) | ||
172 | { | ||
173 | return (unsigned long)entry->key & 1UL; | ||
174 | } | ||
175 | |||
176 | static enum jump_label_type jump_label_type(struct jump_entry *entry) | ||
177 | { | ||
178 | struct static_key *key = jump_entry_key(entry); | ||
179 | bool enabled = static_key_enabled(key); | ||
180 | bool branch = jump_entry_branch(entry); | ||
181 | |||
182 | /* See the comment in linux/jump_label.h */ | ||
183 | return enabled ^ branch; | ||
162 | } | 184 | } |
163 | 185 | ||
164 | static void __jump_label_update(struct static_key *key, | 186 | static void __jump_label_update(struct static_key *key, |
165 | struct jump_entry *entry, | 187 | struct jump_entry *entry, |
166 | struct jump_entry *stop, int enable) | 188 | struct jump_entry *stop) |
167 | { | 189 | { |
168 | for (; (entry < stop) && | 190 | for (; (entry < stop) && (jump_entry_key(entry) == key); entry++) { |
169 | (entry->key == (jump_label_t)(unsigned long)key); | ||
170 | entry++) { | ||
171 | /* | 191 | /* |
172 | * entry->code set to 0 invalidates module init text sections | 192 | * entry->code set to 0 invalidates module init text sections |
173 | * kernel_text_address() verifies we are not in core kernel | 193 | * kernel_text_address() verifies we are not in core kernel |
174 | * init code, see jump_label_invalidate_module_init(). | 194 | * init code, see jump_label_invalidate_module_init(). |
175 | */ | 195 | */ |
176 | if (entry->code && kernel_text_address(entry->code)) | 196 | if (entry->code && kernel_text_address(entry->code)) |
177 | arch_jump_label_transform(entry, enable); | 197 | arch_jump_label_transform(entry, jump_label_type(entry)); |
178 | } | 198 | } |
179 | } | 199 | } |
180 | 200 | ||
181 | static enum jump_label_type jump_label_type(struct static_key *key) | ||
182 | { | ||
183 | bool true_branch = jump_label_get_branch_default(key); | ||
184 | bool state = static_key_enabled(key); | ||
185 | |||
186 | if ((!true_branch && state) || (true_branch && !state)) | ||
187 | return JUMP_LABEL_ENABLE; | ||
188 | |||
189 | return JUMP_LABEL_DISABLE; | ||
190 | } | ||
191 | |||
192 | void __init jump_label_init(void) | 201 | void __init jump_label_init(void) |
193 | { | 202 | { |
194 | struct jump_entry *iter_start = __start___jump_table; | 203 | struct jump_entry *iter_start = __start___jump_table; |
@@ -202,8 +211,11 @@ void __init jump_label_init(void) | |||
202 | for (iter = iter_start; iter < iter_stop; iter++) { | 211 | for (iter = iter_start; iter < iter_stop; iter++) { |
203 | struct static_key *iterk; | 212 | struct static_key *iterk; |
204 | 213 | ||
205 | iterk = (struct static_key *)(unsigned long)iter->key; | 214 | /* rewrite NOPs */ |
206 | arch_jump_label_transform_static(iter, jump_label_type(iterk)); | 215 | if (jump_label_type(iter) == JUMP_LABEL_NOP) |
216 | arch_jump_label_transform_static(iter, JUMP_LABEL_NOP); | ||
217 | |||
218 | iterk = jump_entry_key(iter); | ||
207 | if (iterk == key) | 219 | if (iterk == key) |
208 | continue; | 220 | continue; |
209 | 221 | ||
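The helpers introduced above decode a small packing trick: bit 0 of entry->key records whether the site was emitted as a "likely true" branch, the low bits of key->entries record the key's declared type, and the instruction a site needs at any moment is just the XOR of two booleans. Restated as a table, assuming (as in this series) JUMP_LABEL_NOP == 0 and JUMP_LABEL_JMP == 1:

    /*
     * jump_label_type(entry) == static_key_enabled(key) ^ jump_entry_branch(entry)
     *
     *   key enabled | branch bit | instruction to patch in
     *   ------------+------------+------------------------
     *        0      |     0      |     JUMP_LABEL_NOP
     *        0      |     1      |     JUMP_LABEL_JMP
     *        1      |     0      |     JUMP_LABEL_JMP
     *        1      |     1      |     JUMP_LABEL_NOP
     */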
@@ -222,6 +234,16 @@ void __init jump_label_init(void) | |||
222 | 234 | ||
223 | #ifdef CONFIG_MODULES | 235 | #ifdef CONFIG_MODULES |
224 | 236 | ||
237 | static enum jump_label_type jump_label_init_type(struct jump_entry *entry) | ||
238 | { | ||
239 | struct static_key *key = jump_entry_key(entry); | ||
240 | bool type = static_key_type(key); | ||
241 | bool branch = jump_entry_branch(entry); | ||
242 | |||
243 | /* See the comment in linux/jump_label.h */ | ||
244 | return type ^ branch; | ||
245 | } | ||
246 | |||
225 | struct static_key_mod { | 247 | struct static_key_mod { |
226 | struct static_key_mod *next; | 248 | struct static_key_mod *next; |
227 | struct jump_entry *entries; | 249 | struct jump_entry *entries; |
@@ -243,17 +265,15 @@ static int __jump_label_mod_text_reserved(void *start, void *end) | |||
243 | start, end); | 265 | start, end); |
244 | } | 266 | } |
245 | 267 | ||
246 | static void __jump_label_mod_update(struct static_key *key, int enable) | 268 | static void __jump_label_mod_update(struct static_key *key) |
247 | { | 269 | { |
248 | struct static_key_mod *mod = key->next; | 270 | struct static_key_mod *mod; |
249 | 271 | ||
250 | while (mod) { | 272 | for (mod = key->next; mod; mod = mod->next) { |
251 | struct module *m = mod->mod; | 273 | struct module *m = mod->mod; |
252 | 274 | ||
253 | __jump_label_update(key, mod->entries, | 275 | __jump_label_update(key, mod->entries, |
254 | m->jump_entries + m->num_jump_entries, | 276 | m->jump_entries + m->num_jump_entries); |
255 | enable); | ||
256 | mod = mod->next; | ||
257 | } | 277 | } |
258 | } | 278 | } |
259 | 279 | ||
@@ -276,7 +296,9 @@ void jump_label_apply_nops(struct module *mod) | |||
276 | return; | 296 | return; |
277 | 297 | ||
278 | for (iter = iter_start; iter < iter_stop; iter++) { | 298 | for (iter = iter_start; iter < iter_stop; iter++) { |
279 | arch_jump_label_transform_static(iter, JUMP_LABEL_DISABLE); | 299 | /* Only write NOPs for arch_branch_static(). */ |
300 | if (jump_label_init_type(iter) == JUMP_LABEL_NOP) | ||
301 | arch_jump_label_transform_static(iter, JUMP_LABEL_NOP); | ||
280 | } | 302 | } |
281 | } | 303 | } |
282 | 304 | ||
@@ -297,7 +319,7 @@ static int jump_label_add_module(struct module *mod) | |||
297 | for (iter = iter_start; iter < iter_stop; iter++) { | 319 | for (iter = iter_start; iter < iter_stop; iter++) { |
298 | struct static_key *iterk; | 320 | struct static_key *iterk; |
299 | 321 | ||
300 | iterk = (struct static_key *)(unsigned long)iter->key; | 322 | iterk = jump_entry_key(iter); |
301 | if (iterk == key) | 323 | if (iterk == key) |
302 | continue; | 324 | continue; |
303 | 325 | ||
@@ -318,8 +340,9 @@ static int jump_label_add_module(struct module *mod) | |||
318 | jlm->next = key->next; | 340 | jlm->next = key->next; |
319 | key->next = jlm; | 341 | key->next = jlm; |
320 | 342 | ||
321 | if (jump_label_type(key) == JUMP_LABEL_ENABLE) | 343 | /* Only update if we've changed from our initial state */ |
322 | __jump_label_update(key, iter, iter_stop, JUMP_LABEL_ENABLE); | 344 | if (jump_label_type(iter) != jump_label_init_type(iter)) |
345 | __jump_label_update(key, iter, iter_stop); | ||
323 | } | 346 | } |
324 | 347 | ||
325 | return 0; | 348 | return 0; |
@@ -334,10 +357,10 @@ static void jump_label_del_module(struct module *mod) | |||
334 | struct static_key_mod *jlm, **prev; | 357 | struct static_key_mod *jlm, **prev; |
335 | 358 | ||
336 | for (iter = iter_start; iter < iter_stop; iter++) { | 359 | for (iter = iter_start; iter < iter_stop; iter++) { |
337 | if (iter->key == (jump_label_t)(unsigned long)key) | 360 | if (jump_entry_key(iter) == key) |
338 | continue; | 361 | continue; |
339 | 362 | ||
340 | key = (struct static_key *)(unsigned long)iter->key; | 363 | key = jump_entry_key(iter); |
341 | 364 | ||
342 | if (within_module(iter->key, mod)) | 365 | if (within_module(iter->key, mod)) |
343 | continue; | 366 | continue; |
@@ -439,14 +462,14 @@ int jump_label_text_reserved(void *start, void *end) | |||
439 | return ret; | 462 | return ret; |
440 | } | 463 | } |
441 | 464 | ||
442 | static void jump_label_update(struct static_key *key, int enable) | 465 | static void jump_label_update(struct static_key *key) |
443 | { | 466 | { |
444 | struct jump_entry *stop = __stop___jump_table; | 467 | struct jump_entry *stop = __stop___jump_table; |
445 | struct jump_entry *entry = jump_label_get_entries(key); | 468 | struct jump_entry *entry = static_key_entries(key); |
446 | #ifdef CONFIG_MODULES | 469 | #ifdef CONFIG_MODULES |
447 | struct module *mod; | 470 | struct module *mod; |
448 | 471 | ||
449 | __jump_label_mod_update(key, enable); | 472 | __jump_label_mod_update(key); |
450 | 473 | ||
451 | preempt_disable(); | 474 | preempt_disable(); |
452 | mod = __module_address((unsigned long)key); | 475 | mod = __module_address((unsigned long)key); |
@@ -456,7 +479,44 @@ static void jump_label_update(struct static_key *key, int enable) | |||
456 | #endif | 479 | #endif |
457 | /* if there are no users, entry can be NULL */ | 480 | /* if there are no users, entry can be NULL */ |
458 | if (entry) | 481 | if (entry) |
459 | __jump_label_update(key, entry, stop, enable); | 482 | __jump_label_update(key, entry, stop); |
460 | } | 483 | } |
461 | 484 | ||
462 | #endif | 485 | #ifdef CONFIG_STATIC_KEYS_SELFTEST |
486 | static DEFINE_STATIC_KEY_TRUE(sk_true); | ||
487 | static DEFINE_STATIC_KEY_FALSE(sk_false); | ||
488 | |||
489 | static __init int jump_label_test(void) | ||
490 | { | ||
491 | int i; | ||
492 | |||
493 | for (i = 0; i < 2; i++) { | ||
494 | WARN_ON(static_key_enabled(&sk_true.key) != true); | ||
495 | WARN_ON(static_key_enabled(&sk_false.key) != false); | ||
496 | |||
497 | WARN_ON(!static_branch_likely(&sk_true)); | ||
498 | WARN_ON(!static_branch_unlikely(&sk_true)); | ||
499 | WARN_ON(static_branch_likely(&sk_false)); | ||
500 | WARN_ON(static_branch_unlikely(&sk_false)); | ||
501 | |||
502 | static_branch_disable(&sk_true); | ||
503 | static_branch_enable(&sk_false); | ||
504 | |||
505 | WARN_ON(static_key_enabled(&sk_true.key) == true); | ||
506 | WARN_ON(static_key_enabled(&sk_false.key) == false); | ||
507 | |||
508 | WARN_ON(static_branch_likely(&sk_true)); | ||
509 | WARN_ON(static_branch_unlikely(&sk_true)); | ||
510 | WARN_ON(!static_branch_likely(&sk_false)); | ||
511 | WARN_ON(!static_branch_unlikely(&sk_false)); | ||
512 | |||
513 | static_branch_enable(&sk_true); | ||
514 | static_branch_disable(&sk_false); | ||
515 | } | ||
516 | |||
517 | return 0; | ||
518 | } | ||
519 | late_initcall(jump_label_test); | ||
520 | #endif /* STATIC_KEYS_SELFTEST */ | ||
521 | |||
522 | #endif /* HAVE_JUMP_LABEL */ | ||
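The new self-test exercises the static_branch_likely()/static_branch_unlikely() interface that this rework enables. A hedged usage sketch outside the self-test; the key name and the two path functions are hypothetical.

    #include <linux/jump_label.h>

    static DEFINE_STATIC_KEY_FALSE(use_fast_path);

    extern void fast_path(void);
    extern void slow_path(void);

    void do_work(void)
    {
            if (static_branch_unlikely(&use_fast_path))
                    fast_path();    /* this leg is live-patched in once the key is enabled */
            else
                    slow_path();
    }

    static void switch_to_fast_path(void)
    {
            static_branch_enable(&use_fast_path);   /* e.g. from a sysfs store handler */
    }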
diff --git a/kernel/kexec.c b/kernel/kexec.c index a785c1015e25..4c5edc357923 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
@@ -1,156 +1,22 @@ | |||
1 | /* | 1 | /* |
2 | * kexec.c - kexec system call | 2 | * kexec.c - kexec_load system call |
3 | * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> | 3 | * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> |
4 | * | 4 | * |
5 | * This source code is licensed under the GNU General Public License, | 5 | * This source code is licensed under the GNU General Public License, |
6 | * Version 2. See the file COPYING for more details. | 6 | * Version 2. See the file COPYING for more details. |
7 | */ | 7 | */ |
8 | 8 | ||
9 | #define pr_fmt(fmt) "kexec: " fmt | ||
10 | |||
11 | #include <linux/capability.h> | 9 | #include <linux/capability.h> |
12 | #include <linux/mm.h> | 10 | #include <linux/mm.h> |
13 | #include <linux/file.h> | 11 | #include <linux/file.h> |
14 | #include <linux/slab.h> | ||
15 | #include <linux/fs.h> | ||
16 | #include <linux/kexec.h> | 12 | #include <linux/kexec.h> |
17 | #include <linux/mutex.h> | 13 | #include <linux/mutex.h> |
18 | #include <linux/list.h> | 14 | #include <linux/list.h> |
19 | #include <linux/highmem.h> | ||
20 | #include <linux/syscalls.h> | 15 | #include <linux/syscalls.h> |
21 | #include <linux/reboot.h> | ||
22 | #include <linux/ioport.h> | ||
23 | #include <linux/hardirq.h> | ||
24 | #include <linux/elf.h> | ||
25 | #include <linux/elfcore.h> | ||
26 | #include <linux/utsname.h> | ||
27 | #include <linux/numa.h> | ||
28 | #include <linux/suspend.h> | ||
29 | #include <linux/device.h> | ||
30 | #include <linux/freezer.h> | ||
31 | #include <linux/pm.h> | ||
32 | #include <linux/cpu.h> | ||
33 | #include <linux/console.h> | ||
34 | #include <linux/vmalloc.h> | 16 | #include <linux/vmalloc.h> |
35 | #include <linux/swap.h> | 17 | #include <linux/slab.h> |
36 | #include <linux/syscore_ops.h> | ||
37 | #include <linux/compiler.h> | ||
38 | #include <linux/hugetlb.h> | ||
39 | |||
40 | #include <asm/page.h> | ||
41 | #include <asm/uaccess.h> | ||
42 | #include <asm/io.h> | ||
43 | #include <asm/sections.h> | ||
44 | |||
45 | #include <crypto/hash.h> | ||
46 | #include <crypto/sha.h> | ||
47 | |||
48 | /* Per cpu memory for storing cpu states in case of system crash. */ | ||
49 | note_buf_t __percpu *crash_notes; | ||
50 | |||
51 | /* vmcoreinfo stuff */ | ||
52 | static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; | ||
53 | u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; | ||
54 | size_t vmcoreinfo_size; | ||
55 | size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); | ||
56 | |||
57 | /* Flag to indicate we are going to kexec a new kernel */ | ||
58 | bool kexec_in_progress = false; | ||
59 | |||
60 | /* | ||
61 | * Declare these symbols weak so that if architecture provides a purgatory, | ||
62 | * these will be overridden. | ||
63 | */ | ||
64 | char __weak kexec_purgatory[0]; | ||
65 | size_t __weak kexec_purgatory_size = 0; | ||
66 | |||
67 | #ifdef CONFIG_KEXEC_FILE | ||
68 | static int kexec_calculate_store_digests(struct kimage *image); | ||
69 | #endif | ||
70 | |||
71 | /* Location of the reserved area for the crash kernel */ | ||
72 | struct resource crashk_res = { | ||
73 | .name = "Crash kernel", | ||
74 | .start = 0, | ||
75 | .end = 0, | ||
76 | .flags = IORESOURCE_BUSY | IORESOURCE_MEM | ||
77 | }; | ||
78 | struct resource crashk_low_res = { | ||
79 | .name = "Crash kernel", | ||
80 | .start = 0, | ||
81 | .end = 0, | ||
82 | .flags = IORESOURCE_BUSY | IORESOURCE_MEM | ||
83 | }; | ||
84 | |||
85 | int kexec_should_crash(struct task_struct *p) | ||
86 | { | ||
87 | /* | ||
88 | * If crash_kexec_post_notifiers is enabled, don't run | ||
89 | * crash_kexec() here yet, which must be run after panic | ||
90 | * notifiers in panic(). | ||
91 | */ | ||
92 | if (crash_kexec_post_notifiers) | ||
93 | return 0; | ||
94 | /* | ||
95 | * There are 4 panic() calls in do_exit() path, each of which | ||
96 | * corresponds to each of these 4 conditions. | ||
97 | */ | ||
98 | if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops) | ||
99 | return 1; | ||
100 | return 0; | ||
101 | } | ||
102 | |||
103 | /* | ||
104 | * When kexec transitions to the new kernel there is a one-to-one | ||
105 | * mapping between physical and virtual addresses. On processors | ||
106 | * where you can disable the MMU this is trivial, and easy. For | ||
107 | * others it is still a simple predictable page table to setup. | ||
108 | * | ||
109 | * In that environment kexec copies the new kernel to its final | ||
110 | * resting place. This means I can only support memory whose | ||
111 | * physical address can fit in an unsigned long. In particular | ||
112 | * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled. | ||
113 | * If the assembly stub has more restrictive requirements | ||
114 | * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be | ||
115 | * defined more restrictively in <asm/kexec.h>. | ||
116 | * | ||
117 | * The code for the transition from the current kernel to the | ||
118 | * the new kernel is placed in the control_code_buffer, whose size | ||
119 | * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single | ||
120 | * page of memory is necessary, but some architectures require more. | ||
121 | * Because this memory must be identity mapped in the transition from | ||
122 | * virtual to physical addresses it must live in the range | ||
123 | * 0 - TASK_SIZE, as only the user space mappings are arbitrarily | ||
124 | * modifiable. | ||
125 | * | ||
126 | * The assembly stub in the control code buffer is passed a linked list | ||
127 | * of descriptor pages detailing the source pages of the new kernel, | ||
128 | * and the destination addresses of those source pages. As this data | ||
129 | * structure is not used in the context of the current OS, it must | ||
130 | * be self-contained. | ||
131 | * | ||
132 | * The code has been made to work with highmem pages and will use a | ||
133 | * destination page in its final resting place (if it happens | ||
134 | * to allocate it). The end product of this is that most of the | ||
135 | * physical address space, and most of RAM can be used. | ||
136 | * | ||
137 | * Future directions include: | ||
138 | * - allocating a page table with the control code buffer identity | ||
139 | * mapped, to simplify machine_kexec and make kexec_on_panic more | ||
140 | * reliable. | ||
141 | */ | ||
142 | |||
143 | /* | ||
144 | * KIMAGE_NO_DEST is an impossible destination address..., for | ||
145 | * allocating pages whose destination address we do not care about. | ||
146 | */ | ||
147 | #define KIMAGE_NO_DEST (-1UL) | ||
148 | 18 | ||
149 | static int kimage_is_destination_range(struct kimage *image, | 19 | #include "kexec_internal.h" |
150 | unsigned long start, unsigned long end); | ||
151 | static struct page *kimage_alloc_page(struct kimage *image, | ||
152 | gfp_t gfp_mask, | ||
153 | unsigned long dest); | ||
154 | 20 | ||
155 | static int copy_user_segment_list(struct kimage *image, | 21 | static int copy_user_segment_list(struct kimage *image, |
156 | unsigned long nr_segments, | 22 | unsigned long nr_segments, |
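This hunk is the visible edge of the kexec split: the shared machinery (image allocation, crash notes, purgatory handling) moves out of kexec.c into the new kexec_core.c, the shared declarations land in kexec_internal.h, and kexec.c keeps only the kexec_load(2) entry points, with the file-based kexec_file_load(2) side living in its own file behind CONFIG_KEXEC_FILE. For orientation, a hedged userspace sketch of the syscall this file now services; the single-segment layout, 4 KiB page size and the requirement that dest be page aligned are simplifying assumptions.

    #include <sys/syscall.h>
    #include <unistd.h>
    #include <linux/kexec.h>

    /* load one page-aligned segment at physical address 'dest' */
    static int load_one_segment(void *buf, size_t len, unsigned long dest, unsigned long entry)
    {
            struct kexec_segment seg = {
                    .buf   = buf,
                    .bufsz = len,
                    .mem   = (void *)dest,
                    .memsz = (len + 4095) & ~4095UL,   /* assumes 4 KiB pages */
            };

            return syscall(SYS_kexec_load, entry, 1UL, &seg, KEXEC_ARCH_DEFAULT);
    }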
@@ -169,125 +35,6 @@ static int copy_user_segment_list(struct kimage *image, | |||
169 | return ret; | 35 | return ret; |
170 | } | 36 | } |
171 | 37 | ||
172 | static int sanity_check_segment_list(struct kimage *image) | ||
173 | { | ||
174 | int result, i; | ||
175 | unsigned long nr_segments = image->nr_segments; | ||
176 | |||
177 | /* | ||
178 | * Verify we have good destination addresses. The caller is | ||
179 | * responsible for making certain we don't attempt to load | ||
180 | * the new image into invalid or reserved areas of RAM. This | ||
181 | * just verifies it is an address we can use. | ||
182 | * | ||
183 | * Since the kernel does everything in page size chunks ensure | ||
184 | * the destination addresses are page aligned. Too many | ||
185 | * special cases crop of when we don't do this. The most | ||
186 | * insidious is getting overlapping destination addresses | ||
187 | * simply because addresses are changed to page size | ||
188 | * granularity. | ||
189 | */ | ||
190 | result = -EADDRNOTAVAIL; | ||
191 | for (i = 0; i < nr_segments; i++) { | ||
192 | unsigned long mstart, mend; | ||
193 | |||
194 | mstart = image->segment[i].mem; | ||
195 | mend = mstart + image->segment[i].memsz; | ||
196 | if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK)) | ||
197 | return result; | ||
198 | if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) | ||
199 | return result; | ||
200 | } | ||
201 | |||
202 | /* Verify our destination addresses do not overlap. | ||
203 | * If we alloed overlapping destination addresses | ||
204 | * through very weird things can happen with no | ||
205 | * easy explanation as one segment stops on another. | ||
206 | */ | ||
207 | result = -EINVAL; | ||
208 | for (i = 0; i < nr_segments; i++) { | ||
209 | unsigned long mstart, mend; | ||
210 | unsigned long j; | ||
211 | |||
212 | mstart = image->segment[i].mem; | ||
213 | mend = mstart + image->segment[i].memsz; | ||
214 | for (j = 0; j < i; j++) { | ||
215 | unsigned long pstart, pend; | ||
216 | pstart = image->segment[j].mem; | ||
217 | pend = pstart + image->segment[j].memsz; | ||
218 | /* Do the segments overlap ? */ | ||
219 | if ((mend > pstart) && (mstart < pend)) | ||
220 | return result; | ||
221 | } | ||
222 | } | ||
223 | |||
224 | /* Ensure our buffer sizes are strictly less than | ||
225 | * our memory sizes. This should always be the case, | ||
226 | * and it is easier to check up front than to be surprised | ||
227 | * later on. | ||
228 | */ | ||
229 | result = -EINVAL; | ||
230 | for (i = 0; i < nr_segments; i++) { | ||
231 | if (image->segment[i].bufsz > image->segment[i].memsz) | ||
232 | return result; | ||
233 | } | ||
234 | |||
235 | /* | ||
236 | * Verify we have good destination addresses. Normally | ||
237 | * the caller is responsible for making certain we don't | ||
238 | * attempt to load the new image into invalid or reserved | ||
239 | * areas of RAM. But crash kernels are preloaded into a | ||
240 | * reserved area of ram. We must ensure the addresses | ||
241 | * are in the reserved area otherwise preloading the | ||
242 | * kernel could corrupt things. | ||
243 | */ | ||
244 | |||
245 | if (image->type == KEXEC_TYPE_CRASH) { | ||
246 | result = -EADDRNOTAVAIL; | ||
247 | for (i = 0; i < nr_segments; i++) { | ||
248 | unsigned long mstart, mend; | ||
249 | |||
250 | mstart = image->segment[i].mem; | ||
251 | mend = mstart + image->segment[i].memsz - 1; | ||
252 | /* Ensure we are within the crash kernel limits */ | ||
253 | if ((mstart < crashk_res.start) || | ||
254 | (mend > crashk_res.end)) | ||
255 | return result; | ||
256 | } | ||
257 | } | ||
258 | |||
259 | return 0; | ||
260 | } | ||
261 | |||
262 | static struct kimage *do_kimage_alloc_init(void) | ||
263 | { | ||
264 | struct kimage *image; | ||
265 | |||
266 | /* Allocate a controlling structure */ | ||
267 | image = kzalloc(sizeof(*image), GFP_KERNEL); | ||
268 | if (!image) | ||
269 | return NULL; | ||
270 | |||
271 | image->head = 0; | ||
272 | image->entry = &image->head; | ||
273 | image->last_entry = &image->head; | ||
274 | image->control_page = ~0; /* By default this does not apply */ | ||
275 | image->type = KEXEC_TYPE_DEFAULT; | ||
276 | |||
277 | /* Initialize the list of control pages */ | ||
278 | INIT_LIST_HEAD(&image->control_pages); | ||
279 | |||
280 | /* Initialize the list of destination pages */ | ||
281 | INIT_LIST_HEAD(&image->dest_pages); | ||
282 | |||
283 | /* Initialize the list of unusable pages */ | ||
284 | INIT_LIST_HEAD(&image->unusable_pages); | ||
285 | |||
286 | return image; | ||
287 | } | ||
288 | |||
289 | static void kimage_free_page_list(struct list_head *list); | ||
290 | |||
291 | static int kimage_alloc_init(struct kimage **rimage, unsigned long entry, | 38 | static int kimage_alloc_init(struct kimage **rimage, unsigned long entry, |
292 | unsigned long nr_segments, | 39 | unsigned long nr_segments, |
293 | struct kexec_segment __user *segments, | 40 | struct kexec_segment __user *segments, |
@@ -354,873 +101,6 @@ out_free_image: | |||
354 | return ret; | 101 | return ret; |
355 | } | 102 | } |
356 | 103 | ||
357 | #ifdef CONFIG_KEXEC_FILE | ||
358 | static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len) | ||
359 | { | ||
360 | struct fd f = fdget(fd); | ||
361 | int ret; | ||
362 | struct kstat stat; | ||
363 | loff_t pos; | ||
364 | ssize_t bytes = 0; | ||
365 | |||
366 | if (!f.file) | ||
367 | return -EBADF; | ||
368 | |||
369 | ret = vfs_getattr(&f.file->f_path, &stat); | ||
370 | if (ret) | ||
371 | goto out; | ||
372 | |||
373 | if (stat.size > INT_MAX) { | ||
374 | ret = -EFBIG; | ||
375 | goto out; | ||
376 | } | ||
377 | |||
378 | /* Don't hand 0 to vmalloc, it whines. */ | ||
379 | if (stat.size == 0) { | ||
380 | ret = -EINVAL; | ||
381 | goto out; | ||
382 | } | ||
383 | |||
384 | *buf = vmalloc(stat.size); | ||
385 | if (!*buf) { | ||
386 | ret = -ENOMEM; | ||
387 | goto out; | ||
388 | } | ||
389 | |||
390 | pos = 0; | ||
391 | while (pos < stat.size) { | ||
392 | bytes = kernel_read(f.file, pos, (char *)(*buf) + pos, | ||
393 | stat.size - pos); | ||
394 | if (bytes < 0) { | ||
395 | vfree(*buf); | ||
396 | ret = bytes; | ||
397 | goto out; | ||
398 | } | ||
399 | |||
400 | if (bytes == 0) | ||
401 | break; | ||
402 | pos += bytes; | ||
403 | } | ||
404 | |||
405 | if (pos != stat.size) { | ||
406 | ret = -EBADF; | ||
407 | vfree(*buf); | ||
408 | goto out; | ||
409 | } | ||
410 | |||
411 | *buf_len = pos; | ||
412 | out: | ||
413 | fdput(f); | ||
414 | return ret; | ||
415 | } | ||
416 | |||
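copy_file_from_fd() sizes its buffer from the file attributes and then loops until the whole file is in memory, treating a short total read as an error. A rough userspace equivalent of that stat-then-read-loop shape, using fstat() and read() (the helper name and error handling are illustrative only):

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/stat.h>

/* Read an entire regular file into a malloc'd buffer; mirrors the
 * stat-then-read-loop structure of copy_file_from_fd(). */
static int read_whole_fd(int fd, void **buf, size_t *buf_len)
{
        struct stat st;
        size_t pos = 0;
        char *p;

        if (fstat(fd, &st) < 0 || st.st_size <= 0)
                return -1;

        p = malloc(st.st_size);
        if (!p)
                return -1;

        while (pos < (size_t)st.st_size) {
                ssize_t n = read(fd, p + pos, st.st_size - pos);

                if (n < 0) { free(p); return -1; }
                if (n == 0)
                        break;                  /* unexpected EOF */
                pos += n;
        }

        if (pos != (size_t)st.st_size) {        /* short read: bail like the kernel does */
                free(p);
                return -1;
        }

        *buf = p;
        *buf_len = pos;
        return 0;
}

int main(int argc, char **argv)
{
        void *buf;
        size_t len;
        int fd = open(argc > 1 ? argv[1] : "/etc/hostname", O_RDONLY);

        if (fd < 0 || read_whole_fd(fd, &buf, &len))
                return 1;
        printf("read %zu bytes\n", len);
        free(buf);
        return 0;
}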
417 | /* Architectures can provide this probe function */ | ||
418 | int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf, | ||
419 | unsigned long buf_len) | ||
420 | { | ||
421 | return -ENOEXEC; | ||
422 | } | ||
423 | |||
424 | void * __weak arch_kexec_kernel_image_load(struct kimage *image) | ||
425 | { | ||
426 | return ERR_PTR(-ENOEXEC); | ||
427 | } | ||
428 | |||
429 | void __weak arch_kimage_file_post_load_cleanup(struct kimage *image) | ||
430 | { | ||
431 | } | ||
432 | |||
433 | int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, | ||
434 | unsigned long buf_len) | ||
435 | { | ||
436 | return -EKEYREJECTED; | ||
437 | } | ||
438 | |||
439 | /* Apply relocations of type RELA */ | ||
440 | int __weak | ||
441 | arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, | ||
442 | unsigned int relsec) | ||
443 | { | ||
444 | pr_err("RELA relocation unsupported.\n"); | ||
445 | return -ENOEXEC; | ||
446 | } | ||
447 | |||
448 | /* Apply relocations of type REL */ | ||
449 | int __weak | ||
450 | arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, | ||
451 | unsigned int relsec) | ||
452 | { | ||
453 | pr_err("REL relocation unsupported.\n"); | ||
454 | return -ENOEXEC; | ||
455 | } | ||
456 | |||
457 | /* | ||
458 | * Free up memory used by the kernel, initrd, and command line. These are | ||
459 | * temporary allocations which are no longer needed after the buffers have | ||
460 | * been loaded into separate segments and copied elsewhere. | ||
461 | */ | ||
462 | static void kimage_file_post_load_cleanup(struct kimage *image) | ||
463 | { | ||
464 | struct purgatory_info *pi = &image->purgatory_info; | ||
465 | |||
466 | vfree(image->kernel_buf); | ||
467 | image->kernel_buf = NULL; | ||
468 | |||
469 | vfree(image->initrd_buf); | ||
470 | image->initrd_buf = NULL; | ||
471 | |||
472 | kfree(image->cmdline_buf); | ||
473 | image->cmdline_buf = NULL; | ||
474 | |||
475 | vfree(pi->purgatory_buf); | ||
476 | pi->purgatory_buf = NULL; | ||
477 | |||
478 | vfree(pi->sechdrs); | ||
479 | pi->sechdrs = NULL; | ||
480 | |||
481 | /* See if architecture has anything to cleanup post load */ | ||
482 | arch_kimage_file_post_load_cleanup(image); | ||
483 | |||
484 | /* | ||
485 | * The above call should have called into the bootloader to free up | ||
486 | * any data stored in kimage->image_loader_data. It should | ||
487 | * be OK now to free it up. | ||
488 | */ | ||
489 | kfree(image->image_loader_data); | ||
490 | image->image_loader_data = NULL; | ||
491 | } | ||
492 | |||
493 | /* | ||
494 | * In file mode the list of segments is prepared by the kernel. Copy the | ||
495 | * relevant data from user space, do error checking, and prepare the segment list. | ||
496 | */ | ||
497 | static int | ||
498 | kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, | ||
499 | const char __user *cmdline_ptr, | ||
500 | unsigned long cmdline_len, unsigned flags) | ||
501 | { | ||
502 | int ret = 0; | ||
503 | void *ldata; | ||
504 | |||
505 | ret = copy_file_from_fd(kernel_fd, &image->kernel_buf, | ||
506 | &image->kernel_buf_len); | ||
507 | if (ret) | ||
508 | return ret; | ||
509 | |||
510 | /* Call arch image probe handlers */ | ||
511 | ret = arch_kexec_kernel_image_probe(image, image->kernel_buf, | ||
512 | image->kernel_buf_len); | ||
513 | |||
514 | if (ret) | ||
515 | goto out; | ||
516 | |||
517 | #ifdef CONFIG_KEXEC_VERIFY_SIG | ||
518 | ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf, | ||
519 | image->kernel_buf_len); | ||
520 | if (ret) { | ||
521 | pr_debug("kernel signature verification failed.\n"); | ||
522 | goto out; | ||
523 | } | ||
524 | pr_debug("kernel signature verification successful.\n"); | ||
525 | #endif | ||
526 | /* It is possible that no initramfs is being loaded */ | ||
527 | if (!(flags & KEXEC_FILE_NO_INITRAMFS)) { | ||
528 | ret = copy_file_from_fd(initrd_fd, &image->initrd_buf, | ||
529 | &image->initrd_buf_len); | ||
530 | if (ret) | ||
531 | goto out; | ||
532 | } | ||
533 | |||
534 | if (cmdline_len) { | ||
535 | image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL); | ||
536 | if (!image->cmdline_buf) { | ||
537 | ret = -ENOMEM; | ||
538 | goto out; | ||
539 | } | ||
540 | |||
541 | ret = copy_from_user(image->cmdline_buf, cmdline_ptr, | ||
542 | cmdline_len); | ||
543 | if (ret) { | ||
544 | ret = -EFAULT; | ||
545 | goto out; | ||
546 | } | ||
547 | |||
548 | image->cmdline_buf_len = cmdline_len; | ||
549 | |||
550 | /* command line should be a string with last byte null */ | ||
551 | if (image->cmdline_buf[cmdline_len - 1] != '\0') { | ||
552 | ret = -EINVAL; | ||
553 | goto out; | ||
554 | } | ||
555 | } | ||
556 | |||
557 | /* Call arch image load handlers */ | ||
558 | ldata = arch_kexec_kernel_image_load(image); | ||
559 | |||
560 | if (IS_ERR(ldata)) { | ||
561 | ret = PTR_ERR(ldata); | ||
562 | goto out; | ||
563 | } | ||
564 | |||
565 | image->image_loader_data = ldata; | ||
566 | out: | ||
567 | /* In case of error, free up all allocated memory in this function */ | ||
568 | if (ret) | ||
569 | kimage_file_post_load_cleanup(image); | ||
570 | return ret; | ||
571 | } | ||
572 | |||
573 | static int | ||
574 | kimage_file_alloc_init(struct kimage **rimage, int kernel_fd, | ||
575 | int initrd_fd, const char __user *cmdline_ptr, | ||
576 | unsigned long cmdline_len, unsigned long flags) | ||
577 | { | ||
578 | int ret; | ||
579 | struct kimage *image; | ||
580 | bool kexec_on_panic = flags & KEXEC_FILE_ON_CRASH; | ||
581 | |||
582 | image = do_kimage_alloc_init(); | ||
583 | if (!image) | ||
584 | return -ENOMEM; | ||
585 | |||
586 | image->file_mode = 1; | ||
587 | |||
588 | if (kexec_on_panic) { | ||
589 | /* Enable special crash kernel control page alloc policy. */ | ||
590 | image->control_page = crashk_res.start; | ||
591 | image->type = KEXEC_TYPE_CRASH; | ||
592 | } | ||
593 | |||
594 | ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd, | ||
595 | cmdline_ptr, cmdline_len, flags); | ||
596 | if (ret) | ||
597 | goto out_free_image; | ||
598 | |||
599 | ret = sanity_check_segment_list(image); | ||
600 | if (ret) | ||
601 | goto out_free_post_load_bufs; | ||
602 | |||
603 | ret = -ENOMEM; | ||
604 | image->control_code_page = kimage_alloc_control_pages(image, | ||
605 | get_order(KEXEC_CONTROL_PAGE_SIZE)); | ||
606 | if (!image->control_code_page) { | ||
607 | pr_err("Could not allocate control_code_buffer\n"); | ||
608 | goto out_free_post_load_bufs; | ||
609 | } | ||
610 | |||
611 | if (!kexec_on_panic) { | ||
612 | image->swap_page = kimage_alloc_control_pages(image, 0); | ||
613 | if (!image->swap_page) { | ||
614 | pr_err("Could not allocate swap buffer\n"); | ||
615 | goto out_free_control_pages; | ||
616 | } | ||
617 | } | ||
618 | |||
619 | *rimage = image; | ||
620 | return 0; | ||
621 | out_free_control_pages: | ||
622 | kimage_free_page_list(&image->control_pages); | ||
623 | out_free_post_load_bufs: | ||
624 | kimage_file_post_load_cleanup(image); | ||
625 | out_free_image: | ||
626 | kfree(image); | ||
627 | return ret; | ||
628 | } | ||
629 | #else /* CONFIG_KEXEC_FILE */ | ||
630 | static inline void kimage_file_post_load_cleanup(struct kimage *image) { } | ||
631 | #endif /* CONFIG_KEXEC_FILE */ | ||
632 | |||
633 | static int kimage_is_destination_range(struct kimage *image, | ||
634 | unsigned long start, | ||
635 | unsigned long end) | ||
636 | { | ||
637 | unsigned long i; | ||
638 | |||
639 | for (i = 0; i < image->nr_segments; i++) { | ||
640 | unsigned long mstart, mend; | ||
641 | |||
642 | mstart = image->segment[i].mem; | ||
643 | mend = mstart + image->segment[i].memsz; | ||
644 | if ((end > mstart) && (start < mend)) | ||
645 | return 1; | ||
646 | } | ||
647 | |||
648 | return 0; | ||
649 | } | ||
650 | |||
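kimage_is_destination_range() treats each segment as the half-open interval [mem, mem + memsz) and reports a hit whenever the queried range intersects one of them. The overlap test in isolation, as a tiny self-checking program (names are invented for the example):

#include <assert.h>

/* Half-open interval overlap: does [a0, a1) intersect [b0, b1)? */
static int ranges_overlap(unsigned long a0, unsigned long a1,
                          unsigned long b0, unsigned long b1)
{
        return (a1 > b0) && (a0 < b1);  /* same test as (end > mstart) && (start < mend) */
}

int main(void)
{
        assert(ranges_overlap(0x1000, 0x2000, 0x1800, 0x2800));         /* partial overlap */
        assert(!ranges_overlap(0x1000, 0x2000, 0x2000, 0x3000));        /* touching, no overlap */
        assert(!ranges_overlap(0x3000, 0x4000, 0x1000, 0x2000));        /* disjoint */
        return 0;
}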
651 | static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order) | ||
652 | { | ||
653 | struct page *pages; | ||
654 | |||
655 | pages = alloc_pages(gfp_mask, order); | ||
656 | if (pages) { | ||
657 | unsigned int count, i; | ||
658 | pages->mapping = NULL; | ||
659 | set_page_private(pages, order); | ||
660 | count = 1 << order; | ||
661 | for (i = 0; i < count; i++) | ||
662 | SetPageReserved(pages + i); | ||
663 | } | ||
664 | |||
665 | return pages; | ||
666 | } | ||
667 | |||
668 | static void kimage_free_pages(struct page *page) | ||
669 | { | ||
670 | unsigned int order, count, i; | ||
671 | |||
672 | order = page_private(page); | ||
673 | count = 1 << order; | ||
674 | for (i = 0; i < count; i++) | ||
675 | ClearPageReserved(page + i); | ||
676 | __free_pages(page, order); | ||
677 | } | ||
678 | |||
679 | static void kimage_free_page_list(struct list_head *list) | ||
680 | { | ||
681 | struct list_head *pos, *next; | ||
682 | |||
683 | list_for_each_safe(pos, next, list) { | ||
684 | struct page *page; | ||
685 | |||
686 | page = list_entry(pos, struct page, lru); | ||
687 | list_del(&page->lru); | ||
688 | kimage_free_pages(page); | ||
689 | } | ||
690 | } | ||
691 | |||
692 | static struct page *kimage_alloc_normal_control_pages(struct kimage *image, | ||
693 | unsigned int order) | ||
694 | { | ||
695 | /* Control pages are special, they are the intermediaries | ||
696 | * that are needed while we copy the rest of the pages | ||
697 | * to their final resting place. As such they must | ||
698 | * not conflict with either the destination addresses | ||
699 | * or memory the kernel is already using. | ||
700 | * | ||
701 | * The only case where we really need more than one of | ||
702 | * these is for architectures where we cannot disable | ||
703 | * the MMU and must instead generate an identity mapped | ||
704 | * page table for all of the memory. | ||
705 | * | ||
706 | * At worst this runs in O(N) of the image size. | ||
707 | */ | ||
708 | struct list_head extra_pages; | ||
709 | struct page *pages; | ||
710 | unsigned int count; | ||
711 | |||
712 | count = 1 << order; | ||
713 | INIT_LIST_HEAD(&extra_pages); | ||
714 | |||
715 | /* Loop while I can allocate a page and the page allocated | ||
716 | * is a destination page. | ||
717 | */ | ||
718 | do { | ||
719 | unsigned long pfn, epfn, addr, eaddr; | ||
720 | |||
721 | pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order); | ||
722 | if (!pages) | ||
723 | break; | ||
724 | pfn = page_to_pfn(pages); | ||
725 | epfn = pfn + count; | ||
726 | addr = pfn << PAGE_SHIFT; | ||
727 | eaddr = epfn << PAGE_SHIFT; | ||
728 | if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) || | ||
729 | kimage_is_destination_range(image, addr, eaddr)) { | ||
730 | list_add(&pages->lru, &extra_pages); | ||
731 | pages = NULL; | ||
732 | } | ||
733 | } while (!pages); | ||
734 | |||
735 | if (pages) { | ||
736 | /* Remember the allocated page... */ | ||
737 | list_add(&pages->lru, &image->control_pages); | ||
738 | |||
739 | /* Because the page is already in its destination | ||
740 | * location we will never allocate another page at | ||
741 | * that address. Therefore kimage_alloc_pages | ||
742 | * will not return it (again) and we don't need | ||
743 | * to give it an entry in image->segment[]. | ||
744 | */ | ||
745 | } | ||
746 | /* Deal with the destination pages I have inadvertently allocated. | ||
747 | * | ||
748 | * Ideally I would convert multi-page allocations into single | ||
749 | * page allocations, and add everything to image->dest_pages. | ||
750 | * | ||
751 | * For now it is simpler to just free the pages. | ||
752 | */ | ||
753 | kimage_free_page_list(&extra_pages); | ||
754 | |||
755 | return pages; | ||
756 | } | ||
757 | |||
758 | static struct page *kimage_alloc_crash_control_pages(struct kimage *image, | ||
759 | unsigned int order) | ||
760 | { | ||
761 | /* Control pages are special, they are the intermediaries | ||
762 | * that are needed while we copy the rest of the pages | ||
763 | * to their final resting place. As such they must | ||
764 | * not conflict with either the destination addresses | ||
765 | * or memory the kernel is already using. | ||
766 | * | ||
767 | * Control pages are also the only pages we must allocate | ||
768 | * when loading a crash kernel. All of the other pages | ||
769 | * are specified by the segments and we just memcpy | ||
770 | * into them directly. | ||
771 | * | ||
772 | * The only case where we really need more than one of | ||
773 | * these is for architectures where we cannot disable | ||
774 | * the MMU and must instead generate an identity mapped | ||
775 | * page table for all of the memory. | ||
776 | * | ||
777 | * Given the low demand this implements a very simple | ||
778 | * allocator that finds the first hole of the appropriate | ||
779 | * size in the reserved memory region, and allocates all | ||
780 | * of the memory up to and including the hole. | ||
781 | */ | ||
782 | unsigned long hole_start, hole_end, size; | ||
783 | struct page *pages; | ||
784 | |||
785 | pages = NULL; | ||
786 | size = (1 << order) << PAGE_SHIFT; | ||
787 | hole_start = (image->control_page + (size - 1)) & ~(size - 1); | ||
788 | hole_end = hole_start + size - 1; | ||
789 | while (hole_end <= crashk_res.end) { | ||
790 | unsigned long i; | ||
791 | |||
792 | if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT) | ||
793 | break; | ||
794 | /* See if I overlap any of the segments */ | ||
795 | for (i = 0; i < image->nr_segments; i++) { | ||
796 | unsigned long mstart, mend; | ||
797 | |||
798 | mstart = image->segment[i].mem; | ||
799 | mend = mstart + image->segment[i].memsz - 1; | ||
800 | if ((hole_end >= mstart) && (hole_start <= mend)) { | ||
801 | /* Advance the hole to the end of the segment */ | ||
802 | hole_start = (mend + (size - 1)) & ~(size - 1); | ||
803 | hole_end = hole_start + size - 1; | ||
804 | break; | ||
805 | } | ||
806 | } | ||
807 | /* If I don't overlap any segments I have found my hole! */ | ||
808 | if (i == image->nr_segments) { | ||
809 | pages = pfn_to_page(hole_start >> PAGE_SHIFT); | ||
810 | break; | ||
811 | } | ||
812 | } | ||
813 | if (pages) | ||
814 | image->control_page = hole_end; | ||
815 | |||
816 | return pages; | ||
817 | } | ||
818 | |||
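So the crash-control allocator is really a first-fit search: round the candidate hole up to the allocation size, and whenever it overlaps a segment, restart just past that segment's end, again aligned. A standalone sketch of that arithmetic over a plain array of ranges (the types, limits and numbers are illustrative, not the kernel's):

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

struct range { unsigned long start, end; };     /* inclusive, like crashk_res */

/* Return the start of the first size-aligned hole of `size` bytes inside
 * `res` that overlaps none of seg[0..n), or 0 if there is none. */
static unsigned long find_hole(struct range res, const struct range *seg,
                               int n, unsigned long size)
{
        unsigned long hole_start = (res.start + size - 1) & ~(size - 1);
        unsigned long hole_end = hole_start + size - 1;

        while (hole_end <= res.end) {
                int i;

                for (i = 0; i < n; i++) {
                        if (hole_end >= seg[i].start && hole_start <= seg[i].end) {
                                /* Advance the hole past this segment, keeping alignment. */
                                hole_start = (seg[i].end + size - 1) & ~(size - 1);
                                hole_end = hole_start + size - 1;
                                break;
                        }
                }
                if (i == n)
                        return hole_start;      /* no overlap: found our hole */
        }
        return 0;
}

int main(void)
{
        struct range res = { 0x1000000, 0x1ffffff };
        struct range seg[] = { { 0x1000000, 0x13fffff } };      /* first 4 MiB in use */
        unsigned long hole = find_hole(res, seg, 1, 4 * PAGE_SIZE);

        printf("hole at %#lx\n", hole);         /* expect 0x1400000 */
        return 0;
}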
819 | |||
820 | struct page *kimage_alloc_control_pages(struct kimage *image, | ||
821 | unsigned int order) | ||
822 | { | ||
823 | struct page *pages = NULL; | ||
824 | |||
825 | switch (image->type) { | ||
826 | case KEXEC_TYPE_DEFAULT: | ||
827 | pages = kimage_alloc_normal_control_pages(image, order); | ||
828 | break; | ||
829 | case KEXEC_TYPE_CRASH: | ||
830 | pages = kimage_alloc_crash_control_pages(image, order); | ||
831 | break; | ||
832 | } | ||
833 | |||
834 | return pages; | ||
835 | } | ||
836 | |||
837 | static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) | ||
838 | { | ||
839 | if (*image->entry != 0) | ||
840 | image->entry++; | ||
841 | |||
842 | if (image->entry == image->last_entry) { | ||
843 | kimage_entry_t *ind_page; | ||
844 | struct page *page; | ||
845 | |||
846 | page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST); | ||
847 | if (!page) | ||
848 | return -ENOMEM; | ||
849 | |||
850 | ind_page = page_address(page); | ||
851 | *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION; | ||
852 | image->entry = ind_page; | ||
853 | image->last_entry = ind_page + | ||
854 | ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1); | ||
855 | } | ||
856 | *image->entry = entry; | ||
857 | image->entry++; | ||
858 | *image->entry = 0; | ||
859 | |||
860 | return 0; | ||
861 | } | ||
862 | |||
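kimage_add_entry() packs entries into page-sized arrays: when the current array is down to its last slot, it allocates a fresh one, records that page's address there with an IND_INDIRECTION tag, and keeps filling the new array, so the whole image description becomes a chain of indirection pages ending in IND_DONE. A small userspace model of that chaining, using tiny malloc'd arrays and invented flag values (not the kernel's bit assignments):

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define SLOTS           4               /* entries per "page" (tiny, for the demo) */
#define IND_INDIRECTION 0x1UL           /* illustrative flag values only */
#define IND_DONE        0x2UL
#define IND_SOURCE      0x4UL

typedef uintptr_t entry_t;

struct img {
        entry_t head;                   /* first entry (0 while empty) */
        entry_t *entry;                 /* next slot to fill */
        entry_t *last;                  /* last usable slot in the current page */
};

static void img_init(struct img *im)
{
        im->head = 0;
        im->entry = &im->head;
        im->last = &im->head;           /* head acts as a one-slot first page */
}

static int img_add(struct img *im, entry_t e)
{
        if (*im->entry != 0)
                im->entry++;
        if (im->entry == im->last) {                    /* current page is full */
                entry_t *page = calloc(SLOTS, sizeof(entry_t));

                if (!page)
                        return -1;
                *im->entry = (uintptr_t)page | IND_INDIRECTION;
                im->entry = page;
                im->last = page + SLOTS - 1;
        }
        *im->entry = e;
        im->entry++;
        *im->entry = 0;
        return 0;
}

int main(void)
{
        struct img im;
        int i;

        img_init(&im);
        for (i = 0; i < 10; i++)
                img_add(&im, ((entry_t)(i + 1) << 12) | IND_SOURCE);
        if (*im.entry != 0)
                im.entry++;
        *im.entry = IND_DONE;                           /* like kimage_terminate() */
        printf("head entry = %#lx\n", (unsigned long)im.head);
        return 0;                       /* demo pages are intentionally leaked */
}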
863 | static int kimage_set_destination(struct kimage *image, | ||
864 | unsigned long destination) | ||
865 | { | ||
866 | int result; | ||
867 | |||
868 | destination &= PAGE_MASK; | ||
869 | result = kimage_add_entry(image, destination | IND_DESTINATION); | ||
870 | |||
871 | return result; | ||
872 | } | ||
873 | |||
874 | |||
875 | static int kimage_add_page(struct kimage *image, unsigned long page) | ||
876 | { | ||
877 | int result; | ||
878 | |||
879 | page &= PAGE_MASK; | ||
880 | result = kimage_add_entry(image, page | IND_SOURCE); | ||
881 | |||
882 | return result; | ||
883 | } | ||
884 | |||
885 | |||
886 | static void kimage_free_extra_pages(struct kimage *image) | ||
887 | { | ||
888 | /* Walk through and free any extra destination pages I may have */ | ||
889 | kimage_free_page_list(&image->dest_pages); | ||
890 | |||
891 | /* Walk through and free any unusable pages I have cached */ | ||
892 | kimage_free_page_list(&image->unusable_pages); | ||
893 | |||
894 | } | ||
895 | static void kimage_terminate(struct kimage *image) | ||
896 | { | ||
897 | if (*image->entry != 0) | ||
898 | image->entry++; | ||
899 | |||
900 | *image->entry = IND_DONE; | ||
901 | } | ||
902 | |||
903 | #define for_each_kimage_entry(image, ptr, entry) \ | ||
904 | for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \ | ||
905 | ptr = (entry & IND_INDIRECTION) ? \ | ||
906 | phys_to_virt((entry & PAGE_MASK)) : ptr + 1) | ||
907 | |||
908 | static void kimage_free_entry(kimage_entry_t entry) | ||
909 | { | ||
910 | struct page *page; | ||
911 | |||
912 | page = pfn_to_page(entry >> PAGE_SHIFT); | ||
913 | kimage_free_pages(page); | ||
914 | } | ||
915 | |||
916 | static void kimage_free(struct kimage *image) | ||
917 | { | ||
918 | kimage_entry_t *ptr, entry; | ||
919 | kimage_entry_t ind = 0; | ||
920 | |||
921 | if (!image) | ||
922 | return; | ||
923 | |||
924 | kimage_free_extra_pages(image); | ||
925 | for_each_kimage_entry(image, ptr, entry) { | ||
926 | if (entry & IND_INDIRECTION) { | ||
927 | /* Free the previous indirection page */ | ||
928 | if (ind & IND_INDIRECTION) | ||
929 | kimage_free_entry(ind); | ||
930 | /* Save this indirection page until we are | ||
931 | * done with it. | ||
932 | */ | ||
933 | ind = entry; | ||
934 | } else if (entry & IND_SOURCE) | ||
935 | kimage_free_entry(entry); | ||
936 | } | ||
937 | /* Free the final indirection page */ | ||
938 | if (ind & IND_INDIRECTION) | ||
939 | kimage_free_entry(ind); | ||
940 | |||
941 | /* Handle any machine specific cleanup */ | ||
942 | machine_kexec_cleanup(image); | ||
943 | |||
944 | /* Free the kexec control pages... */ | ||
945 | kimage_free_page_list(&image->control_pages); | ||
946 | |||
947 | /* | ||
948 | * Free up any temporary buffers allocated. This might be hit if an | ||
949 | * error occurred long after buffer allocation. | ||
950 | */ | ||
951 | if (image->file_mode) | ||
952 | kimage_file_post_load_cleanup(image); | ||
953 | |||
954 | kfree(image); | ||
955 | } | ||
956 | |||
957 | static kimage_entry_t *kimage_dst_used(struct kimage *image, | ||
958 | unsigned long page) | ||
959 | { | ||
960 | kimage_entry_t *ptr, entry; | ||
961 | unsigned long destination = 0; | ||
962 | |||
963 | for_each_kimage_entry(image, ptr, entry) { | ||
964 | if (entry & IND_DESTINATION) | ||
965 | destination = entry & PAGE_MASK; | ||
966 | else if (entry & IND_SOURCE) { | ||
967 | if (page == destination) | ||
968 | return ptr; | ||
969 | destination += PAGE_SIZE; | ||
970 | } | ||
971 | } | ||
972 | |||
973 | return NULL; | ||
974 | } | ||
975 | |||
976 | static struct page *kimage_alloc_page(struct kimage *image, | ||
977 | gfp_t gfp_mask, | ||
978 | unsigned long destination) | ||
979 | { | ||
980 | /* | ||
981 | * Here we implement safeguards to ensure that a source page | ||
982 | * is not copied to its destination page before the data on | ||
983 | * the destination page is no longer useful. | ||
984 | * | ||
985 | * To do this we maintain the invariant that a source page is | ||
986 | * either its own destination page, or it is not a | ||
987 | * destination page at all. | ||
988 | * | ||
989 | * That is slightly stronger than required, but the proof | ||
990 | * that no problems will occur is trivial, and the | ||
991 | * implementation is simple to verify. | ||
992 | * | ||
993 | * When allocating all pages normally this algorithm will run | ||
994 | * in O(N) time, but in the worst case it will run in O(N^2) | ||
995 | * time. If the runtime is a problem the data structures can | ||
996 | * be fixed. | ||
997 | */ | ||
998 | struct page *page; | ||
999 | unsigned long addr; | ||
1000 | |||
1001 | /* | ||
1002 | * Walk through the list of destination pages, and see if I | ||
1003 | * have a match. | ||
1004 | */ | ||
1005 | list_for_each_entry(page, &image->dest_pages, lru) { | ||
1006 | addr = page_to_pfn(page) << PAGE_SHIFT; | ||
1007 | if (addr == destination) { | ||
1008 | list_del(&page->lru); | ||
1009 | return page; | ||
1010 | } | ||
1011 | } | ||
1012 | page = NULL; | ||
1013 | while (1) { | ||
1014 | kimage_entry_t *old; | ||
1015 | |||
1016 | /* Allocate a page; if we run out of memory, give up */ | ||
1017 | page = kimage_alloc_pages(gfp_mask, 0); | ||
1018 | if (!page) | ||
1019 | return NULL; | ||
1020 | /* If the page cannot be used, file it away */ | ||
1021 | if (page_to_pfn(page) > | ||
1022 | (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { | ||
1023 | list_add(&page->lru, &image->unusable_pages); | ||
1024 | continue; | ||
1025 | } | ||
1026 | addr = page_to_pfn(page) << PAGE_SHIFT; | ||
1027 | |||
1028 | /* If it is the destination page we want, use it */ | ||
1029 | if (addr == destination) | ||
1030 | break; | ||
1031 | |||
1032 | /* If the page is not a destination page use it */ | ||
1033 | if (!kimage_is_destination_range(image, addr, | ||
1034 | addr + PAGE_SIZE)) | ||
1035 | break; | ||
1036 | |||
1037 | /* | ||
1038 | * I know that the page is someone's destination page. | ||
1039 | * See if there is already a source page for this | ||
1040 | * destination page. And if so swap the source pages. | ||
1041 | */ | ||
1042 | old = kimage_dst_used(image, addr); | ||
1043 | if (old) { | ||
1044 | /* If so move it */ | ||
1045 | unsigned long old_addr; | ||
1046 | struct page *old_page; | ||
1047 | |||
1048 | old_addr = *old & PAGE_MASK; | ||
1049 | old_page = pfn_to_page(old_addr >> PAGE_SHIFT); | ||
1050 | copy_highpage(page, old_page); | ||
1051 | *old = addr | (*old & ~PAGE_MASK); | ||
1052 | |||
1053 | /* The old page I have found cannot be a | ||
1054 | * destination page, so return it if its | ||
1055 | * gfp_flags honor the ones passed in. | ||
1056 | */ | ||
1057 | if (!(gfp_mask & __GFP_HIGHMEM) && | ||
1058 | PageHighMem(old_page)) { | ||
1059 | kimage_free_pages(old_page); | ||
1060 | continue; | ||
1061 | } | ||
1062 | addr = old_addr; | ||
1063 | page = old_page; | ||
1064 | break; | ||
1065 | } else { | ||
1066 | /* Place the page on the destination list; I | ||
1067 | * will use it later. | ||
1068 | */ | ||
1069 | list_add(&page->lru, &image->dest_pages); | ||
1070 | } | ||
1071 | } | ||
1072 | |||
1073 | return page; | ||
1074 | } | ||
1075 | |||
1076 | static int kimage_load_normal_segment(struct kimage *image, | ||
1077 | struct kexec_segment *segment) | ||
1078 | { | ||
1079 | unsigned long maddr; | ||
1080 | size_t ubytes, mbytes; | ||
1081 | int result; | ||
1082 | unsigned char __user *buf = NULL; | ||
1083 | unsigned char *kbuf = NULL; | ||
1084 | |||
1085 | result = 0; | ||
1086 | if (image->file_mode) | ||
1087 | kbuf = segment->kbuf; | ||
1088 | else | ||
1089 | buf = segment->buf; | ||
1090 | ubytes = segment->bufsz; | ||
1091 | mbytes = segment->memsz; | ||
1092 | maddr = segment->mem; | ||
1093 | |||
1094 | result = kimage_set_destination(image, maddr); | ||
1095 | if (result < 0) | ||
1096 | goto out; | ||
1097 | |||
1098 | while (mbytes) { | ||
1099 | struct page *page; | ||
1100 | char *ptr; | ||
1101 | size_t uchunk, mchunk; | ||
1102 | |||
1103 | page = kimage_alloc_page(image, GFP_HIGHUSER, maddr); | ||
1104 | if (!page) { | ||
1105 | result = -ENOMEM; | ||
1106 | goto out; | ||
1107 | } | ||
1108 | result = kimage_add_page(image, page_to_pfn(page) | ||
1109 | << PAGE_SHIFT); | ||
1110 | if (result < 0) | ||
1111 | goto out; | ||
1112 | |||
1113 | ptr = kmap(page); | ||
1114 | /* Start with a clear page */ | ||
1115 | clear_page(ptr); | ||
1116 | ptr += maddr & ~PAGE_MASK; | ||
1117 | mchunk = min_t(size_t, mbytes, | ||
1118 | PAGE_SIZE - (maddr & ~PAGE_MASK)); | ||
1119 | uchunk = min(ubytes, mchunk); | ||
1120 | |||
1121 | /* For file based kexec, source pages are in kernel memory */ | ||
1122 | if (image->file_mode) | ||
1123 | memcpy(ptr, kbuf, uchunk); | ||
1124 | else | ||
1125 | result = copy_from_user(ptr, buf, uchunk); | ||
1126 | kunmap(page); | ||
1127 | if (result) { | ||
1128 | result = -EFAULT; | ||
1129 | goto out; | ||
1130 | } | ||
1131 | ubytes -= uchunk; | ||
1132 | maddr += mchunk; | ||
1133 | if (image->file_mode) | ||
1134 | kbuf += mchunk; | ||
1135 | else | ||
1136 | buf += mchunk; | ||
1137 | mbytes -= mchunk; | ||
1138 | } | ||
1139 | out: | ||
1140 | return result; | ||
1141 | } | ||
1142 | |||
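The copy loop above works one destination page at a time: mchunk is how much of the current page the segment covers, uchunk is how much of that actually comes from the source buffer, and anything beyond ubytes is left as the cleared page. The chunk arithmetic on its own, assuming a 4 KiB page:

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

int main(void)
{
        /* A segment starting 0x100 bytes into a page, three pages of memsz,
         * but only 0x1800 bytes of payload (bufsz); the rest stays zeroed. */
        unsigned long maddr = 0x2000100, mbytes = 3 * PAGE_SIZE, ubytes = 0x1800;

        while (mbytes) {
                unsigned long mchunk = min_ul(mbytes, PAGE_SIZE - (maddr & ~PAGE_MASK));
                unsigned long uchunk = min_ul(ubytes, mchunk);

                printf("page %#lx: copy %#lx user bytes, cover %#lx bytes\n",
                       maddr & PAGE_MASK, uchunk, mchunk);
                ubytes -= uchunk;
                maddr += mchunk;
                mbytes -= mchunk;
        }
        return 0;
}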
1143 | static int kimage_load_crash_segment(struct kimage *image, | ||
1144 | struct kexec_segment *segment) | ||
1145 | { | ||
1146 | /* For crash dump kernels we simply copy the data from | ||
1147 | * user space to its destination. | ||
1148 | * We do things a page at a time for the sake of kmap. | ||
1149 | */ | ||
1150 | unsigned long maddr; | ||
1151 | size_t ubytes, mbytes; | ||
1152 | int result; | ||
1153 | unsigned char __user *buf = NULL; | ||
1154 | unsigned char *kbuf = NULL; | ||
1155 | |||
1156 | result = 0; | ||
1157 | if (image->file_mode) | ||
1158 | kbuf = segment->kbuf; | ||
1159 | else | ||
1160 | buf = segment->buf; | ||
1161 | ubytes = segment->bufsz; | ||
1162 | mbytes = segment->memsz; | ||
1163 | maddr = segment->mem; | ||
1164 | while (mbytes) { | ||
1165 | struct page *page; | ||
1166 | char *ptr; | ||
1167 | size_t uchunk, mchunk; | ||
1168 | |||
1169 | page = pfn_to_page(maddr >> PAGE_SHIFT); | ||
1170 | if (!page) { | ||
1171 | result = -ENOMEM; | ||
1172 | goto out; | ||
1173 | } | ||
1174 | ptr = kmap(page); | ||
1175 | ptr += maddr & ~PAGE_MASK; | ||
1176 | mchunk = min_t(size_t, mbytes, | ||
1177 | PAGE_SIZE - (maddr & ~PAGE_MASK)); | ||
1178 | uchunk = min(ubytes, mchunk); | ||
1179 | if (mchunk > uchunk) { | ||
1180 | /* Zero the trailing part of the page */ | ||
1181 | memset(ptr + uchunk, 0, mchunk - uchunk); | ||
1182 | } | ||
1183 | |||
1184 | /* For file based kexec, source pages are in kernel memory */ | ||
1185 | if (image->file_mode) | ||
1186 | memcpy(ptr, kbuf, uchunk); | ||
1187 | else | ||
1188 | result = copy_from_user(ptr, buf, uchunk); | ||
1189 | kexec_flush_icache_page(page); | ||
1190 | kunmap(page); | ||
1191 | if (result) { | ||
1192 | result = -EFAULT; | ||
1193 | goto out; | ||
1194 | } | ||
1195 | ubytes -= uchunk; | ||
1196 | maddr += mchunk; | ||
1197 | if (image->file_mode) | ||
1198 | kbuf += mchunk; | ||
1199 | else | ||
1200 | buf += mchunk; | ||
1201 | mbytes -= mchunk; | ||
1202 | } | ||
1203 | out: | ||
1204 | return result; | ||
1205 | } | ||
1206 | |||
1207 | static int kimage_load_segment(struct kimage *image, | ||
1208 | struct kexec_segment *segment) | ||
1209 | { | ||
1210 | int result = -ENOMEM; | ||
1211 | |||
1212 | switch (image->type) { | ||
1213 | case KEXEC_TYPE_DEFAULT: | ||
1214 | result = kimage_load_normal_segment(image, segment); | ||
1215 | break; | ||
1216 | case KEXEC_TYPE_CRASH: | ||
1217 | result = kimage_load_crash_segment(image, segment); | ||
1218 | break; | ||
1219 | } | ||
1220 | |||
1221 | return result; | ||
1222 | } | ||
1223 | |||
1224 | /* | 104 | /* |
1225 | * Exec Kernel system call: for obvious reasons only root may call it. | 105 | * Exec Kernel system call: for obvious reasons only root may call it. |
1226 | * | 106 | * |
@@ -1241,11 +121,6 @@ static int kimage_load_segment(struct kimage *image, | |||
1241 | * kexec does not sync, or unmount filesystems so if you need | 121 | * kexec does not sync, or unmount filesystems so if you need |
1242 | * that to happen you need to do that yourself. | 122 | * that to happen you need to do that yourself. |
1243 | */ | 123 | */ |
1244 | struct kimage *kexec_image; | ||
1245 | struct kimage *kexec_crash_image; | ||
1246 | int kexec_load_disabled; | ||
1247 | |||
1248 | static DEFINE_MUTEX(kexec_mutex); | ||
1249 | 124 | ||
1250 | SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, | 125 | SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, |
1251 | struct kexec_segment __user *, segments, unsigned long, flags) | 126 | struct kexec_segment __user *, segments, unsigned long, flags) |
@@ -1340,18 +215,6 @@ out: | |||
1340 | return result; | 215 | return result; |
1341 | } | 216 | } |
1342 | 217 | ||
1343 | /* | ||
1344 | * Add and remove page tables for crashkernel memory | ||
1345 | * | ||
1346 | * Provide an empty default implementation here -- architecture | ||
1347 | * code may override this | ||
1348 | */ | ||
1349 | void __weak crash_map_reserved_pages(void) | ||
1350 | {} | ||
1351 | |||
1352 | void __weak crash_unmap_reserved_pages(void) | ||
1353 | {} | ||
1354 | |||
1355 | #ifdef CONFIG_COMPAT | 218 | #ifdef CONFIG_COMPAT |
1356 | COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, | 219 | COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, |
1357 | compat_ulong_t, nr_segments, | 220 | compat_ulong_t, nr_segments, |
@@ -1390,1391 +253,3 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, | |||
1390 | return sys_kexec_load(entry, nr_segments, ksegments, flags); | 253 | return sys_kexec_load(entry, nr_segments, ksegments, flags); |
1391 | } | 254 | } |
1392 | #endif | 255 | #endif |
1393 | |||
1394 | #ifdef CONFIG_KEXEC_FILE | ||
1395 | SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, | ||
1396 | unsigned long, cmdline_len, const char __user *, cmdline_ptr, | ||
1397 | unsigned long, flags) | ||
1398 | { | ||
1399 | int ret = 0, i; | ||
1400 | struct kimage **dest_image, *image; | ||
1401 | |||
1402 | /* We only trust the superuser with rebooting the system. */ | ||
1403 | if (!capable(CAP_SYS_BOOT) || kexec_load_disabled) | ||
1404 | return -EPERM; | ||
1405 | |||
1406 | /* Make sure we have a legal set of flags */ | ||
1407 | if (flags != (flags & KEXEC_FILE_FLAGS)) | ||
1408 | return -EINVAL; | ||
1409 | |||
1410 | image = NULL; | ||
1411 | |||
1412 | if (!mutex_trylock(&kexec_mutex)) | ||
1413 | return -EBUSY; | ||
1414 | |||
1415 | dest_image = &kexec_image; | ||
1416 | if (flags & KEXEC_FILE_ON_CRASH) | ||
1417 | dest_image = &kexec_crash_image; | ||
1418 | |||
1419 | if (flags & KEXEC_FILE_UNLOAD) | ||
1420 | goto exchange; | ||
1421 | |||
1422 | /* | ||
1423 | * In case of crash, the new kernel gets loaded in the reserved region. It is | ||
1424 | * the same memory where an old crash kernel might be loaded. Free any | ||
1425 | * current crash dump kernel before we corrupt it. | ||
1426 | */ | ||
1427 | if (flags & KEXEC_FILE_ON_CRASH) | ||
1428 | kimage_free(xchg(&kexec_crash_image, NULL)); | ||
1429 | |||
1430 | ret = kimage_file_alloc_init(&image, kernel_fd, initrd_fd, cmdline_ptr, | ||
1431 | cmdline_len, flags); | ||
1432 | if (ret) | ||
1433 | goto out; | ||
1434 | |||
1435 | ret = machine_kexec_prepare(image); | ||
1436 | if (ret) | ||
1437 | goto out; | ||
1438 | |||
1439 | ret = kexec_calculate_store_digests(image); | ||
1440 | if (ret) | ||
1441 | goto out; | ||
1442 | |||
1443 | for (i = 0; i < image->nr_segments; i++) { | ||
1444 | struct kexec_segment *ksegment; | ||
1445 | |||
1446 | ksegment = &image->segment[i]; | ||
1447 | pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n", | ||
1448 | i, ksegment->buf, ksegment->bufsz, ksegment->mem, | ||
1449 | ksegment->memsz); | ||
1450 | |||
1451 | ret = kimage_load_segment(image, &image->segment[i]); | ||
1452 | if (ret) | ||
1453 | goto out; | ||
1454 | } | ||
1455 | |||
1456 | kimage_terminate(image); | ||
1457 | |||
1458 | /* | ||
1459 | * Free up any temporary buffers allocated which are not needed | ||
1460 | * after image has been loaded | ||
1461 | */ | ||
1462 | kimage_file_post_load_cleanup(image); | ||
1463 | exchange: | ||
1464 | image = xchg(dest_image, image); | ||
1465 | out: | ||
1466 | mutex_unlock(&kexec_mutex); | ||
1467 | kimage_free(image); | ||
1468 | return ret; | ||
1469 | } | ||
1470 | |||
1471 | #endif /* CONFIG_KEXEC_FILE */ | ||
1472 | |||
1473 | void crash_kexec(struct pt_regs *regs) | ||
1474 | { | ||
1475 | /* Take the kexec_mutex here to prevent sys_kexec_load | ||
1476 | * running on one cpu from replacing the crash kernel | ||
1477 | * we are using after a panic on a different cpu. | ||
1478 | * | ||
1479 | * If the crash kernel was not located in a fixed area | ||
1480 | * of memory the xchg(&kexec_crash_image) would be | ||
1481 | * sufficient. But since I reuse the memory... | ||
1482 | */ | ||
1483 | if (mutex_trylock(&kexec_mutex)) { | ||
1484 | if (kexec_crash_image) { | ||
1485 | struct pt_regs fixed_regs; | ||
1486 | |||
1487 | crash_setup_regs(&fixed_regs, regs); | ||
1488 | crash_save_vmcoreinfo(); | ||
1489 | machine_crash_shutdown(&fixed_regs); | ||
1490 | machine_kexec(kexec_crash_image); | ||
1491 | } | ||
1492 | mutex_unlock(&kexec_mutex); | ||
1493 | } | ||
1494 | } | ||
1495 | |||
1496 | size_t crash_get_memory_size(void) | ||
1497 | { | ||
1498 | size_t size = 0; | ||
1499 | mutex_lock(&kexec_mutex); | ||
1500 | if (crashk_res.end != crashk_res.start) | ||
1501 | size = resource_size(&crashk_res); | ||
1502 | mutex_unlock(&kexec_mutex); | ||
1503 | return size; | ||
1504 | } | ||
1505 | |||
1506 | void __weak crash_free_reserved_phys_range(unsigned long begin, | ||
1507 | unsigned long end) | ||
1508 | { | ||
1509 | unsigned long addr; | ||
1510 | |||
1511 | for (addr = begin; addr < end; addr += PAGE_SIZE) | ||
1512 | free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT)); | ||
1513 | } | ||
1514 | |||
1515 | int crash_shrink_memory(unsigned long new_size) | ||
1516 | { | ||
1517 | int ret = 0; | ||
1518 | unsigned long start, end; | ||
1519 | unsigned long old_size; | ||
1520 | struct resource *ram_res; | ||
1521 | |||
1522 | mutex_lock(&kexec_mutex); | ||
1523 | |||
1524 | if (kexec_crash_image) { | ||
1525 | ret = -ENOENT; | ||
1526 | goto unlock; | ||
1527 | } | ||
1528 | start = crashk_res.start; | ||
1529 | end = crashk_res.end; | ||
1530 | old_size = (end == 0) ? 0 : end - start + 1; | ||
1531 | if (new_size >= old_size) { | ||
1532 | ret = (new_size == old_size) ? 0 : -EINVAL; | ||
1533 | goto unlock; | ||
1534 | } | ||
1535 | |||
1536 | ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL); | ||
1537 | if (!ram_res) { | ||
1538 | ret = -ENOMEM; | ||
1539 | goto unlock; | ||
1540 | } | ||
1541 | |||
1542 | start = roundup(start, KEXEC_CRASH_MEM_ALIGN); | ||
1543 | end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN); | ||
1544 | |||
1545 | crash_map_reserved_pages(); | ||
1546 | crash_free_reserved_phys_range(end, crashk_res.end); | ||
1547 | |||
1548 | if ((start == end) && (crashk_res.parent != NULL)) | ||
1549 | release_resource(&crashk_res); | ||
1550 | |||
1551 | ram_res->start = end; | ||
1552 | ram_res->end = crashk_res.end; | ||
1553 | ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM; | ||
1554 | ram_res->name = "System RAM"; | ||
1555 | |||
1556 | crashk_res.end = end - 1; | ||
1557 | |||
1558 | insert_resource(&iomem_resource, ram_res); | ||
1559 | crash_unmap_reserved_pages(); | ||
1560 | |||
1561 | unlock: | ||
1562 | mutex_unlock(&kexec_mutex); | ||
1563 | return ret; | ||
1564 | } | ||
1565 | |||
1566 | static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data, | ||
1567 | size_t data_len) | ||
1568 | { | ||
1569 | struct elf_note note; | ||
1570 | |||
1571 | note.n_namesz = strlen(name) + 1; | ||
1572 | note.n_descsz = data_len; | ||
1573 | note.n_type = type; | ||
1574 | memcpy(buf, ¬e, sizeof(note)); | ||
1575 | buf += (sizeof(note) + 3)/4; | ||
1576 | memcpy(buf, name, note.n_namesz); | ||
1577 | buf += (note.n_namesz + 3)/4; | ||
1578 | memcpy(buf, data, note.n_descsz); | ||
1579 | buf += (note.n_descsz + 3)/4; | ||
1580 | |||
1581 | return buf; | ||
1582 | } | ||
1583 | |||
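append_elf_note() emits a note in the standard ELF layout: a header carrying namesz, descsz and type, then the name, then the payload, each advanced in 4-byte units; final_note(), just below, terminates the list with an all-zero header. A userspace sketch building one note plus the terminator into a u32 buffer (buffer size and note contents are arbitrary for the demo):

#include <elf.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* Append one ELF note to a u32-aligned buffer and return the new position. */
static uint32_t *append_note(uint32_t *buf, const char *name, uint32_t type,
                             const void *data, uint32_t data_len)
{
        Elf64_Nhdr nhdr;                        /* same 3 x 32-bit layout as Elf32_Nhdr */

        nhdr.n_namesz = strlen(name) + 1;
        nhdr.n_descsz = data_len;
        nhdr.n_type = type;

        memcpy(buf, &nhdr, sizeof(nhdr));
        buf += (sizeof(nhdr) + 3) / 4;
        memcpy(buf, name, nhdr.n_namesz);
        buf += (nhdr.n_namesz + 3) / 4;         /* pad the name to a 4-byte boundary */
        memcpy(buf, data, data_len);
        buf += (data_len + 3) / 4;              /* pad the payload likewise */
        return buf;
}

int main(void)
{
        uint32_t notes[64] = { 0 };
        const char payload[] = "CRASHTIME=12345";
        uint32_t *p;

        p = append_note(notes, "DEMO", 0, payload, sizeof(payload));
        /* A zeroed header acts as the terminating note, like final_note(). */
        memset(p, 0, sizeof(Elf64_Nhdr));

        printf("note list uses %u bytes\n",
               (unsigned)((char *)(p + sizeof(Elf64_Nhdr) / 4) - (char *)notes));
        return 0;
}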
1584 | static void final_note(u32 *buf) | ||
1585 | { | ||
1586 | struct elf_note note; | ||
1587 | |||
1588 | note.n_namesz = 0; | ||
1589 | note.n_descsz = 0; | ||
1590 | note.n_type = 0; | ||
1591 | memcpy(buf, ¬e, sizeof(note)); | ||
1592 | } | ||
1593 | |||
1594 | void crash_save_cpu(struct pt_regs *regs, int cpu) | ||
1595 | { | ||
1596 | struct elf_prstatus prstatus; | ||
1597 | u32 *buf; | ||
1598 | |||
1599 | if ((cpu < 0) || (cpu >= nr_cpu_ids)) | ||
1600 | return; | ||
1601 | |||
1602 | /* Using ELF notes here is opportunistic. | ||
1603 | * I need a well defined structure format | ||
1604 | * for the data I pass, and I need tags | ||
1605 | * on the data to indicate what information I have | ||
1606 | * squirrelled away. ELF notes happen to provide | ||
1607 | * all of that, so there is no need to invent something new. | ||
1608 | */ | ||
1609 | buf = (u32 *)per_cpu_ptr(crash_notes, cpu); | ||
1610 | if (!buf) | ||
1611 | return; | ||
1612 | memset(&prstatus, 0, sizeof(prstatus)); | ||
1613 | prstatus.pr_pid = current->pid; | ||
1614 | elf_core_copy_kernel_regs(&prstatus.pr_reg, regs); | ||
1615 | buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, | ||
1616 | &prstatus, sizeof(prstatus)); | ||
1617 | final_note(buf); | ||
1618 | } | ||
1619 | |||
1620 | static int __init crash_notes_memory_init(void) | ||
1621 | { | ||
1622 | /* Allocate memory for saving cpu registers. */ | ||
1623 | crash_notes = alloc_percpu(note_buf_t); | ||
1624 | if (!crash_notes) { | ||
1625 | pr_warn("Kexec: Memory allocation for saving cpu register states failed\n"); | ||
1626 | return -ENOMEM; | ||
1627 | } | ||
1628 | return 0; | ||
1629 | } | ||
1630 | subsys_initcall(crash_notes_memory_init); | ||
1631 | |||
1632 | |||
1633 | /* | ||
1634 | * parsing the "crashkernel" commandline | ||
1635 | * | ||
1636 | * this code is intended to be called from architecture specific code | ||
1637 | */ | ||
1638 | |||
1639 | |||
1640 | /* | ||
1641 | * This function parses command lines in the format | ||
1642 | * | ||
1643 | * crashkernel=ramsize-range:size[,...][@offset] | ||
1644 | * | ||
1645 | * The function returns 0 on success and -EINVAL on failure. | ||
1646 | */ | ||
1647 | static int __init parse_crashkernel_mem(char *cmdline, | ||
1648 | unsigned long long system_ram, | ||
1649 | unsigned long long *crash_size, | ||
1650 | unsigned long long *crash_base) | ||
1651 | { | ||
1652 | char *cur = cmdline, *tmp; | ||
1653 | |||
1654 | /* for each entry of the comma-separated list */ | ||
1655 | do { | ||
1656 | unsigned long long start, end = ULLONG_MAX, size; | ||
1657 | |||
1658 | /* get the start of the range */ | ||
1659 | start = memparse(cur, &tmp); | ||
1660 | if (cur == tmp) { | ||
1661 | pr_warn("crashkernel: Memory value expected\n"); | ||
1662 | return -EINVAL; | ||
1663 | } | ||
1664 | cur = tmp; | ||
1665 | if (*cur != '-') { | ||
1666 | pr_warn("crashkernel: '-' expected\n"); | ||
1667 | return -EINVAL; | ||
1668 | } | ||
1669 | cur++; | ||
1670 | |||
1671 | /* if no ':' is here, then we read the end */ | ||
1672 | if (*cur != ':') { | ||
1673 | end = memparse(cur, &tmp); | ||
1674 | if (cur == tmp) { | ||
1675 | pr_warn("crashkernel: Memory value expected\n"); | ||
1676 | return -EINVAL; | ||
1677 | } | ||
1678 | cur = tmp; | ||
1679 | if (end <= start) { | ||
1680 | pr_warn("crashkernel: end <= start\n"); | ||
1681 | return -EINVAL; | ||
1682 | } | ||
1683 | } | ||
1684 | |||
1685 | if (*cur != ':') { | ||
1686 | pr_warn("crashkernel: ':' expected\n"); | ||
1687 | return -EINVAL; | ||
1688 | } | ||
1689 | cur++; | ||
1690 | |||
1691 | size = memparse(cur, &tmp); | ||
1692 | if (cur == tmp) { | ||
1693 | pr_warn("Memory value expected\n"); | ||
1694 | return -EINVAL; | ||
1695 | } | ||
1696 | cur = tmp; | ||
1697 | if (size >= system_ram) { | ||
1698 | pr_warn("crashkernel: invalid size\n"); | ||
1699 | return -EINVAL; | ||
1700 | } | ||
1701 | |||
1702 | /* match ? */ | ||
1703 | if (system_ram >= start && system_ram < end) { | ||
1704 | *crash_size = size; | ||
1705 | break; | ||
1706 | } | ||
1707 | } while (*cur++ == ','); | ||
1708 | |||
1709 | if (*crash_size > 0) { | ||
1710 | while (*cur && *cur != ' ' && *cur != '@') | ||
1711 | cur++; | ||
1712 | if (*cur == '@') { | ||
1713 | cur++; | ||
1714 | *crash_base = memparse(cur, &tmp); | ||
1715 | if (cur == tmp) { | ||
1716 | pr_warn("Memory value expected after '@'\n"); | ||
1717 | return -EINVAL; | ||
1718 | } | ||
1719 | } | ||
1720 | } | ||
1721 | |||
1722 | return 0; | ||
1723 | } | ||
1724 | |||
1725 | /* | ||
1726 | * This function parses "simple" (old) crashkernel command lines like | ||
1727 | * | ||
1728 | * crashkernel=size[@offset] | ||
1729 | * | ||
1730 | * It returns 0 on success and -EINVAL on failure. | ||
1731 | */ | ||
1732 | static int __init parse_crashkernel_simple(char *cmdline, | ||
1733 | unsigned long long *crash_size, | ||
1734 | unsigned long long *crash_base) | ||
1735 | { | ||
1736 | char *cur = cmdline; | ||
1737 | |||
1738 | *crash_size = memparse(cmdline, &cur); | ||
1739 | if (cmdline == cur) { | ||
1740 | pr_warn("crashkernel: memory value expected\n"); | ||
1741 | return -EINVAL; | ||
1742 | } | ||
1743 | |||
1744 | if (*cur == '@') | ||
1745 | *crash_base = memparse(cur+1, &cur); | ||
1746 | else if (*cur != ' ' && *cur != '\0') { | ||
1747 | pr_warn("crashkernel: unrecognized char\n"); | ||
1748 | return -EINVAL; | ||
1749 | } | ||
1750 | |||
1751 | return 0; | ||
1752 | } | ||
1753 | |||
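The "simple" syntax is just a size with an optional @offset, and both numbers go through memparse(), so they accept K/M/G suffixes. A rough userspace parser for the same grammar; memparse_like() is a local stand-in for the kernel helper, not its actual implementation:

#include <stdio.h>
#include <stdlib.h>

/* Minimal stand-in for the kernel's memparse(): number + optional K/M/G. */
static unsigned long long memparse_like(const char *s, char **retptr)
{
        char *end;
        unsigned long long val = strtoull(s, &end, 0);

        switch (*end) {
        case 'G': case 'g': val <<= 10;         /* fall through */
        case 'M': case 'm': val <<= 10;         /* fall through */
        case 'K': case 'k': val <<= 10; end++; break;
        default: break;
        }
        if (retptr)
                *retptr = end;
        return val;
}

/* Parse "crashkernel=size[@offset]" bodies such as "256M@16M". */
static int parse_simple(const char *cmdline, unsigned long long *size,
                        unsigned long long *base)
{
        char *cur;

        *size = memparse_like(cmdline, &cur);
        if (cur == cmdline)
                return -1;                      /* no number at all */
        if (*cur == '@')
                *base = memparse_like(cur + 1, &cur);
        else if (*cur != ' ' && *cur != '\0')
                return -1;                      /* trailing junk */
        return 0;
}

int main(void)
{
        unsigned long long size = 0, base = 0;

        if (parse_simple("256M@16M", &size, &base) == 0)
                printf("size=%#llx base=%#llx\n", size, base);
        return 0;
}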
1754 | #define SUFFIX_HIGH 0 | ||
1755 | #define SUFFIX_LOW 1 | ||
1756 | #define SUFFIX_NULL 2 | ||
1757 | static __initdata char *suffix_tbl[] = { | ||
1758 | [SUFFIX_HIGH] = ",high", | ||
1759 | [SUFFIX_LOW] = ",low", | ||
1760 | [SUFFIX_NULL] = NULL, | ||
1761 | }; | ||
1762 | |||
1763 | /* | ||
1764 | * This function parses "suffix" crashkernel command lines like | ||
1765 | * | ||
1766 | * crashkernel=size,[high|low] | ||
1767 | * | ||
1768 | * It returns 0 on success and -EINVAL on failure. | ||
1769 | */ | ||
1770 | static int __init parse_crashkernel_suffix(char *cmdline, | ||
1771 | unsigned long long *crash_size, | ||
1772 | const char *suffix) | ||
1773 | { | ||
1774 | char *cur = cmdline; | ||
1775 | |||
1776 | *crash_size = memparse(cmdline, &cur); | ||
1777 | if (cmdline == cur) { | ||
1778 | pr_warn("crashkernel: memory value expected\n"); | ||
1779 | return -EINVAL; | ||
1780 | } | ||
1781 | |||
1782 | /* check with suffix */ | ||
1783 | if (strncmp(cur, suffix, strlen(suffix))) { | ||
1784 | pr_warn("crashkernel: unrecognized char\n"); | ||
1785 | return -EINVAL; | ||
1786 | } | ||
1787 | cur += strlen(suffix); | ||
1788 | if (*cur != ' ' && *cur != '\0') { | ||
1789 | pr_warn("crashkernel: unrecognized char\n"); | ||
1790 | return -EINVAL; | ||
1791 | } | ||
1792 | |||
1793 | return 0; | ||
1794 | } | ||
1795 | |||
1796 | static __init char *get_last_crashkernel(char *cmdline, | ||
1797 | const char *name, | ||
1798 | const char *suffix) | ||
1799 | { | ||
1800 | char *p = cmdline, *ck_cmdline = NULL; | ||
1801 | |||
1802 | /* find crashkernel and use the last one if there are more */ | ||
1803 | p = strstr(p, name); | ||
1804 | while (p) { | ||
1805 | char *end_p = strchr(p, ' '); | ||
1806 | char *q; | ||
1807 | |||
1808 | if (!end_p) | ||
1809 | end_p = p + strlen(p); | ||
1810 | |||
1811 | if (!suffix) { | ||
1812 | int i; | ||
1813 | |||
1814 | /* skip the one with any known suffix */ | ||
1815 | for (i = 0; suffix_tbl[i]; i++) { | ||
1816 | q = end_p - strlen(suffix_tbl[i]); | ||
1817 | if (!strncmp(q, suffix_tbl[i], | ||
1818 | strlen(suffix_tbl[i]))) | ||
1819 | goto next; | ||
1820 | } | ||
1821 | ck_cmdline = p; | ||
1822 | } else { | ||
1823 | q = end_p - strlen(suffix); | ||
1824 | if (!strncmp(q, suffix, strlen(suffix))) | ||
1825 | ck_cmdline = p; | ||
1826 | } | ||
1827 | next: | ||
1828 | p = strstr(p+1, name); | ||
1829 | } | ||
1830 | |||
1831 | if (!ck_cmdline) | ||
1832 | return NULL; | ||
1833 | |||
1834 | return ck_cmdline; | ||
1835 | } | ||
1836 | |||
1837 | static int __init __parse_crashkernel(char *cmdline, | ||
1838 | unsigned long long system_ram, | ||
1839 | unsigned long long *crash_size, | ||
1840 | unsigned long long *crash_base, | ||
1841 | const char *name, | ||
1842 | const char *suffix) | ||
1843 | { | ||
1844 | char *first_colon, *first_space; | ||
1845 | char *ck_cmdline; | ||
1846 | |||
1847 | BUG_ON(!crash_size || !crash_base); | ||
1848 | *crash_size = 0; | ||
1849 | *crash_base = 0; | ||
1850 | |||
1851 | ck_cmdline = get_last_crashkernel(cmdline, name, suffix); | ||
1852 | |||
1853 | if (!ck_cmdline) | ||
1854 | return -EINVAL; | ||
1855 | |||
1856 | ck_cmdline += strlen(name); | ||
1857 | |||
1858 | if (suffix) | ||
1859 | return parse_crashkernel_suffix(ck_cmdline, crash_size, | ||
1860 | suffix); | ||
1861 | /* | ||
1862 | * if the commandline contains a ':', then that's the extended | ||
1863 | * syntax -- if not, it must be the classic syntax | ||
1864 | */ | ||
1865 | first_colon = strchr(ck_cmdline, ':'); | ||
1866 | first_space = strchr(ck_cmdline, ' '); | ||
1867 | if (first_colon && (!first_space || first_colon < first_space)) | ||
1868 | return parse_crashkernel_mem(ck_cmdline, system_ram, | ||
1869 | crash_size, crash_base); | ||
1870 | |||
1871 | return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base); | ||
1872 | } | ||
1873 | |||
1874 | /* | ||
1875 | * This function is the entry point for command line parsing and should be | ||
1876 | * called from the arch-specific code. | ||
1877 | */ | ||
1878 | int __init parse_crashkernel(char *cmdline, | ||
1879 | unsigned long long system_ram, | ||
1880 | unsigned long long *crash_size, | ||
1881 | unsigned long long *crash_base) | ||
1882 | { | ||
1883 | return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, | ||
1884 | "crashkernel=", NULL); | ||
1885 | } | ||
1886 | |||
1887 | int __init parse_crashkernel_high(char *cmdline, | ||
1888 | unsigned long long system_ram, | ||
1889 | unsigned long long *crash_size, | ||
1890 | unsigned long long *crash_base) | ||
1891 | { | ||
1892 | return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, | ||
1893 | "crashkernel=", suffix_tbl[SUFFIX_HIGH]); | ||
1894 | } | ||
1895 | |||
1896 | int __init parse_crashkernel_low(char *cmdline, | ||
1897 | unsigned long long system_ram, | ||
1898 | unsigned long long *crash_size, | ||
1899 | unsigned long long *crash_base) | ||
1900 | { | ||
1901 | return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, | ||
1902 | "crashkernel=", suffix_tbl[SUFFIX_LOW]); | ||
1903 | } | ||
1904 | |||
1905 | static void update_vmcoreinfo_note(void) | ||
1906 | { | ||
1907 | u32 *buf = vmcoreinfo_note; | ||
1908 | |||
1909 | if (!vmcoreinfo_size) | ||
1910 | return; | ||
1911 | buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data, | ||
1912 | vmcoreinfo_size); | ||
1913 | final_note(buf); | ||
1914 | } | ||
1915 | |||
1916 | void crash_save_vmcoreinfo(void) | ||
1917 | { | ||
1918 | vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds()); | ||
1919 | update_vmcoreinfo_note(); | ||
1920 | } | ||
1921 | |||
1922 | void vmcoreinfo_append_str(const char *fmt, ...) | ||
1923 | { | ||
1924 | va_list args; | ||
1925 | char buf[0x50]; | ||
1926 | size_t r; | ||
1927 | |||
1928 | va_start(args, fmt); | ||
1929 | r = vscnprintf(buf, sizeof(buf), fmt, args); | ||
1930 | va_end(args); | ||
1931 | |||
1932 | r = min(r, vmcoreinfo_max_size - vmcoreinfo_size); | ||
1933 | |||
1934 | memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); | ||
1935 | |||
1936 | vmcoreinfo_size += r; | ||
1937 | } | ||
1938 | |||
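vmcoreinfo_append_str() formats into a small scratch buffer and copies at most the remaining free space into the global vmcoreinfo area, so an over-long string is truncated rather than overflowing. A userspace version of the same bounded-append pattern (buffer sizes here are arbitrary):

#include <stdio.h>
#include <stdarg.h>
#include <string.h>

#define INFO_MAX 64                             /* stand-in for vmcoreinfo_max_size */

static char info_data[INFO_MAX];
static size_t info_size;

static void info_append(const char *fmt, ...)
{
        char buf[0x50];
        va_list args;
        size_t r, room = INFO_MAX - info_size;
        int n;

        va_start(args, fmt);
        n = vsnprintf(buf, sizeof(buf), fmt, args);
        va_end(args);
        if (n < 0)
                return;
        r = (size_t)n < sizeof(buf) - 1 ? (size_t)n : sizeof(buf) - 1;
        if (r > room)                           /* clamp to what's left, like the kernel */
                r = room;
        memcpy(info_data + info_size, buf, r);
        info_size += r;
}

int main(void)
{
        info_append("CRASHTIME=%ld\n", 1444650567L);
        info_append("OSRELEASE=%s\n", "4.3.0-rc4");
        printf("%zu bytes: %.*s", info_size, (int)info_size, info_data);
        return 0;
}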
1939 | /* | ||
1940 | * provide an empty default implementation here -- architecture | ||
1941 | * code may override this | ||
1942 | */ | ||
1943 | void __weak arch_crash_save_vmcoreinfo(void) | ||
1944 | {} | ||
1945 | |||
1946 | unsigned long __weak paddr_vmcoreinfo_note(void) | ||
1947 | { | ||
1948 | return __pa((unsigned long)(char *)&vmcoreinfo_note); | ||
1949 | } | ||
1950 | |||
1951 | static int __init crash_save_vmcoreinfo_init(void) | ||
1952 | { | ||
1953 | VMCOREINFO_OSRELEASE(init_uts_ns.name.release); | ||
1954 | VMCOREINFO_PAGESIZE(PAGE_SIZE); | ||
1955 | |||
1956 | VMCOREINFO_SYMBOL(init_uts_ns); | ||
1957 | VMCOREINFO_SYMBOL(node_online_map); | ||
1958 | #ifdef CONFIG_MMU | ||
1959 | VMCOREINFO_SYMBOL(swapper_pg_dir); | ||
1960 | #endif | ||
1961 | VMCOREINFO_SYMBOL(_stext); | ||
1962 | VMCOREINFO_SYMBOL(vmap_area_list); | ||
1963 | |||
1964 | #ifndef CONFIG_NEED_MULTIPLE_NODES | ||
1965 | VMCOREINFO_SYMBOL(mem_map); | ||
1966 | VMCOREINFO_SYMBOL(contig_page_data); | ||
1967 | #endif | ||
1968 | #ifdef CONFIG_SPARSEMEM | ||
1969 | VMCOREINFO_SYMBOL(mem_section); | ||
1970 | VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); | ||
1971 | VMCOREINFO_STRUCT_SIZE(mem_section); | ||
1972 | VMCOREINFO_OFFSET(mem_section, section_mem_map); | ||
1973 | #endif | ||
1974 | VMCOREINFO_STRUCT_SIZE(page); | ||
1975 | VMCOREINFO_STRUCT_SIZE(pglist_data); | ||
1976 | VMCOREINFO_STRUCT_SIZE(zone); | ||
1977 | VMCOREINFO_STRUCT_SIZE(free_area); | ||
1978 | VMCOREINFO_STRUCT_SIZE(list_head); | ||
1979 | VMCOREINFO_SIZE(nodemask_t); | ||
1980 | VMCOREINFO_OFFSET(page, flags); | ||
1981 | VMCOREINFO_OFFSET(page, _count); | ||
1982 | VMCOREINFO_OFFSET(page, mapping); | ||
1983 | VMCOREINFO_OFFSET(page, lru); | ||
1984 | VMCOREINFO_OFFSET(page, _mapcount); | ||
1985 | VMCOREINFO_OFFSET(page, private); | ||
1986 | VMCOREINFO_OFFSET(pglist_data, node_zones); | ||
1987 | VMCOREINFO_OFFSET(pglist_data, nr_zones); | ||
1988 | #ifdef CONFIG_FLAT_NODE_MEM_MAP | ||
1989 | VMCOREINFO_OFFSET(pglist_data, node_mem_map); | ||
1990 | #endif | ||
1991 | VMCOREINFO_OFFSET(pglist_data, node_start_pfn); | ||
1992 | VMCOREINFO_OFFSET(pglist_data, node_spanned_pages); | ||
1993 | VMCOREINFO_OFFSET(pglist_data, node_id); | ||
1994 | VMCOREINFO_OFFSET(zone, free_area); | ||
1995 | VMCOREINFO_OFFSET(zone, vm_stat); | ||
1996 | VMCOREINFO_OFFSET(zone, spanned_pages); | ||
1997 | VMCOREINFO_OFFSET(free_area, free_list); | ||
1998 | VMCOREINFO_OFFSET(list_head, next); | ||
1999 | VMCOREINFO_OFFSET(list_head, prev); | ||
2000 | VMCOREINFO_OFFSET(vmap_area, va_start); | ||
2001 | VMCOREINFO_OFFSET(vmap_area, list); | ||
2002 | VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); | ||
2003 | log_buf_kexec_setup(); | ||
2004 | VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); | ||
2005 | VMCOREINFO_NUMBER(NR_FREE_PAGES); | ||
2006 | VMCOREINFO_NUMBER(PG_lru); | ||
2007 | VMCOREINFO_NUMBER(PG_private); | ||
2008 | VMCOREINFO_NUMBER(PG_swapcache); | ||
2009 | VMCOREINFO_NUMBER(PG_slab); | ||
2010 | #ifdef CONFIG_MEMORY_FAILURE | ||
2011 | VMCOREINFO_NUMBER(PG_hwpoison); | ||
2012 | #endif | ||
2013 | VMCOREINFO_NUMBER(PG_head_mask); | ||
2014 | VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); | ||
2015 | #ifdef CONFIG_HUGETLBFS | ||
2016 | VMCOREINFO_SYMBOL(free_huge_page); | ||
2017 | #endif | ||
2018 | |||
2019 | arch_crash_save_vmcoreinfo(); | ||
2020 | update_vmcoreinfo_note(); | ||
2021 | |||
2022 | return 0; | ||
2023 | } | ||
2024 | |||
2025 | subsys_initcall(crash_save_vmcoreinfo_init); | ||
2026 | |||
2027 | #ifdef CONFIG_KEXEC_FILE | ||
2028 | static int locate_mem_hole_top_down(unsigned long start, unsigned long end, | ||
2029 | struct kexec_buf *kbuf) | ||
2030 | { | ||
2031 | struct kimage *image = kbuf->image; | ||
2032 | unsigned long temp_start, temp_end; | ||
2033 | |||
2034 | temp_end = min(end, kbuf->buf_max); | ||
2035 | temp_start = temp_end - kbuf->memsz; | ||
2036 | |||
2037 | do { | ||
2038 | /* align down start */ | ||
2039 | temp_start = temp_start & (~(kbuf->buf_align - 1)); | ||
2040 | |||
2041 | if (temp_start < start || temp_start < kbuf->buf_min) | ||
2042 | return 0; | ||
2043 | |||
2044 | temp_end = temp_start + kbuf->memsz - 1; | ||
2045 | |||
2046 | /* | ||
2047 | * Make sure this does not conflict with any existing | ||
2048 | * segments. | ||
2049 | */ | ||
2050 | if (kimage_is_destination_range(image, temp_start, temp_end)) { | ||
2051 | temp_start = temp_start - PAGE_SIZE; | ||
2052 | continue; | ||
2053 | } | ||
2054 | |||
2055 | /* We found a suitable memory range */ | ||
2056 | break; | ||
2057 | } while (1); | ||
2058 | |||
2059 | /* If we are here, we found a suitable memory range */ | ||
2060 | kbuf->mem = temp_start; | ||
2061 | |||
2062 | /* Success, stop navigating through remaining System RAM ranges */ | ||
2063 | return 1; | ||
2064 | } | ||
2065 | |||
2066 | static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end, | ||
2067 | struct kexec_buf *kbuf) | ||
2068 | { | ||
2069 | struct kimage *image = kbuf->image; | ||
2070 | unsigned long temp_start, temp_end; | ||
2071 | |||
2072 | temp_start = max(start, kbuf->buf_min); | ||
2073 | |||
2074 | do { | ||
2075 | temp_start = ALIGN(temp_start, kbuf->buf_align); | ||
2076 | temp_end = temp_start + kbuf->memsz - 1; | ||
2077 | |||
2078 | if (temp_end > end || temp_end > kbuf->buf_max) | ||
2079 | return 0; | ||
2080 | /* | ||
2081 | * Make sure this does not conflict with any existing | ||
2082 | * segments. | ||
2083 | */ | ||
2084 | if (kimage_is_destination_range(image, temp_start, temp_end)) { | ||
2085 | temp_start = temp_start + PAGE_SIZE; | ||
2086 | continue; | ||
2087 | } | ||
2088 | |||
2089 | /* We found a suitable memory range */ | ||
2090 | break; | ||
2091 | } while (1); | ||
2092 | |||
2093 | /* If we are here, we found a suitable memory range */ | ||
2094 | kbuf->mem = temp_start; | ||
2095 | |||
2096 | /* Success, stop navigating through remaining System RAM ranges */ | ||
2097 | return 1; | ||
2098 | } | ||
2099 | |||
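Both locators walk a single System RAM range: the bottom-up variant rounds the candidate start up to the requested alignment and slides forward past conflicts a page at a time, while the top-down variant starts at the far end, rounds down, and slides backwards. A compact sketch of the bottom-up search against a plain array of busy ranges (half-open intervals, invented types and numbers):

#include <stdio.h>

struct range { unsigned long start, end; };     /* half-open [start, end) */

static int overlaps(const struct range *busy, int n,
                    unsigned long s, unsigned long e)
{
        int i;

        for (i = 0; i < n; i++)
                if (e > busy[i].start && s < busy[i].end)
                        return 1;
        return 0;
}

/* Bottom-up: first aligned, conflict-free hole of `size` bytes in [start, end). */
static unsigned long hole_bottom_up(unsigned long start, unsigned long end,
                                    unsigned long size, unsigned long align,
                                    const struct range *busy, int n)
{
        unsigned long s = start;

        for (;;) {
                s = (s + align - 1) & ~(align - 1);     /* ALIGN() up */
                if (s + size > end)
                        return 0;                       /* range exhausted */
                if (!overlaps(busy, n, s, s + size))
                        return s;
                s += 0x1000;                            /* slide forward a page */
        }
}

int main(void)
{
        struct range busy[] = { { 0x100000, 0x180000 } };
        unsigned long hole = hole_bottom_up(0x100000, 0x400000, 0x20000, 0x10000,
                                            busy, 1);

        printf("hole at %#lx\n", hole);                 /* expect 0x180000 */
        return 0;
}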
2100 | static int locate_mem_hole_callback(u64 start, u64 end, void *arg) | ||
2101 | { | ||
2102 | struct kexec_buf *kbuf = (struct kexec_buf *)arg; | ||
2103 | unsigned long sz = end - start + 1; | ||
2104 | |||
2105 | /* Returning 0 will move on to the next memory range */ | ||
2106 | if (sz < kbuf->memsz) | ||
2107 | return 0; | ||
2108 | |||
2109 | if (end < kbuf->buf_min || start > kbuf->buf_max) | ||
2110 | return 0; | ||
2111 | |||
2112 | /* | ||
2113 | * Allocate memory top down within the RAM range; otherwise allocate | ||
2114 | * bottom up. | ||
2115 | */ | ||
2116 | if (kbuf->top_down) | ||
2117 | return locate_mem_hole_top_down(start, end, kbuf); | ||
2118 | return locate_mem_hole_bottom_up(start, end, kbuf); | ||
2119 | } | ||
2120 | |||
2121 | /* | ||
2122 | * Helper function for placing a buffer in a kexec segment. This assumes | ||
2123 | * that kexec_mutex is held. | ||
2124 | */ | ||
2125 | int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz, | ||
2126 | unsigned long memsz, unsigned long buf_align, | ||
2127 | unsigned long buf_min, unsigned long buf_max, | ||
2128 | bool top_down, unsigned long *load_addr) | ||
2129 | { | ||
2130 | |||
2131 | struct kexec_segment *ksegment; | ||
2132 | struct kexec_buf buf, *kbuf; | ||
2133 | int ret; | ||
2134 | |||
2135 | /* Currently, adding a segment this way is allowed only in file mode */ | ||
2136 | if (!image->file_mode) | ||
2137 | return -EINVAL; | ||
2138 | |||
2139 | if (image->nr_segments >= KEXEC_SEGMENT_MAX) | ||
2140 | return -EINVAL; | ||
2141 | |||
2142 | /* | ||
2143 | * Make sure we are not trying to add a buffer after allocating | ||
2144 | * control pages. All segments need to be placed before | ||
2145 | * any control pages are allocated, as the control page allocation | ||
2146 | * logic goes through the list of segments to make sure there are | ||
2147 | * no destination overlaps. | ||
2148 | */ | ||
2149 | if (!list_empty(&image->control_pages)) { | ||
2150 | WARN_ON(1); | ||
2151 | return -EINVAL; | ||
2152 | } | ||
2153 | |||
2154 | memset(&buf, 0, sizeof(struct kexec_buf)); | ||
2155 | kbuf = &buf; | ||
2156 | kbuf->image = image; | ||
2157 | kbuf->buffer = buffer; | ||
2158 | kbuf->bufsz = bufsz; | ||
2159 | |||
2160 | kbuf->memsz = ALIGN(memsz, PAGE_SIZE); | ||
2161 | kbuf->buf_align = max(buf_align, PAGE_SIZE); | ||
2162 | kbuf->buf_min = buf_min; | ||
2163 | kbuf->buf_max = buf_max; | ||
2164 | kbuf->top_down = top_down; | ||
2165 | |||
2166 | /* Walk the RAM ranges and allocate a suitable range for the buffer */ | ||
2167 | if (image->type == KEXEC_TYPE_CRASH) | ||
2168 | ret = walk_iomem_res("Crash kernel", | ||
2169 | IORESOURCE_MEM | IORESOURCE_BUSY, | ||
2170 | crashk_res.start, crashk_res.end, kbuf, | ||
2171 | locate_mem_hole_callback); | ||
2172 | else | ||
2173 | ret = walk_system_ram_res(0, -1, kbuf, | ||
2174 | locate_mem_hole_callback); | ||
2175 | if (ret != 1) { | ||
2176 | /* A suitable memory range could not be found for buffer */ | ||
2177 | return -EADDRNOTAVAIL; | ||
2178 | } | ||
2179 | |||
2180 | /* Found a suitable memory range */ | ||
2181 | ksegment = &image->segment[image->nr_segments]; | ||
2182 | ksegment->kbuf = kbuf->buffer; | ||
2183 | ksegment->bufsz = kbuf->bufsz; | ||
2184 | ksegment->mem = kbuf->mem; | ||
2185 | ksegment->memsz = kbuf->memsz; | ||
2186 | image->nr_segments++; | ||
2187 | *load_addr = ksegment->mem; | ||
2188 | return 0; | ||
2189 | } | ||
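kexec_add_buffer() is mostly bookkeeping around that walk: round memsz up to whole pages, never accept an alignment smaller than a page, have the resource walker pick a hole, then record the result as a new kexec_segment and hand the physical address back through *load_addr. The sketch below models only that bookkeeping in user space; kseg, add_buffer() and pick_hole() are made-up names, and pick_hole() just hands out increasing addresses instead of walking real RAM ranges.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define PAGE_SZ 4096UL
#define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((a) - 1))
#define MAX_SEGS 16

struct kseg { const void *kbuf; size_t bufsz; uint64_t mem; size_t memsz; };

static struct kseg segs[MAX_SEGS];
static int nsegs;

/* Stand-in for the RAM walk: hand out page-aligned addresses upward. */
static uint64_t pick_hole(size_t memsz, size_t align)
{
    static uint64_t next = 0x100000;            /* pretend RAM starts at 1 MiB */
    uint64_t addr = ALIGN_UP(next, align);

    next = addr + memsz;
    return addr;
}

/* Round sizes/alignment the way the kernel helper does, then record a segment. */
static int add_buffer(const void *buf, size_t bufsz, size_t memsz,
                      size_t align, uint64_t *load_addr)
{
    struct kseg *s;

    if (nsegs >= MAX_SEGS)
        return -1;
    memsz = ALIGN_UP(memsz, PAGE_SZ);           /* whole pages only */
    if (align < PAGE_SZ)
        align = PAGE_SZ;                        /* at least page aligned */

    s = &segs[nsegs++];
    s->kbuf = buf;
    s->bufsz = bufsz;
    s->memsz = memsz;
    s->mem = pick_hole(memsz, align);
    *load_addr = s->mem;
    return 0;
}

int main(void)
{
    static const char payload[100] = "fake purgatory";
    uint64_t where;

    if (!add_buffer(payload, sizeof(payload), sizeof(payload), 64, &where))
        printf("segment 0: %zu bytes backed, %zu in memory at 0x%llx\n",
               segs[0].bufsz, segs[0].memsz, (unsigned long long)where);
    return 0;
}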
2190 | |||
2191 | /* Calculate and store the digest of segments */ | ||
2192 | static int kexec_calculate_store_digests(struct kimage *image) | ||
2193 | { | ||
2194 | struct crypto_shash *tfm; | ||
2195 | struct shash_desc *desc; | ||
2196 | int ret = 0, i, j, zero_buf_sz, sha_region_sz; | ||
2197 | size_t desc_size, nullsz; | ||
2198 | char *digest; | ||
2199 | void *zero_buf; | ||
2200 | struct kexec_sha_region *sha_regions; | ||
2201 | struct purgatory_info *pi = &image->purgatory_info; | ||
2202 | |||
2203 | zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT); | ||
2204 | zero_buf_sz = PAGE_SIZE; | ||
2205 | |||
2206 | tfm = crypto_alloc_shash("sha256", 0, 0); | ||
2207 | if (IS_ERR(tfm)) { | ||
2208 | ret = PTR_ERR(tfm); | ||
2209 | goto out; | ||
2210 | } | ||
2211 | |||
2212 | desc_size = crypto_shash_descsize(tfm) + sizeof(*desc); | ||
2213 | desc = kzalloc(desc_size, GFP_KERNEL); | ||
2214 | if (!desc) { | ||
2215 | ret = -ENOMEM; | ||
2216 | goto out_free_tfm; | ||
2217 | } | ||
2218 | |||
2219 | sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region); | ||
2220 | sha_regions = vzalloc(sha_region_sz); | ||
2221 | if (!sha_regions) | ||
2222 | goto out_free_desc; | ||
2223 | |||
2224 | desc->tfm = tfm; | ||
2225 | desc->flags = 0; | ||
2226 | |||
2227 | ret = crypto_shash_init(desc); | ||
2228 | if (ret < 0) | ||
2229 | goto out_free_sha_regions; | ||
2230 | |||
2231 | digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL); | ||
2232 | if (!digest) { | ||
2233 | ret = -ENOMEM; | ||
2234 | goto out_free_sha_regions; | ||
2235 | } | ||
2236 | |||
2237 | for (j = i = 0; i < image->nr_segments; i++) { | ||
2238 | struct kexec_segment *ksegment; | ||
2239 | |||
2240 | ksegment = &image->segment[i]; | ||
2241 | /* | ||
2242 | * Skip purgatory as it will be modified once we put digest | ||
2243 | * info in purgatory. | ||
2244 | */ | ||
2245 | if (ksegment->kbuf == pi->purgatory_buf) | ||
2246 | continue; | ||
2247 | |||
2248 | ret = crypto_shash_update(desc, ksegment->kbuf, | ||
2249 | ksegment->bufsz); | ||
2250 | if (ret) | ||
2251 | break; | ||
2252 | |||
2253 | /* | ||
2254 | * Assume the rest of the buffer is filled with zeroes and | ||
2255 | * update digest accordingly. | ||
2256 | */ | ||
2257 | nullsz = ksegment->memsz - ksegment->bufsz; | ||
2258 | while (nullsz) { | ||
2259 | unsigned long bytes = nullsz; | ||
2260 | |||
2261 | if (bytes > zero_buf_sz) | ||
2262 | bytes = zero_buf_sz; | ||
2263 | ret = crypto_shash_update(desc, zero_buf, bytes); | ||
2264 | if (ret) | ||
2265 | break; | ||
2266 | nullsz -= bytes; | ||
2267 | } | ||
2268 | |||
2269 | if (ret) | ||
2270 | break; | ||
2271 | |||
2272 | sha_regions[j].start = ksegment->mem; | ||
2273 | sha_regions[j].len = ksegment->memsz; | ||
2274 | j++; | ||
2275 | } | ||
2276 | |||
2277 | if (!ret) { | ||
2278 | ret = crypto_shash_final(desc, digest); | ||
2279 | if (ret) | ||
2280 | goto out_free_digest; | ||
2281 | ret = kexec_purgatory_get_set_symbol(image, "sha_regions", | ||
2282 | sha_regions, sha_region_sz, 0); | ||
2283 | if (ret) | ||
2284 | goto out_free_digest; | ||
2285 | |||
2286 | ret = kexec_purgatory_get_set_symbol(image, "sha256_digest", | ||
2287 | digest, SHA256_DIGEST_SIZE, 0); | ||
2288 | if (ret) | ||
2289 | goto out_free_digest; | ||
2290 | } | ||
2291 | |||
2292 | out_free_digest: | ||
2293 | kfree(digest); | ||
2294 | out_free_sha_regions: | ||
2295 | vfree(sha_regions); | ||
2296 | out_free_desc: | ||
2297 | kfree(desc); | ||
2298 | out_free_tfm: | ||
2299 | crypto_free_shash(tfm); | ||
2300 | out: | ||
2301 | return ret; | ||
2302 | } | ||
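One detail of kexec_calculate_store_digests() worth spelling out: only bufsz bytes of a segment are backed by data, but purgatory will later measure the full memsz region, so the tail is hashed as zeroes, one page-sized zero buffer at a time. The sketch below reproduces that chunking with a throwaway incremental FNV-1a checksum standing in for crypto_shash_update(); the padding loop is the point, not the hash.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define PAGE_SZ 4096UL

/* Toy incremental hash used in place of crypto_shash_update(). */
static void fnv1a_update(uint64_t *state, const unsigned char *p, size_t len)
{
    while (len--) {
        *state ^= *p++;
        *state *= 0x100000001b3ULL;
    }
}

/* Hash bufsz real bytes, then (memsz - bufsz) zero bytes in page chunks. */
static uint64_t digest_segment(const unsigned char *buf, size_t bufsz,
                               size_t memsz)
{
    static const unsigned char zero_page[PAGE_SZ];   /* all zeroes */
    uint64_t state = 0xcbf29ce484222325ULL;
    size_t nullsz = memsz - bufsz;

    fnv1a_update(&state, buf, bufsz);
    while (nullsz) {
        size_t bytes = nullsz > PAGE_SZ ? PAGE_SZ : nullsz;

        fnv1a_update(&state, zero_page, bytes);
        nullsz -= bytes;
    }
    return state;
}

int main(void)
{
    unsigned char data[100];
    unsigned char padded[3 * PAGE_SZ] = { 0 };
    uint64_t chunked, flat = 0xcbf29ce484222325ULL;

    memset(data, 0xab, sizeof(data));
    memcpy(padded, data, sizeof(data));

    chunked = digest_segment(data, sizeof(data), sizeof(padded));
    fnv1a_update(&flat, padded, sizeof(padded));
    printf("chunked and flat digests %s\n", chunked == flat ? "match" : "differ");
    return 0;
}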
2303 | |||
2304 | /* Actually load purgatory. A lot of the code is taken from kexec-tools */ | ||
2305 | static int __kexec_load_purgatory(struct kimage *image, unsigned long min, | ||
2306 | unsigned long max, int top_down) | ||
2307 | { | ||
2308 | struct purgatory_info *pi = &image->purgatory_info; | ||
2309 | unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad; | ||
2310 | unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset; | ||
2311 | unsigned char *buf_addr, *src; | ||
2312 | int i, ret = 0, entry_sidx = -1; | ||
2313 | const Elf_Shdr *sechdrs_c; | ||
2314 | Elf_Shdr *sechdrs = NULL; | ||
2315 | void *purgatory_buf = NULL; | ||
2316 | |||
2317 | /* | ||
2318 | * sechdrs_c points to the section headers in purgatory and is | ||
2319 | * read-only. No modifications allowed. | ||
2320 | */ | ||
2321 | sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff; | ||
2322 | |||
2323 | /* | ||
2324 | * We cannot modify sechdrs_c[] and its fields. It is read-only. | ||
2325 | * Copy it over to a local copy where one can store some temporary | ||
2326 | * data and free it at the end. We need to modify ->sh_addr and | ||
2327 | * ->sh_offset fields to keep track of permanent and temporary | ||
2328 | * locations of sections. | ||
2329 | */ | ||
2330 | sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr)); | ||
2331 | if (!sechdrs) | ||
2332 | return -ENOMEM; | ||
2333 | |||
2334 | memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr)); | ||
2335 | |||
2336 | /* | ||
2337 | * We end up with multiple copies of the sections. The first copy | ||
2338 | * is the one embedded in the kernel in a read-only section. Some | ||
2339 | * of these sections will be copied to a temporary buffer and | ||
2340 | * relocated, and then copied to their final destination at | ||
2341 | * segment load time. | ||
2342 | * | ||
2343 | * Use ->sh_offset to reflect section address in memory. It will | ||
2344 | * point to original read only copy if section is not allocatable. | ||
2345 | * Otherwise it will point to temporary copy which will be relocated. | ||
2346 | * | ||
2347 | * Use ->sh_addr to contain final address of the section where it | ||
2348 | * will go during execution time. | ||
2349 | */ | ||
2350 | for (i = 0; i < pi->ehdr->e_shnum; i++) { | ||
2351 | if (sechdrs[i].sh_type == SHT_NOBITS) | ||
2352 | continue; | ||
2353 | |||
2354 | sechdrs[i].sh_offset = (unsigned long)pi->ehdr + | ||
2355 | sechdrs[i].sh_offset; | ||
2356 | } | ||
2357 | |||
2358 | /* | ||
2359 | * Identify entry point section and make entry relative to section | ||
2360 | * start. | ||
2361 | */ | ||
2362 | entry = pi->ehdr->e_entry; | ||
2363 | for (i = 0; i < pi->ehdr->e_shnum; i++) { | ||
2364 | if (!(sechdrs[i].sh_flags & SHF_ALLOC)) | ||
2365 | continue; | ||
2366 | |||
2367 | if (!(sechdrs[i].sh_flags & SHF_EXECINSTR)) | ||
2368 | continue; | ||
2369 | |||
2370 | /* Make entry section relative */ | ||
2371 | if (sechdrs[i].sh_addr <= pi->ehdr->e_entry && | ||
2372 | ((sechdrs[i].sh_addr + sechdrs[i].sh_size) > | ||
2373 | pi->ehdr->e_entry)) { | ||
2374 | entry_sidx = i; | ||
2375 | entry -= sechdrs[i].sh_addr; | ||
2376 | break; | ||
2377 | } | ||
2378 | } | ||
2379 | |||
2380 | /* Determine how much memory is needed to load relocatable object. */ | ||
2381 | buf_align = 1; | ||
2382 | bss_align = 1; | ||
2383 | buf_sz = 0; | ||
2384 | bss_sz = 0; | ||
2385 | |||
2386 | for (i = 0; i < pi->ehdr->e_shnum; i++) { | ||
2387 | if (!(sechdrs[i].sh_flags & SHF_ALLOC)) | ||
2388 | continue; | ||
2389 | |||
2390 | align = sechdrs[i].sh_addralign; | ||
2391 | if (sechdrs[i].sh_type != SHT_NOBITS) { | ||
2392 | if (buf_align < align) | ||
2393 | buf_align = align; | ||
2394 | buf_sz = ALIGN(buf_sz, align); | ||
2395 | buf_sz += sechdrs[i].sh_size; | ||
2396 | } else { | ||
2397 | /* bss section */ | ||
2398 | if (bss_align < align) | ||
2399 | bss_align = align; | ||
2400 | bss_sz = ALIGN(bss_sz, align); | ||
2401 | bss_sz += sechdrs[i].sh_size; | ||
2402 | } | ||
2403 | } | ||
2404 | |||
2405 | /* Determine the bss padding required to align bss properly */ | ||
2406 | bss_pad = 0; | ||
2407 | if (buf_sz & (bss_align - 1)) | ||
2408 | bss_pad = bss_align - (buf_sz & (bss_align - 1)); | ||
2409 | |||
2410 | memsz = buf_sz + bss_pad + bss_sz; | ||
2411 | |||
2412 | /* Allocate buffer for purgatory */ | ||
2413 | purgatory_buf = vzalloc(buf_sz); | ||
2414 | if (!purgatory_buf) { | ||
2415 | ret = -ENOMEM; | ||
2416 | goto out; | ||
2417 | } | ||
2418 | |||
2419 | if (buf_align < bss_align) | ||
2420 | buf_align = bss_align; | ||
2421 | |||
2422 | /* Add buffer to segment list */ | ||
2423 | ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz, | ||
2424 | buf_align, min, max, top_down, | ||
2425 | &pi->purgatory_load_addr); | ||
2426 | if (ret) | ||
2427 | goto out; | ||
2428 | |||
2429 | /* Load SHF_ALLOC sections */ | ||
2430 | buf_addr = purgatory_buf; | ||
2431 | load_addr = curr_load_addr = pi->purgatory_load_addr; | ||
2432 | bss_addr = load_addr + buf_sz + bss_pad; | ||
2433 | |||
2434 | for (i = 0; i < pi->ehdr->e_shnum; i++) { | ||
2435 | if (!(sechdrs[i].sh_flags & SHF_ALLOC)) | ||
2436 | continue; | ||
2437 | |||
2438 | align = sechdrs[i].sh_addralign; | ||
2439 | if (sechdrs[i].sh_type != SHT_NOBITS) { | ||
2440 | curr_load_addr = ALIGN(curr_load_addr, align); | ||
2441 | offset = curr_load_addr - load_addr; | ||
2442 | /* We already modified ->sh_offset to keep the src addr */ | ||
2443 | src = (char *) sechdrs[i].sh_offset; | ||
2444 | memcpy(buf_addr + offset, src, sechdrs[i].sh_size); | ||
2445 | |||
2446 | /* Store load address and source address of section */ | ||
2447 | sechdrs[i].sh_addr = curr_load_addr; | ||
2448 | |||
2449 | /* | ||
2450 | * This section got copied to a temporary buffer. Update | ||
2451 | * ->sh_offset accordingly. | ||
2452 | */ | ||
2453 | sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset); | ||
2454 | |||
2455 | /* Advance to the next address */ | ||
2456 | curr_load_addr += sechdrs[i].sh_size; | ||
2457 | } else { | ||
2458 | bss_addr = ALIGN(bss_addr, align); | ||
2459 | sechdrs[i].sh_addr = bss_addr; | ||
2460 | bss_addr += sechdrs[i].sh_size; | ||
2461 | } | ||
2462 | } | ||
2463 | |||
2464 | /* Update entry point based on load address of text section */ | ||
2465 | if (entry_sidx >= 0) | ||
2466 | entry += sechdrs[entry_sidx].sh_addr; | ||
2467 | |||
2468 | /* Make kernel jump to purgatory after shutdown */ | ||
2469 | image->start = entry; | ||
2470 | |||
2471 | /* Used later to get/set symbol values */ | ||
2472 | pi->sechdrs = sechdrs; | ||
2473 | |||
2474 | /* | ||
2475 | * Used later to identify which section is purgatory and exclude it | ||
2476 | * from checksumming. | ||
2477 | */ | ||
2478 | pi->purgatory_buf = purgatory_buf; | ||
2479 | return ret; | ||
2480 | out: | ||
2481 | vfree(sechdrs); | ||
2482 | vfree(purgatory_buf); | ||
2483 | return ret; | ||
2484 | } | ||
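The sizing pass in __kexec_load_purgatory() is ordinary relocatable-object layout: SHF_ALLOC PROGBITS sections are packed into one buffer at their individual alignments, NOBITS (.bss) sections are packed separately, and bss_pad glues the two blocks together so the bss block starts on its own alignment. A user-space sketch of the same arithmetic over a hand-rolled section list (the names and sizes are invented):

#include <stdio.h>
#include <stdint.h>

#define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((uint64_t)(a) - 1))

struct sec { const char *name; uint64_t size, align; int nobits; };

int main(void)
{
    struct sec secs[] = {
        { ".text",   0x1a0, 16, 0 },
        { ".rodata",  0x40,  8, 0 },
        { ".data",    0x28,  8, 0 },
        { ".bss",     0x90, 32, 1 },
    };
    uint64_t buf_sz = 0, buf_align = 1, bss_sz = 0, bss_align = 1, bss_pad;
    size_t i, n = sizeof(secs) / sizeof(secs[0]);

    /* First pass: how big do the packed PROGBITS and NOBITS blocks get? */
    for (i = 0; i < n; i++) {
        if (!secs[i].nobits) {
            if (buf_align < secs[i].align)
                buf_align = secs[i].align;
            buf_sz = ALIGN_UP(buf_sz, secs[i].align) + secs[i].size;
        } else {
            if (bss_align < secs[i].align)
                bss_align = secs[i].align;
            bss_sz = ALIGN_UP(bss_sz, secs[i].align) + secs[i].size;
        }
    }
    /* Padding so the bss block starts at its own alignment. */
    bss_pad = ALIGN_UP(buf_sz, bss_align) - buf_sz;

    printf("buf_sz=0x%llx (align %llu) bss_pad=0x%llx bss_sz=0x%llx total=0x%llx\n",
           (unsigned long long)buf_sz, (unsigned long long)buf_align,
           (unsigned long long)bss_pad, (unsigned long long)bss_sz,
           (unsigned long long)(buf_sz + bss_pad + bss_sz));
    return 0;
}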
2485 | |||
2486 | static int kexec_apply_relocations(struct kimage *image) | ||
2487 | { | ||
2488 | int i, ret; | ||
2489 | struct purgatory_info *pi = &image->purgatory_info; | ||
2490 | Elf_Shdr *sechdrs = pi->sechdrs; | ||
2491 | |||
2492 | /* Apply relocations */ | ||
2493 | for (i = 0; i < pi->ehdr->e_shnum; i++) { | ||
2494 | Elf_Shdr *section, *symtab; | ||
2495 | |||
2496 | if (sechdrs[i].sh_type != SHT_RELA && | ||
2497 | sechdrs[i].sh_type != SHT_REL) | ||
2498 | continue; | ||
2499 | |||
2500 | /* | ||
2501 | * For a section of type SHT_RELA/SHT_REL, | ||
2502 | * ->sh_link contains the section header index of the associated | ||
2503 | * symbol table, and ->sh_info contains the section header | ||
2504 | * index of the section to which the relocations apply. | ||
2505 | */ | ||
2506 | if (sechdrs[i].sh_info >= pi->ehdr->e_shnum || | ||
2507 | sechdrs[i].sh_link >= pi->ehdr->e_shnum) | ||
2508 | return -ENOEXEC; | ||
2509 | |||
2510 | section = &sechdrs[sechdrs[i].sh_info]; | ||
2511 | symtab = &sechdrs[sechdrs[i].sh_link]; | ||
2512 | |||
2513 | if (!(section->sh_flags & SHF_ALLOC)) | ||
2514 | continue; | ||
2515 | |||
2516 | /* | ||
2517 | * symtab->sh_link contains the section header index of the associated | ||
2518 | * string table. | ||
2519 | */ | ||
2520 | if (symtab->sh_link >= pi->ehdr->e_shnum) | ||
2521 | /* Invalid section number? */ | ||
2522 | continue; | ||
2523 | |||
2524 | /* | ||
2525 | * The respective architecture needs to provide support for applying | ||
2526 | * relocations of type SHT_RELA/SHT_REL. | ||
2527 | */ | ||
2528 | if (sechdrs[i].sh_type == SHT_RELA) | ||
2529 | ret = arch_kexec_apply_relocations_add(pi->ehdr, | ||
2530 | sechdrs, i); | ||
2531 | else if (sechdrs[i].sh_type == SHT_REL) | ||
2532 | ret = arch_kexec_apply_relocations(pi->ehdr, | ||
2533 | sechdrs, i); | ||
2534 | if (ret) | ||
2535 | return ret; | ||
2536 | } | ||
2537 | |||
2538 | return 0; | ||
2539 | } | ||
2540 | |||
2541 | /* Load relocatable purgatory object and relocate it appropriately */ | ||
2542 | int kexec_load_purgatory(struct kimage *image, unsigned long min, | ||
2543 | unsigned long max, int top_down, | ||
2544 | unsigned long *load_addr) | ||
2545 | { | ||
2546 | struct purgatory_info *pi = &image->purgatory_info; | ||
2547 | int ret; | ||
2548 | |||
2549 | if (kexec_purgatory_size <= 0) | ||
2550 | return -EINVAL; | ||
2551 | |||
2552 | if (kexec_purgatory_size < sizeof(Elf_Ehdr)) | ||
2553 | return -ENOEXEC; | ||
2554 | |||
2555 | pi->ehdr = (Elf_Ehdr *)kexec_purgatory; | ||
2556 | |||
2557 | if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0 | ||
2558 | || pi->ehdr->e_type != ET_REL | ||
2559 | || !elf_check_arch(pi->ehdr) | ||
2560 | || pi->ehdr->e_shentsize != sizeof(Elf_Shdr)) | ||
2561 | return -ENOEXEC; | ||
2562 | |||
2563 | if (pi->ehdr->e_shoff >= kexec_purgatory_size | ||
2564 | || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) > | ||
2565 | kexec_purgatory_size - pi->ehdr->e_shoff)) | ||
2566 | return -ENOEXEC; | ||
2567 | |||
2568 | ret = __kexec_load_purgatory(image, min, max, top_down); | ||
2569 | if (ret) | ||
2570 | return ret; | ||
2571 | |||
2572 | ret = kexec_apply_relocations(image); | ||
2573 | if (ret) | ||
2574 | goto out; | ||
2575 | |||
2576 | *load_addr = pi->purgatory_load_addr; | ||
2577 | return 0; | ||
2578 | out: | ||
2579 | vfree(pi->sechdrs); | ||
2580 | vfree(pi->purgatory_buf); | ||
2581 | return ret; | ||
2582 | } | ||
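The header checks in kexec_load_purgatory() translate almost directly to user space with <elf.h>: reject anything that is not a relocatable object with plausible section headers that fit inside the blob. In the sketch below the EM_X86_64 test is an assumption standing in for the kernel's elf_check_arch(), and the header is fabricated in memory just to exercise the checks.

#include <elf.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

/* Mirror the sanity checks applied to the embedded purgatory blob. */
static int check_reloc_object(const unsigned char *buf, size_t len)
{
    Elf64_Ehdr eh;

    if (len < sizeof(eh))
        return -1;
    memcpy(&eh, buf, sizeof(eh));                    /* avoid alignment games */
    if (memcmp(eh.e_ident, ELFMAG, SELFMAG) != 0 ||  /* ELF magic */
        eh.e_type != ET_REL ||                       /* relocatable object */
        eh.e_machine != EM_X86_64 ||                 /* stand-in for elf_check_arch() */
        eh.e_shentsize != sizeof(Elf64_Shdr))
        return -1;
    if (eh.e_shoff >= len ||                         /* shdr table inside the blob */
        eh.e_shnum * sizeof(Elf64_Shdr) > len - eh.e_shoff)
        return -1;
    return 0;
}

int main(void)
{
    struct { Elf64_Ehdr eh; Elf64_Shdr sh[2]; } blob;

    memset(&blob, 0, sizeof(blob));
    memcpy(blob.eh.e_ident, ELFMAG, SELFMAG);
    blob.eh.e_ident[EI_CLASS] = ELFCLASS64;
    blob.eh.e_type = ET_REL;
    blob.eh.e_machine = EM_X86_64;
    blob.eh.e_shentsize = sizeof(Elf64_Shdr);
    blob.eh.e_shoff = sizeof(Elf64_Ehdr);
    blob.eh.e_shnum = 2;

    printf("fabricated purgatory header %s\n",
           check_reloc_object((unsigned char *)&blob, sizeof(blob)) ?
           "rejected" : "accepted");
    return 0;
}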
2583 | |||
2584 | static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi, | ||
2585 | const char *name) | ||
2586 | { | ||
2587 | Elf_Sym *syms; | ||
2588 | Elf_Shdr *sechdrs; | ||
2589 | Elf_Ehdr *ehdr; | ||
2590 | int i, k; | ||
2591 | const char *strtab; | ||
2592 | |||
2593 | if (!pi->sechdrs || !pi->ehdr) | ||
2594 | return NULL; | ||
2595 | |||
2596 | sechdrs = pi->sechdrs; | ||
2597 | ehdr = pi->ehdr; | ||
2598 | |||
2599 | for (i = 0; i < ehdr->e_shnum; i++) { | ||
2600 | if (sechdrs[i].sh_type != SHT_SYMTAB) | ||
2601 | continue; | ||
2602 | |||
2603 | if (sechdrs[i].sh_link >= ehdr->e_shnum) | ||
2604 | /* Invalid strtab section number */ | ||
2605 | continue; | ||
2606 | strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset; | ||
2607 | syms = (Elf_Sym *)sechdrs[i].sh_offset; | ||
2608 | |||
2609 | /* Go through symbols for a match */ | ||
2610 | for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) { | ||
2611 | if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL) | ||
2612 | continue; | ||
2613 | |||
2614 | if (strcmp(strtab + syms[k].st_name, name) != 0) | ||
2615 | continue; | ||
2616 | |||
2617 | if (syms[k].st_shndx == SHN_UNDEF || | ||
2618 | syms[k].st_shndx >= ehdr->e_shnum) { | ||
2619 | pr_debug("Symbol: %s has bad section index %d.\n", | ||
2620 | name, syms[k].st_shndx); | ||
2621 | return NULL; | ||
2622 | } | ||
2623 | |||
2624 | /* Found the symbol we are looking for */ | ||
2625 | return &syms[k]; | ||
2626 | } | ||
2627 | } | ||
2628 | |||
2629 | return NULL; | ||
2630 | } | ||
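kexec_purgatory_find_symbol() is a plain ELF walk: iterate SHT_SYMTAB sections, resolve names through the linked string table, and accept only defined STB_GLOBAL symbols. The same walk in user space over a hand-built two-entry symbol table; <elf.h> supplies the types and the ST_BIND/ST_INFO macros, and the symbol names and values are invented.

#include <elf.h>
#include <stdio.h>
#include <string.h>

/* Find a defined STB_GLOBAL symbol by name; NULL if absent. */
static const Elf64_Sym *find_global(const Elf64_Sym *syms, size_t nsyms,
                                    const char *strtab, const char *name)
{
    size_t k;

    for (k = 0; k < nsyms; k++) {
        if (ELF64_ST_BIND(syms[k].st_info) != STB_GLOBAL)
            continue;
        if (strcmp(strtab + syms[k].st_name, name) != 0)
            continue;
        if (syms[k].st_shndx == SHN_UNDEF)      /* declared, not defined */
            return NULL;
        return &syms[k];
    }
    return NULL;
}

int main(void)
{
    /* Name offsets: "sha_regions" starts at 1, "sha256_digest" at 13. */
    static const char strtab[] = "\0sha_regions\0sha256_digest";
    Elf64_Sym syms[2];
    const Elf64_Sym *hit;

    memset(syms, 0, sizeof(syms));
    syms[0].st_name = 1;                         /* "sha_regions" */
    syms[0].st_info = ELF64_ST_INFO(STB_GLOBAL, STT_OBJECT);
    syms[0].st_shndx = 1;                        /* lives in section 1 */
    syms[0].st_value = 0x40;
    syms[1].st_name = 13;                        /* "sha256_digest" */
    syms[1].st_info = ELF64_ST_INFO(STB_GLOBAL, STT_OBJECT);
    syms[1].st_shndx = SHN_UNDEF;                /* undefined on purpose */

    hit = find_global(syms, 2, strtab, "sha_regions");
    printf("sha_regions %s, value 0x%llx\n", hit ? "found" : "missing",
           hit ? (unsigned long long)hit->st_value : 0ULL);
    return 0;
}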
2631 | |||
2632 | void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name) | ||
2633 | { | ||
2634 | struct purgatory_info *pi = &image->purgatory_info; | ||
2635 | Elf_Sym *sym; | ||
2636 | Elf_Shdr *sechdr; | ||
2637 | |||
2638 | sym = kexec_purgatory_find_symbol(pi, name); | ||
2639 | if (!sym) | ||
2640 | return ERR_PTR(-EINVAL); | ||
2641 | |||
2642 | sechdr = &pi->sechdrs[sym->st_shndx]; | ||
2643 | |||
2644 | /* | ||
2645 | * Returns the address where symbol will finally be loaded after | ||
2646 | * kexec_load_segment() | ||
2647 | */ | ||
2648 | return (void *)(sechdr->sh_addr + sym->st_value); | ||
2649 | } | ||
2650 | |||
2651 | /* | ||
2652 | * Get or set value of a symbol. If "get_value" is true, symbol value is | ||
2653 | * returned in buf otherwise symbol value is set based on value in buf. | ||
2654 | */ | ||
2655 | int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, | ||
2656 | void *buf, unsigned int size, bool get_value) | ||
2657 | { | ||
2658 | Elf_Sym *sym; | ||
2659 | Elf_Shdr *sechdrs; | ||
2660 | struct purgatory_info *pi = &image->purgatory_info; | ||
2661 | char *sym_buf; | ||
2662 | |||
2663 | sym = kexec_purgatory_find_symbol(pi, name); | ||
2664 | if (!sym) | ||
2665 | return -EINVAL; | ||
2666 | |||
2667 | if (sym->st_size != size) { | ||
2668 | pr_err("symbol %s size mismatch: expected %lu actual %u\n", | ||
2669 | name, (unsigned long)sym->st_size, size); | ||
2670 | return -EINVAL; | ||
2671 | } | ||
2672 | |||
2673 | sechdrs = pi->sechdrs; | ||
2674 | |||
2675 | if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) { | ||
2676 | pr_err("symbol %s is in a bss section. Cannot %s\n", name, | ||
2677 | get_value ? "get" : "set"); | ||
2678 | return -EINVAL; | ||
2679 | } | ||
2680 | |||
2681 | sym_buf = (unsigned char *)sechdrs[sym->st_shndx].sh_offset + | ||
2682 | sym->st_value; | ||
2683 | |||
2684 | if (get_value) | ||
2685 | memcpy((void *)buf, sym_buf, size); | ||
2686 | else | ||
2687 | memcpy((void *)sym_buf, buf, size); | ||
2688 | |||
2689 | return 0; | ||
2690 | } | ||
2691 | #endif /* CONFIG_KEXEC_FILE */ | ||
2692 | |||
2693 | /* | ||
2694 | * Move into place and start executing a preloaded standalone | ||
2695 | * executable. If nothing was preloaded return an error. | ||
2696 | */ | ||
2697 | int kernel_kexec(void) | ||
2698 | { | ||
2699 | int error = 0; | ||
2700 | |||
2701 | if (!mutex_trylock(&kexec_mutex)) | ||
2702 | return -EBUSY; | ||
2703 | if (!kexec_image) { | ||
2704 | error = -EINVAL; | ||
2705 | goto Unlock; | ||
2706 | } | ||
2707 | |||
2708 | #ifdef CONFIG_KEXEC_JUMP | ||
2709 | if (kexec_image->preserve_context) { | ||
2710 | lock_system_sleep(); | ||
2711 | pm_prepare_console(); | ||
2712 | error = freeze_processes(); | ||
2713 | if (error) { | ||
2714 | error = -EBUSY; | ||
2715 | goto Restore_console; | ||
2716 | } | ||
2717 | suspend_console(); | ||
2718 | error = dpm_suspend_start(PMSG_FREEZE); | ||
2719 | if (error) | ||
2720 | goto Resume_console; | ||
2721 | /* At this point, dpm_suspend_start() has been called, | ||
2722 | * but *not* dpm_suspend_end(). We *must* call | ||
2723 | * dpm_suspend_end() now. Otherwise, drivers for | ||
2724 | * some devices (e.g. interrupt controllers) become | ||
2725 | * desynchronized with the actual state of the | ||
2726 | * hardware at resume time, and evil weirdness ensues. | ||
2727 | */ | ||
2728 | error = dpm_suspend_end(PMSG_FREEZE); | ||
2729 | if (error) | ||
2730 | goto Resume_devices; | ||
2731 | error = disable_nonboot_cpus(); | ||
2732 | if (error) | ||
2733 | goto Enable_cpus; | ||
2734 | local_irq_disable(); | ||
2735 | error = syscore_suspend(); | ||
2736 | if (error) | ||
2737 | goto Enable_irqs; | ||
2738 | } else | ||
2739 | #endif | ||
2740 | { | ||
2741 | kexec_in_progress = true; | ||
2742 | kernel_restart_prepare(NULL); | ||
2743 | migrate_to_reboot_cpu(); | ||
2744 | |||
2745 | /* | ||
2746 | * migrate_to_reboot_cpu() disables CPU hotplug assuming that | ||
2747 | * no further code needs to use CPU hotplug (which is true in | ||
2748 | * the reboot case). However, the kexec path depends on using | ||
2749 | * CPU hotplug again; so re-enable it here. | ||
2750 | */ | ||
2751 | cpu_hotplug_enable(); | ||
2752 | pr_emerg("Starting new kernel\n"); | ||
2753 | machine_shutdown(); | ||
2754 | } | ||
2755 | |||
2756 | machine_kexec(kexec_image); | ||
2757 | |||
2758 | #ifdef CONFIG_KEXEC_JUMP | ||
2759 | if (kexec_image->preserve_context) { | ||
2760 | syscore_resume(); | ||
2761 | Enable_irqs: | ||
2762 | local_irq_enable(); | ||
2763 | Enable_cpus: | ||
2764 | enable_nonboot_cpus(); | ||
2765 | dpm_resume_start(PMSG_RESTORE); | ||
2766 | Resume_devices: | ||
2767 | dpm_resume_end(PMSG_RESTORE); | ||
2768 | Resume_console: | ||
2769 | resume_console(); | ||
2770 | thaw_processes(); | ||
2771 | Restore_console: | ||
2772 | pm_restore_console(); | ||
2773 | unlock_system_sleep(); | ||
2774 | } | ||
2775 | #endif | ||
2776 | |||
2777 | Unlock: | ||
2778 | mutex_unlock(&kexec_mutex); | ||
2779 | return error; | ||
2780 | } | ||
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c new file mode 100644 index 000000000000..201b45327804 --- /dev/null +++ b/kernel/kexec_core.c | |||
@@ -0,0 +1,1534 @@ | |||
1 | /* | ||
2 | * kexec.c - kexec system call core code. | ||
3 | * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> | ||
4 | * | ||
5 | * This source code is licensed under the GNU General Public License, | ||
6 | * Version 2. See the file COPYING for more details. | ||
7 | */ | ||
8 | |||
9 | #define pr_fmt(fmt) "kexec: " fmt | ||
10 | |||
11 | #include <linux/capability.h> | ||
12 | #include <linux/mm.h> | ||
13 | #include <linux/file.h> | ||
14 | #include <linux/slab.h> | ||
15 | #include <linux/fs.h> | ||
16 | #include <linux/kexec.h> | ||
17 | #include <linux/mutex.h> | ||
18 | #include <linux/list.h> | ||
19 | #include <linux/highmem.h> | ||
20 | #include <linux/syscalls.h> | ||
21 | #include <linux/reboot.h> | ||
22 | #include <linux/ioport.h> | ||
23 | #include <linux/hardirq.h> | ||
24 | #include <linux/elf.h> | ||
25 | #include <linux/elfcore.h> | ||
26 | #include <linux/utsname.h> | ||
27 | #include <linux/numa.h> | ||
28 | #include <linux/suspend.h> | ||
29 | #include <linux/device.h> | ||
30 | #include <linux/freezer.h> | ||
31 | #include <linux/pm.h> | ||
32 | #include <linux/cpu.h> | ||
33 | #include <linux/uaccess.h> | ||
34 | #include <linux/io.h> | ||
35 | #include <linux/console.h> | ||
36 | #include <linux/vmalloc.h> | ||
37 | #include <linux/swap.h> | ||
38 | #include <linux/syscore_ops.h> | ||
39 | #include <linux/compiler.h> | ||
40 | #include <linux/hugetlb.h> | ||
41 | |||
42 | #include <asm/page.h> | ||
43 | #include <asm/sections.h> | ||
44 | |||
45 | #include <crypto/hash.h> | ||
46 | #include <crypto/sha.h> | ||
47 | #include "kexec_internal.h" | ||
48 | |||
49 | DEFINE_MUTEX(kexec_mutex); | ||
50 | |||
51 | /* Per cpu memory for storing cpu states in case of system crash. */ | ||
52 | note_buf_t __percpu *crash_notes; | ||
53 | |||
54 | /* vmcoreinfo stuff */ | ||
55 | static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; | ||
56 | u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; | ||
57 | size_t vmcoreinfo_size; | ||
58 | size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); | ||
59 | |||
60 | /* Flag to indicate we are going to kexec a new kernel */ | ||
61 | bool kexec_in_progress = false; | ||
62 | |||
63 | |||
64 | /* Location of the reserved area for the crash kernel */ | ||
65 | struct resource crashk_res = { | ||
66 | .name = "Crash kernel", | ||
67 | .start = 0, | ||
68 | .end = 0, | ||
69 | .flags = IORESOURCE_BUSY | IORESOURCE_MEM | ||
70 | }; | ||
71 | struct resource crashk_low_res = { | ||
72 | .name = "Crash kernel", | ||
73 | .start = 0, | ||
74 | .end = 0, | ||
75 | .flags = IORESOURCE_BUSY | IORESOURCE_MEM | ||
76 | }; | ||
77 | |||
78 | int kexec_should_crash(struct task_struct *p) | ||
79 | { | ||
80 | /* | ||
81 | * If crash_kexec_post_notifiers is enabled, don't run | ||
82 | * crash_kexec() here yet, which must be run after panic | ||
83 | * notifiers in panic(). | ||
84 | */ | ||
85 | if (crash_kexec_post_notifiers) | ||
86 | return 0; | ||
87 | /* | ||
88 | * There are 4 panic() calls in do_exit() path, each of which | ||
89 | * corresponds to each of these 4 conditions. | ||
90 | */ | ||
91 | if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops) | ||
92 | return 1; | ||
93 | return 0; | ||
94 | } | ||
95 | |||
96 | /* | ||
97 | * When kexec transitions to the new kernel there is a one-to-one | ||
98 | * mapping between physical and virtual addresses. On processors | ||
99 | * where you can disable the MMU this is trivial, and easy. For | ||
100 | * others it is still a simple predictable page table to setup. | ||
101 | * | ||
102 | * In that environment kexec copies the new kernel to its final | ||
103 | * resting place. This means I can only support memory whose | ||
104 | * physical address can fit in an unsigned long. In particular | ||
105 | * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled. | ||
106 | * If the assembly stub has more restrictive requirements | ||
107 | * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be | ||
108 | * defined more restrictively in <asm/kexec.h>. | ||
109 | * | ||
110 | * The code for the transition from the current kernel to the | ||
111 | * new kernel is placed in the control_code_buffer, whose size | ||
112 | * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single | ||
113 | * page of memory is necessary, but some architectures require more. | ||
114 | * Because this memory must be identity mapped in the transition from | ||
115 | * virtual to physical addresses it must live in the range | ||
116 | * 0 - TASK_SIZE, as only the user space mappings are arbitrarily | ||
117 | * modifiable. | ||
118 | * | ||
119 | * The assembly stub in the control code buffer is passed a linked list | ||
120 | * of descriptor pages detailing the source pages of the new kernel, | ||
121 | * and the destination addresses of those source pages. As this data | ||
122 | * structure is not used in the context of the current OS, it must | ||
123 | * be self-contained. | ||
124 | * | ||
125 | * The code has been made to work with highmem pages and will use a | ||
126 | * destination page in its final resting place (if it happens | ||
127 | * to allocate it). The end product of this is that most of the | ||
128 | * physical address space, and most of RAM can be used. | ||
129 | * | ||
130 | * Future directions include: | ||
131 | * - allocating a page table with the control code buffer identity | ||
132 | * mapped, to simplify machine_kexec and make kexec_on_panic more | ||
133 | * reliable. | ||
134 | */ | ||
135 | |||
136 | /* | ||
137 | * KIMAGE_NO_DEST is an impossible destination address, used for | ||
138 | * allocating pages whose destination address we do not care about. | ||
139 | */ | ||
140 | #define KIMAGE_NO_DEST (-1UL) | ||
141 | |||
142 | static struct page *kimage_alloc_page(struct kimage *image, | ||
143 | gfp_t gfp_mask, | ||
144 | unsigned long dest); | ||
145 | |||
146 | int sanity_check_segment_list(struct kimage *image) | ||
147 | { | ||
148 | int result, i; | ||
149 | unsigned long nr_segments = image->nr_segments; | ||
150 | |||
151 | /* | ||
152 | * Verify we have good destination addresses. The caller is | ||
153 | * responsible for making certain we don't attempt to load | ||
154 | * the new image into invalid or reserved areas of RAM. This | ||
155 | * just verifies it is an address we can use. | ||
156 | * | ||
157 | * Since the kernel does everything in page size chunks ensure | ||
158 | * the destination addresses are page aligned. Too many | ||
159 | * special cases crop of when we don't do this. The most | ||
160 | * insidious is getting overlapping destination addresses | ||
161 | * simply because addresses are changed to page size | ||
162 | * granularity. | ||
163 | */ | ||
164 | result = -EADDRNOTAVAIL; | ||
165 | for (i = 0; i < nr_segments; i++) { | ||
166 | unsigned long mstart, mend; | ||
167 | |||
168 | mstart = image->segment[i].mem; | ||
169 | mend = mstart + image->segment[i].memsz; | ||
170 | if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK)) | ||
171 | return result; | ||
172 | if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) | ||
173 | return result; | ||
174 | } | ||
175 | |||
176 | /* Verify our destination addresses do not overlap. | ||
177 | * If we allowed overlapping destination addresses, | ||
178 | * very weird things can happen with no easy | ||
179 | * explanation as one segment overwrites another. | ||
180 | */ | ||
181 | result = -EINVAL; | ||
182 | for (i = 0; i < nr_segments; i++) { | ||
183 | unsigned long mstart, mend; | ||
184 | unsigned long j; | ||
185 | |||
186 | mstart = image->segment[i].mem; | ||
187 | mend = mstart + image->segment[i].memsz; | ||
188 | for (j = 0; j < i; j++) { | ||
189 | unsigned long pstart, pend; | ||
190 | |||
191 | pstart = image->segment[j].mem; | ||
192 | pend = pstart + image->segment[j].memsz; | ||
193 | /* Do the segments overlap ? */ | ||
194 | if ((mend > pstart) && (mstart < pend)) | ||
195 | return result; | ||
196 | } | ||
197 | } | ||
198 | |||
199 | /* Ensure our buffer sizes do not exceed | ||
200 | * our memory sizes. This should always be the case, | ||
201 | * and it is easier to check up front than to be surprised | ||
202 | * later on. | ||
203 | */ | ||
204 | result = -EINVAL; | ||
205 | for (i = 0; i < nr_segments; i++) { | ||
206 | if (image->segment[i].bufsz > image->segment[i].memsz) | ||
207 | return result; | ||
208 | } | ||
209 | |||
210 | /* | ||
211 | * Verify we have good destination addresses. Normally | ||
212 | * the caller is responsible for making certain we don't | ||
213 | * attempt to load the new image into invalid or reserved | ||
214 | * areas of RAM. But crash kernels are preloaded into a | ||
215 | * reserved area of ram. We must ensure the addresses | ||
216 | * are in the reserved area otherwise preloading the | ||
217 | * kernel could corrupt things. | ||
218 | */ | ||
219 | |||
220 | if (image->type == KEXEC_TYPE_CRASH) { | ||
221 | result = -EADDRNOTAVAIL; | ||
222 | for (i = 0; i < nr_segments; i++) { | ||
223 | unsigned long mstart, mend; | ||
224 | |||
225 | mstart = image->segment[i].mem; | ||
226 | mend = mstart + image->segment[i].memsz - 1; | ||
227 | /* Ensure we are within the crash kernel limits */ | ||
228 | if ((mstart < crashk_res.start) || | ||
229 | (mend > crashk_res.end)) | ||
230 | return result; | ||
231 | } | ||
232 | } | ||
233 | |||
234 | return 0; | ||
235 | } | ||
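The middle pass of sanity_check_segment_list() uses the standard half-open interval test: two segments collide exactly when each starts before the other ends. A compact user-space version of the alignment and overlap checks, over invented segment lists:

#include <stdint.h>
#include <stdio.h>

#define PAGE_MASK (~4095UL)

struct seg { uint64_t mem; uint64_t memsz; };

/* 0 if the list is sane, -1 on misalignment or overlap. */
static int sanity_check(const struct seg *s, int n)
{
    int i, j;

    for (i = 0; i < n; i++) {
        uint64_t mstart = s[i].mem, mend = mstart + s[i].memsz;

        if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
            return -1;                          /* not page aligned */
        for (j = 0; j < i; j++) {
            uint64_t pstart = s[j].mem, pend = pstart + s[j].memsz;

            if (mend > pstart && mstart < pend)
                return -1;                      /* ranges intersect */
        }
    }
    return 0;
}

int main(void)
{
    struct seg ok[]  = { { 0x100000, 0x2000 }, { 0x102000, 0x1000 } };
    struct seg bad[] = { { 0x100000, 0x2000 }, { 0x101000, 0x1000 } };

    printf("ok list: %d, bad list: %d\n",
           sanity_check(ok, 2), sanity_check(bad, 2));
    return 0;
}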
236 | |||
237 | struct kimage *do_kimage_alloc_init(void) | ||
238 | { | ||
239 | struct kimage *image; | ||
240 | |||
241 | /* Allocate a controlling structure */ | ||
242 | image = kzalloc(sizeof(*image), GFP_KERNEL); | ||
243 | if (!image) | ||
244 | return NULL; | ||
245 | |||
246 | image->head = 0; | ||
247 | image->entry = &image->head; | ||
248 | image->last_entry = &image->head; | ||
249 | image->control_page = ~0; /* By default this does not apply */ | ||
250 | image->type = KEXEC_TYPE_DEFAULT; | ||
251 | |||
252 | /* Initialize the list of control pages */ | ||
253 | INIT_LIST_HEAD(&image->control_pages); | ||
254 | |||
255 | /* Initialize the list of destination pages */ | ||
256 | INIT_LIST_HEAD(&image->dest_pages); | ||
257 | |||
258 | /* Initialize the list of unusable pages */ | ||
259 | INIT_LIST_HEAD(&image->unusable_pages); | ||
260 | |||
261 | return image; | ||
262 | } | ||
263 | |||
264 | int kimage_is_destination_range(struct kimage *image, | ||
265 | unsigned long start, | ||
266 | unsigned long end) | ||
267 | { | ||
268 | unsigned long i; | ||
269 | |||
270 | for (i = 0; i < image->nr_segments; i++) { | ||
271 | unsigned long mstart, mend; | ||
272 | |||
273 | mstart = image->segment[i].mem; | ||
274 | mend = mstart + image->segment[i].memsz; | ||
275 | if ((end > mstart) && (start < mend)) | ||
276 | return 1; | ||
277 | } | ||
278 | |||
279 | return 0; | ||
280 | } | ||
281 | |||
282 | static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order) | ||
283 | { | ||
284 | struct page *pages; | ||
285 | |||
286 | pages = alloc_pages(gfp_mask, order); | ||
287 | if (pages) { | ||
288 | unsigned int count, i; | ||
289 | |||
290 | pages->mapping = NULL; | ||
291 | set_page_private(pages, order); | ||
292 | count = 1 << order; | ||
293 | for (i = 0; i < count; i++) | ||
294 | SetPageReserved(pages + i); | ||
295 | } | ||
296 | |||
297 | return pages; | ||
298 | } | ||
299 | |||
300 | static void kimage_free_pages(struct page *page) | ||
301 | { | ||
302 | unsigned int order, count, i; | ||
303 | |||
304 | order = page_private(page); | ||
305 | count = 1 << order; | ||
306 | for (i = 0; i < count; i++) | ||
307 | ClearPageReserved(page + i); | ||
308 | __free_pages(page, order); | ||
309 | } | ||
310 | |||
311 | void kimage_free_page_list(struct list_head *list) | ||
312 | { | ||
313 | struct list_head *pos, *next; | ||
314 | |||
315 | list_for_each_safe(pos, next, list) { | ||
316 | struct page *page; | ||
317 | |||
318 | page = list_entry(pos, struct page, lru); | ||
319 | list_del(&page->lru); | ||
320 | kimage_free_pages(page); | ||
321 | } | ||
322 | } | ||
323 | |||
324 | static struct page *kimage_alloc_normal_control_pages(struct kimage *image, | ||
325 | unsigned int order) | ||
326 | { | ||
327 | /* Control pages are special, they are the intermediaries | ||
328 | * that are needed while we copy the rest of the pages | ||
329 | * to their final resting place. As such they must | ||
330 | * not conflict with either the destination addresses | ||
331 | * or memory the kernel is already using. | ||
332 | * | ||
333 | * The only case where we really need more than one of | ||
334 | * these is for architectures where we cannot disable | ||
335 | * the MMU and must instead generate an identity mapped | ||
336 | * page table for all of the memory. | ||
337 | * | ||
338 | * At worst this runs in O(N) of the image size. | ||
339 | */ | ||
340 | struct list_head extra_pages; | ||
341 | struct page *pages; | ||
342 | unsigned int count; | ||
343 | |||
344 | count = 1 << order; | ||
345 | INIT_LIST_HEAD(&extra_pages); | ||
346 | |||
347 | /* Loop while I can allocate a page and the page allocated | ||
348 | * is a destination page. | ||
349 | */ | ||
350 | do { | ||
351 | unsigned long pfn, epfn, addr, eaddr; | ||
352 | |||
353 | pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order); | ||
354 | if (!pages) | ||
355 | break; | ||
356 | pfn = page_to_pfn(pages); | ||
357 | epfn = pfn + count; | ||
358 | addr = pfn << PAGE_SHIFT; | ||
359 | eaddr = epfn << PAGE_SHIFT; | ||
360 | if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) || | ||
361 | kimage_is_destination_range(image, addr, eaddr)) { | ||
362 | list_add(&pages->lru, &extra_pages); | ||
363 | pages = NULL; | ||
364 | } | ||
365 | } while (!pages); | ||
366 | |||
367 | if (pages) { | ||
368 | /* Remember the allocated page... */ | ||
369 | list_add(&pages->lru, &image->control_pages); | ||
370 | |||
371 | /* Because the page is already in its destination | ||
372 | * location we will never allocate another page at | ||
373 | * that address. Therefore kimage_alloc_pages | ||
374 | * will not return it (again) and we don't need | ||
375 | * to give it an entry in image->segment[]. | ||
376 | */ | ||
377 | } | ||
378 | /* Deal with the destination pages I have inadvertently allocated. | ||
379 | * | ||
380 | * Ideally I would convert multi-page allocations into single | ||
381 | * page allocations, and add everything to image->dest_pages. | ||
382 | * | ||
383 | * For now it is simpler to just free the pages. | ||
384 | */ | ||
385 | kimage_free_page_list(&extra_pages); | ||
386 | |||
387 | return pages; | ||
388 | } | ||
389 | |||
390 | static struct page *kimage_alloc_crash_control_pages(struct kimage *image, | ||
391 | unsigned int order) | ||
392 | { | ||
393 | /* Control pages are special, they are the intermediaries | ||
394 | * that are needed while we copy the rest of the pages | ||
395 | * to their final resting place. As such they must | ||
396 | * not conflict with either the destination addresses | ||
397 | * or memory the kernel is already using. | ||
398 | * | ||
399 | * Control pages are also the only pages we must allocate | ||
400 | * when loading a crash kernel. All of the other pages | ||
401 | * are specified by the segments and we just memcpy | ||
402 | * into them directly. | ||
403 | * | ||
404 | * The only case where we really need more than one of | ||
405 | * these is for architectures where we cannot disable | ||
406 | * the MMU and must instead generate an identity mapped | ||
407 | * page table for all of the memory. | ||
408 | * | ||
409 | * Given the low demand this implements a very simple | ||
410 | * allocator that finds the first hole of the appropriate | ||
411 | * size in the reserved memory region, and allocates all | ||
412 | * of the memory up to and including the hole. | ||
413 | */ | ||
414 | unsigned long hole_start, hole_end, size; | ||
415 | struct page *pages; | ||
416 | |||
417 | pages = NULL; | ||
418 | size = (1 << order) << PAGE_SHIFT; | ||
419 | hole_start = (image->control_page + (size - 1)) & ~(size - 1); | ||
420 | hole_end = hole_start + size - 1; | ||
421 | while (hole_end <= crashk_res.end) { | ||
422 | unsigned long i; | ||
423 | |||
424 | if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT) | ||
425 | break; | ||
426 | /* See if I overlap any of the segments */ | ||
427 | for (i = 0; i < image->nr_segments; i++) { | ||
428 | unsigned long mstart, mend; | ||
429 | |||
430 | mstart = image->segment[i].mem; | ||
431 | mend = mstart + image->segment[i].memsz - 1; | ||
432 | if ((hole_end >= mstart) && (hole_start <= mend)) { | ||
433 | /* Advance the hole to the end of the segment */ | ||
434 | hole_start = (mend + (size - 1)) & ~(size - 1); | ||
435 | hole_end = hole_start + size - 1; | ||
436 | break; | ||
437 | } | ||
438 | } | ||
439 | /* If I don't overlap any segments I have found my hole! */ | ||
440 | if (i == image->nr_segments) { | ||
441 | pages = pfn_to_page(hole_start >> PAGE_SHIFT); | ||
442 | image->control_page = hole_end; | ||
443 | break; | ||
444 | } | ||
445 | } | ||
446 | |||
447 | return pages; | ||
448 | } | ||
449 | |||
450 | |||
451 | struct page *kimage_alloc_control_pages(struct kimage *image, | ||
452 | unsigned int order) | ||
453 | { | ||
454 | struct page *pages = NULL; | ||
455 | |||
456 | switch (image->type) { | ||
457 | case KEXEC_TYPE_DEFAULT: | ||
458 | pages = kimage_alloc_normal_control_pages(image, order); | ||
459 | break; | ||
460 | case KEXEC_TYPE_CRASH: | ||
461 | pages = kimage_alloc_crash_control_pages(image, order); | ||
462 | break; | ||
463 | } | ||
464 | |||
465 | return pages; | ||
466 | } | ||
467 | |||
468 | static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) | ||
469 | { | ||
470 | if (*image->entry != 0) | ||
471 | image->entry++; | ||
472 | |||
473 | if (image->entry == image->last_entry) { | ||
474 | kimage_entry_t *ind_page; | ||
475 | struct page *page; | ||
476 | |||
477 | page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST); | ||
478 | if (!page) | ||
479 | return -ENOMEM; | ||
480 | |||
481 | ind_page = page_address(page); | ||
482 | *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION; | ||
483 | image->entry = ind_page; | ||
484 | image->last_entry = ind_page + | ||
485 | ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1); | ||
486 | } | ||
487 | *image->entry = entry; | ||
488 | image->entry++; | ||
489 | *image->entry = 0; | ||
490 | |||
491 | return 0; | ||
492 | } | ||
493 | |||
494 | static int kimage_set_destination(struct kimage *image, | ||
495 | unsigned long destination) | ||
496 | { | ||
497 | int result; | ||
498 | |||
499 | destination &= PAGE_MASK; | ||
500 | result = kimage_add_entry(image, destination | IND_DESTINATION); | ||
501 | |||
502 | return result; | ||
503 | } | ||
504 | |||
505 | |||
506 | static int kimage_add_page(struct kimage *image, unsigned long page) | ||
507 | { | ||
508 | int result; | ||
509 | |||
510 | page &= PAGE_MASK; | ||
511 | result = kimage_add_entry(image, page | IND_SOURCE); | ||
512 | |||
513 | return result; | ||
514 | } | ||
515 | |||
516 | |||
517 | static void kimage_free_extra_pages(struct kimage *image) | ||
518 | { | ||
519 | /* Walk through and free any extra destination pages I may have */ | ||
520 | kimage_free_page_list(&image->dest_pages); | ||
521 | |||
522 | /* Walk through and free any unusable pages I have cached */ | ||
523 | kimage_free_page_list(&image->unusable_pages); | ||
524 | |||
525 | } | ||
526 | void kimage_terminate(struct kimage *image) | ||
527 | { | ||
528 | if (*image->entry != 0) | ||
529 | image->entry++; | ||
530 | |||
531 | *image->entry = IND_DONE; | ||
532 | } | ||
533 | |||
534 | #define for_each_kimage_entry(image, ptr, entry) \ | ||
535 | for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \ | ||
536 | ptr = (entry & IND_INDIRECTION) ? \ | ||
537 | phys_to_virt((entry & PAGE_MASK)) : ptr + 1) | ||
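kimage_add_entry() above and the for_each_kimage_entry() walker just defined are the two ends of the same structure: page-aligned addresses carrying the IND_* flags in their low bits, where IND_DESTINATION sets where the following sources land, each IND_SOURCE supplies one page, IND_INDIRECTION links to the next page of entries, and IND_DONE terminates the list. The sketch below rebuilds that encoding in user space; the source and destination values are fake page-aligned numbers, while the indirection link is a real malloc pointer, so only the flag bits are masked off where the kernel masks with PAGE_MASK and converts physical to virtual.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Same low-bit flag values the kernel uses on otherwise page-aligned entries. */
#define IND_DESTINATION 0x1
#define IND_INDIRECTION 0x2
#define IND_DONE        0x4
#define IND_SOURCE      0x8
#define FLAG_MASK       0xFUL

typedef uintptr_t kentry_t;

int main(void)
{
    /* Two small "indirection pages"; malloc alignment keeps the low bits free. */
    kentry_t *page1 = calloc(4, sizeof(kentry_t));
    kentry_t *page2 = calloc(4, sizeof(kentry_t));
    kentry_t *ptr, entry;
    uintptr_t dest = 0;

    if (!page1 || !page2)
        return 1;

    /* Build: a destination, one source page, then a link to the second page. */
    page1[0] = 0x100000 | IND_DESTINATION;
    page1[1] = 0x7f0000 | IND_SOURCE;
    page1[2] = (uintptr_t)page2 | IND_INDIRECTION;

    /* Second page: one more source, then the terminator. */
    page2[0] = 0x7f3000 | IND_SOURCE;
    page2[1] = IND_DONE;

    /* Walk, the way for_each_kimage_entry() does. */
    for (ptr = page1; (entry = *ptr) && !(entry & IND_DONE);
         ptr = (entry & IND_INDIRECTION) ?
               (kentry_t *)(entry & ~FLAG_MASK) : ptr + 1) {
        if (entry & IND_DESTINATION)
            dest = entry & ~FLAG_MASK;
        else if (entry & IND_SOURCE) {
            printf("copy page 0x%lx -> 0x%lx\n",
                   (unsigned long)(entry & ~FLAG_MASK), (unsigned long)dest);
            dest += 4096;
        }
    }
    free(page1);
    free(page2);
    return 0;
}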
538 | |||
539 | static void kimage_free_entry(kimage_entry_t entry) | ||
540 | { | ||
541 | struct page *page; | ||
542 | |||
543 | page = pfn_to_page(entry >> PAGE_SHIFT); | ||
544 | kimage_free_pages(page); | ||
545 | } | ||
546 | |||
547 | void kimage_free(struct kimage *image) | ||
548 | { | ||
549 | kimage_entry_t *ptr, entry; | ||
550 | kimage_entry_t ind = 0; | ||
551 | |||
552 | if (!image) | ||
553 | return; | ||
554 | |||
555 | kimage_free_extra_pages(image); | ||
556 | for_each_kimage_entry(image, ptr, entry) { | ||
557 | if (entry & IND_INDIRECTION) { | ||
558 | /* Free the previous indirection page */ | ||
559 | if (ind & IND_INDIRECTION) | ||
560 | kimage_free_entry(ind); | ||
561 | /* Save this indirection page until we are | ||
562 | * done with it. | ||
563 | */ | ||
564 | ind = entry; | ||
565 | } else if (entry & IND_SOURCE) | ||
566 | kimage_free_entry(entry); | ||
567 | } | ||
568 | /* Free the final indirection page */ | ||
569 | if (ind & IND_INDIRECTION) | ||
570 | kimage_free_entry(ind); | ||
571 | |||
572 | /* Handle any machine specific cleanup */ | ||
573 | machine_kexec_cleanup(image); | ||
574 | |||
575 | /* Free the kexec control pages... */ | ||
576 | kimage_free_page_list(&image->control_pages); | ||
577 | |||
578 | /* | ||
579 | * Free up any temporary buffers allocated. This might hit if | ||
580 | * error occurred much later after buffer allocation. | ||
581 | */ | ||
582 | if (image->file_mode) | ||
583 | kimage_file_post_load_cleanup(image); | ||
584 | |||
585 | kfree(image); | ||
586 | } | ||
587 | |||
588 | static kimage_entry_t *kimage_dst_used(struct kimage *image, | ||
589 | unsigned long page) | ||
590 | { | ||
591 | kimage_entry_t *ptr, entry; | ||
592 | unsigned long destination = 0; | ||
593 | |||
594 | for_each_kimage_entry(image, ptr, entry) { | ||
595 | if (entry & IND_DESTINATION) | ||
596 | destination = entry & PAGE_MASK; | ||
597 | else if (entry & IND_SOURCE) { | ||
598 | if (page == destination) | ||
599 | return ptr; | ||
600 | destination += PAGE_SIZE; | ||
601 | } | ||
602 | } | ||
603 | |||
604 | return NULL; | ||
605 | } | ||
606 | |||
607 | static struct page *kimage_alloc_page(struct kimage *image, | ||
608 | gfp_t gfp_mask, | ||
609 | unsigned long destination) | ||
610 | { | ||
611 | /* | ||
612 | * Here we implement safeguards to ensure that a source page | ||
613 | * is not copied to its destination page before the data on | ||
614 | * the destination page is no longer useful. | ||
615 | * | ||
616 | * To do this we maintain the invariant that a source page is | ||
617 | * either its own destination page, or it is not a | ||
618 | * destination page at all. | ||
619 | * | ||
620 | * That is slightly stronger than required, but the proof | ||
621 | * that no problems will occur is trivial, and the | ||
622 | * implementation is simple to verify. | ||
623 | * | ||
624 | * When allocating all pages normally this algorithm will run | ||
625 | * in O(N) time, but in the worst case it will run in O(N^2) | ||
626 | * time. If the runtime is a problem the data structures can | ||
627 | * be fixed. | ||
628 | */ | ||
629 | struct page *page; | ||
630 | unsigned long addr; | ||
631 | |||
632 | /* | ||
633 | * Walk through the list of destination pages, and see if I | ||
634 | * have a match. | ||
635 | */ | ||
636 | list_for_each_entry(page, &image->dest_pages, lru) { | ||
637 | addr = page_to_pfn(page) << PAGE_SHIFT; | ||
638 | if (addr == destination) { | ||
639 | list_del(&page->lru); | ||
640 | return page; | ||
641 | } | ||
642 | } | ||
643 | page = NULL; | ||
644 | while (1) { | ||
645 | kimage_entry_t *old; | ||
646 | |||
647 | /* Allocate a page, if we run out of memory give up */ | ||
648 | page = kimage_alloc_pages(gfp_mask, 0); | ||
649 | if (!page) | ||
650 | return NULL; | ||
651 | /* If the page cannot be used, file it away */ | ||
652 | if (page_to_pfn(page) > | ||
653 | (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { | ||
654 | list_add(&page->lru, &image->unusable_pages); | ||
655 | continue; | ||
656 | } | ||
657 | addr = page_to_pfn(page) << PAGE_SHIFT; | ||
658 | |||
659 | /* If it is the destination page we want, use it */ | ||
660 | if (addr == destination) | ||
661 | break; | ||
662 | |||
663 | /* If the page is not a destination page use it */ | ||
664 | if (!kimage_is_destination_range(image, addr, | ||
665 | addr + PAGE_SIZE)) | ||
666 | break; | ||
667 | |||
668 | /* | ||
669 | * I know that the page is someone's destination page. | ||
670 | * See if there is already a source page for this | ||
671 | * destination page. And if so swap the source pages. | ||
672 | */ | ||
673 | old = kimage_dst_used(image, addr); | ||
674 | if (old) { | ||
675 | /* If so move it */ | ||
676 | unsigned long old_addr; | ||
677 | struct page *old_page; | ||
678 | |||
679 | old_addr = *old & PAGE_MASK; | ||
680 | old_page = pfn_to_page(old_addr >> PAGE_SHIFT); | ||
681 | copy_highpage(page, old_page); | ||
682 | *old = addr | (*old & ~PAGE_MASK); | ||
683 | |||
684 | /* The old page I have found cannot be a | ||
685 | * destination page, so return it if its | ||
686 | * gfp_flags honor the ones passed in. | ||
687 | */ | ||
688 | if (!(gfp_mask & __GFP_HIGHMEM) && | ||
689 | PageHighMem(old_page)) { | ||
690 | kimage_free_pages(old_page); | ||
691 | continue; | ||
692 | } | ||
693 | addr = old_addr; | ||
694 | page = old_page; | ||
695 | break; | ||
696 | } | ||
697 | /* Place the page on the destination list, to be used later */ | ||
698 | list_add(&page->lru, &image->dest_pages); | ||
699 | } | ||
700 | |||
701 | return page; | ||
702 | } | ||
703 | |||
704 | static int kimage_load_normal_segment(struct kimage *image, | ||
705 | struct kexec_segment *segment) | ||
706 | { | ||
707 | unsigned long maddr; | ||
708 | size_t ubytes, mbytes; | ||
709 | int result; | ||
710 | unsigned char __user *buf = NULL; | ||
711 | unsigned char *kbuf = NULL; | ||
712 | |||
713 | result = 0; | ||
714 | if (image->file_mode) | ||
715 | kbuf = segment->kbuf; | ||
716 | else | ||
717 | buf = segment->buf; | ||
718 | ubytes = segment->bufsz; | ||
719 | mbytes = segment->memsz; | ||
720 | maddr = segment->mem; | ||
721 | |||
722 | result = kimage_set_destination(image, maddr); | ||
723 | if (result < 0) | ||
724 | goto out; | ||
725 | |||
726 | while (mbytes) { | ||
727 | struct page *page; | ||
728 | char *ptr; | ||
729 | size_t uchunk, mchunk; | ||
730 | |||
731 | page = kimage_alloc_page(image, GFP_HIGHUSER, maddr); | ||
732 | if (!page) { | ||
733 | result = -ENOMEM; | ||
734 | goto out; | ||
735 | } | ||
736 | result = kimage_add_page(image, page_to_pfn(page) | ||
737 | << PAGE_SHIFT); | ||
738 | if (result < 0) | ||
739 | goto out; | ||
740 | |||
741 | ptr = kmap(page); | ||
742 | /* Start with a clear page */ | ||
743 | clear_page(ptr); | ||
744 | ptr += maddr & ~PAGE_MASK; | ||
745 | mchunk = min_t(size_t, mbytes, | ||
746 | PAGE_SIZE - (maddr & ~PAGE_MASK)); | ||
747 | uchunk = min(ubytes, mchunk); | ||
748 | |||
749 | /* For file based kexec, source pages are in kernel memory */ | ||
750 | if (image->file_mode) | ||
751 | memcpy(ptr, kbuf, uchunk); | ||
752 | else | ||
753 | result = copy_from_user(ptr, buf, uchunk); | ||
754 | kunmap(page); | ||
755 | if (result) { | ||
756 | result = -EFAULT; | ||
757 | goto out; | ||
758 | } | ||
759 | ubytes -= uchunk; | ||
760 | maddr += mchunk; | ||
761 | if (image->file_mode) | ||
762 | kbuf += mchunk; | ||
763 | else | ||
764 | buf += mchunk; | ||
765 | mbytes -= mchunk; | ||
766 | } | ||
767 | out: | ||
768 | return result; | ||
769 | } | ||
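The copy loop in kimage_load_normal_segment() never crosses a page boundary in a single step: mchunk is capped at the bytes remaining up to the next page boundary of the destination, and uchunk is the part of that still backed by caller data (the rest of the freshly cleared page stays zero). The chunk arithmetic in isolation, for a deliberately unaligned destination:

#include <stdio.h>
#include <stddef.h>

#define PAGE_SZ     4096UL
#define PAGE_OFF(a) ((a) & (PAGE_SZ - 1))
#define MIN(a, b)   ((a) < (b) ? (a) : (b))

int main(void)
{
    unsigned long maddr = 0x100a00;     /* destination, deliberately unaligned */
    size_t mbytes = 10000;              /* memsz still to fill */
    size_t ubytes = 6000;               /* bufsz still backed by real data */

    while (mbytes) {
        size_t mchunk = MIN(mbytes, PAGE_SZ - PAGE_OFF(maddr));
        size_t uchunk = MIN(ubytes, mchunk);

        printf("dst 0x%lx: copy %zu bytes, zero-fill %zu\n",
               maddr, uchunk, mchunk - uchunk);
        ubytes -= uchunk;
        maddr  += mchunk;
        mbytes -= mchunk;
    }
    return 0;
}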
770 | |||
771 | static int kimage_load_crash_segment(struct kimage *image, | ||
772 | struct kexec_segment *segment) | ||
773 | { | ||
774 | /* For crash dump kernels we simply copy the data from | ||
775 | * user space to its destination. | ||
776 | * We do things a page at a time for the sake of kmap. | ||
777 | */ | ||
778 | unsigned long maddr; | ||
779 | size_t ubytes, mbytes; | ||
780 | int result; | ||
781 | unsigned char __user *buf = NULL; | ||
782 | unsigned char *kbuf = NULL; | ||
783 | |||
784 | result = 0; | ||
785 | if (image->file_mode) | ||
786 | kbuf = segment->kbuf; | ||
787 | else | ||
788 | buf = segment->buf; | ||
789 | ubytes = segment->bufsz; | ||
790 | mbytes = segment->memsz; | ||
791 | maddr = segment->mem; | ||
792 | while (mbytes) { | ||
793 | struct page *page; | ||
794 | char *ptr; | ||
795 | size_t uchunk, mchunk; | ||
796 | |||
797 | page = pfn_to_page(maddr >> PAGE_SHIFT); | ||
798 | if (!page) { | ||
799 | result = -ENOMEM; | ||
800 | goto out; | ||
801 | } | ||
802 | ptr = kmap(page); | ||
803 | ptr += maddr & ~PAGE_MASK; | ||
804 | mchunk = min_t(size_t, mbytes, | ||
805 | PAGE_SIZE - (maddr & ~PAGE_MASK)); | ||
806 | uchunk = min(ubytes, mchunk); | ||
807 | if (mchunk > uchunk) { | ||
808 | /* Zero the trailing part of the page */ | ||
809 | memset(ptr + uchunk, 0, mchunk - uchunk); | ||
810 | } | ||
811 | |||
812 | /* For file based kexec, source pages are in kernel memory */ | ||
813 | if (image->file_mode) | ||
814 | memcpy(ptr, kbuf, uchunk); | ||
815 | else | ||
816 | result = copy_from_user(ptr, buf, uchunk); | ||
817 | kexec_flush_icache_page(page); | ||
818 | kunmap(page); | ||
819 | if (result) { | ||
820 | result = -EFAULT; | ||
821 | goto out; | ||
822 | } | ||
823 | ubytes -= uchunk; | ||
824 | maddr += mchunk; | ||
825 | if (image->file_mode) | ||
826 | kbuf += mchunk; | ||
827 | else | ||
828 | buf += mchunk; | ||
829 | mbytes -= mchunk; | ||
830 | } | ||
831 | out: | ||
832 | return result; | ||
833 | } | ||
834 | |||
835 | int kimage_load_segment(struct kimage *image, | ||
836 | struct kexec_segment *segment) | ||
837 | { | ||
838 | int result = -ENOMEM; | ||
839 | |||
840 | switch (image->type) { | ||
841 | case KEXEC_TYPE_DEFAULT: | ||
842 | result = kimage_load_normal_segment(image, segment); | ||
843 | break; | ||
844 | case KEXEC_TYPE_CRASH: | ||
845 | result = kimage_load_crash_segment(image, segment); | ||
846 | break; | ||
847 | } | ||
848 | |||
849 | return result; | ||
850 | } | ||
851 | |||
852 | struct kimage *kexec_image; | ||
853 | struct kimage *kexec_crash_image; | ||
854 | int kexec_load_disabled; | ||
855 | |||
856 | void crash_kexec(struct pt_regs *regs) | ||
857 | { | ||
858 | /* Take the kexec_mutex here to prevent sys_kexec_load | ||
859 | * running on one cpu from replacing the crash kernel | ||
860 | * we are using after a panic on a different cpu. | ||
861 | * | ||
862 | * If the crash kernel was not located in a fixed area | ||
863 | * of memory the xchg(&kexec_crash_image) would be | ||
864 | * sufficient. But since I reuse the memory... | ||
865 | */ | ||
866 | if (mutex_trylock(&kexec_mutex)) { | ||
867 | if (kexec_crash_image) { | ||
868 | struct pt_regs fixed_regs; | ||
869 | |||
870 | crash_setup_regs(&fixed_regs, regs); | ||
871 | crash_save_vmcoreinfo(); | ||
872 | machine_crash_shutdown(&fixed_regs); | ||
873 | machine_kexec(kexec_crash_image); | ||
874 | } | ||
875 | mutex_unlock(&kexec_mutex); | ||
876 | } | ||
877 | } | ||
878 | |||
879 | size_t crash_get_memory_size(void) | ||
880 | { | ||
881 | size_t size = 0; | ||
882 | |||
883 | mutex_lock(&kexec_mutex); | ||
884 | if (crashk_res.end != crashk_res.start) | ||
885 | size = resource_size(&crashk_res); | ||
886 | mutex_unlock(&kexec_mutex); | ||
887 | return size; | ||
888 | } | ||
889 | |||
890 | void __weak crash_free_reserved_phys_range(unsigned long begin, | ||
891 | unsigned long end) | ||
892 | { | ||
893 | unsigned long addr; | ||
894 | |||
895 | for (addr = begin; addr < end; addr += PAGE_SIZE) | ||
896 | free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT)); | ||
897 | } | ||
898 | |||
899 | int crash_shrink_memory(unsigned long new_size) | ||
900 | { | ||
901 | int ret = 0; | ||
902 | unsigned long start, end; | ||
903 | unsigned long old_size; | ||
904 | struct resource *ram_res; | ||
905 | |||
906 | mutex_lock(&kexec_mutex); | ||
907 | |||
908 | if (kexec_crash_image) { | ||
909 | ret = -ENOENT; | ||
910 | goto unlock; | ||
911 | } | ||
912 | start = crashk_res.start; | ||
913 | end = crashk_res.end; | ||
914 | old_size = (end == 0) ? 0 : end - start + 1; | ||
915 | if (new_size >= old_size) { | ||
916 | ret = (new_size == old_size) ? 0 : -EINVAL; | ||
917 | goto unlock; | ||
918 | } | ||
919 | |||
920 | ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL); | ||
921 | if (!ram_res) { | ||
922 | ret = -ENOMEM; | ||
923 | goto unlock; | ||
924 | } | ||
925 | |||
926 | start = roundup(start, KEXEC_CRASH_MEM_ALIGN); | ||
927 | end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN); | ||
928 | |||
929 | crash_map_reserved_pages(); | ||
930 | crash_free_reserved_phys_range(end, crashk_res.end); | ||
931 | |||
932 | if ((start == end) && (crashk_res.parent != NULL)) | ||
933 | release_resource(&crashk_res); | ||
934 | |||
935 | ram_res->start = end; | ||
936 | ram_res->end = crashk_res.end; | ||
937 | ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM; | ||
938 | ram_res->name = "System RAM"; | ||
939 | |||
940 | crashk_res.end = end - 1; | ||
941 | |||
942 | insert_resource(&iomem_resource, ram_res); | ||
943 | crash_unmap_reserved_pages(); | ||
944 | |||
945 | unlock: | ||
946 | mutex_unlock(&kexec_mutex); | ||
947 | return ret; | ||
948 | } | ||
949 | |||
950 | static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data, | ||
951 | size_t data_len) | ||
952 | { | ||
953 | struct elf_note note; | ||
954 | |||
955 | note.n_namesz = strlen(name) + 1; | ||
956 | note.n_descsz = data_len; | ||
957 | note.n_type = type; | ||
958 | memcpy(buf, ¬e, sizeof(note)); | ||
959 | buf += (sizeof(note) + 3)/4; | ||
960 | memcpy(buf, name, note.n_namesz); | ||
961 | buf += (note.n_namesz + 3)/4; | ||
962 | memcpy(buf, data, note.n_descsz); | ||
963 | buf += (note.n_descsz + 3)/4; | ||
964 | |||
965 | return buf; | ||
966 | } | ||
967 | |||
968 | static void final_note(u32 *buf) | ||
969 | { | ||
970 | struct elf_note note; | ||
971 | |||
972 | note.n_namesz = 0; | ||
973 | note.n_descsz = 0; | ||
974 | note.n_type = 0; | ||
975 | memcpy(buf, ¬e, sizeof(note)); | ||
976 | } | ||
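append_elf_note() and final_note() lay notes out the way an ELF core file expects: a three-word header, then the name, then the descriptor, each rounded up to a 4-byte boundary, terminated by an all-zero header. The user-space sketch below writes one such note with Elf64_Nhdr from <elf.h> (the same three 32-bit fields as the kernel's struct elf_note) and parses the header back out; the "CORE"/NT_PRSTATUS payload is just a stand-in register dump.

#include <elf.h>
#include <stdio.h>
#include <string.h>

#define ROUND4(x) (((x) + 3U) & ~3U)

/* Append one ELF note at buf; returns the number of bytes written. */
static size_t put_note(unsigned char *buf, const char *name,
                       unsigned int type, const void *desc, size_t descsz)
{
    Elf64_Nhdr nh;
    size_t off = 0;

    nh.n_namesz = (Elf64_Word)(strlen(name) + 1);
    nh.n_descsz = (Elf64_Word)descsz;
    nh.n_type   = type;

    memcpy(buf + off, &nh, sizeof(nh));     off += sizeof(nh);
    memcpy(buf + off, name, nh.n_namesz);   off += ROUND4(nh.n_namesz);
    memcpy(buf + off, desc, descsz);        off += ROUND4(nh.n_descsz);
    return off;
}

int main(void)
{
    unsigned char notes[256] = { 0 };       /* trailing zeroes act as final_note() */
    unsigned char regs[24];                 /* pretend register dump */
    Elf64_Nhdr back;
    size_t used;

    memset(regs, 0x5a, sizeof(regs));
    used = put_note(notes, "CORE", NT_PRSTATUS, regs, sizeof(regs));

    memcpy(&back, notes, sizeof(back));     /* parse the header back out */
    printf("wrote %zu bytes: name \"%s\", type %u, desc %u bytes\n",
           used, (const char *)notes + sizeof(back),
           (unsigned)back.n_type, (unsigned)back.n_descsz);
    return 0;
}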
977 | |||
978 | void crash_save_cpu(struct pt_regs *regs, int cpu) | ||
979 | { | ||
980 | struct elf_prstatus prstatus; | ||
981 | u32 *buf; | ||
982 | |||
983 | if ((cpu < 0) || (cpu >= nr_cpu_ids)) | ||
984 | return; | ||
985 | |||
986 | /* Using ELF notes here is opportunistic. | ||
987 | * I need a well defined structure format | ||
988 | * for the data I pass, and I need tags | ||
989 | * on the data to indicate what information I have | ||
990 | * squirrelled away. ELF notes happen to provide | ||
991 | * all of that, so there is no need to invent something new. | ||
992 | */ | ||
993 | buf = (u32 *)per_cpu_ptr(crash_notes, cpu); | ||
994 | if (!buf) | ||
995 | return; | ||
996 | memset(&prstatus, 0, sizeof(prstatus)); | ||
997 | prstatus.pr_pid = current->pid; | ||
998 | elf_core_copy_kernel_regs(&prstatus.pr_reg, regs); | ||
999 | buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, | ||
1000 | &prstatus, sizeof(prstatus)); | ||
1001 | final_note(buf); | ||
1002 | } | ||
1003 | |||
1004 | static int __init crash_notes_memory_init(void) | ||
1005 | { | ||
1006 | /* Allocate memory for saving cpu registers. */ | ||
1007 | size_t size, align; | ||
1008 | |||
1009 | /* | ||
1010 | * crash_notes could be allocated across 2 vmalloc pages when percpu | ||
1011 | * is vmalloc based. vmalloc doesn't guarantee that 2 contiguous vmalloc | ||
1012 | * pages are also on 2 contiguous physical pages. In this case the | ||
1013 | * 2nd part of crash_notes in 2nd page could be lost since only the | ||
1014 | * starting address and size of crash_notes are exported through sysfs. | ||
1015 | * Here round up the size of crash_notes to the nearest power of two | ||
1016 | * and pass it to __alloc_percpu as align value. This can make sure | ||
1017 | * crash_notes is allocated inside one physical page. | ||
1018 | */ | ||
1019 | size = sizeof(note_buf_t); | ||
1020 | align = min(roundup_pow_of_two(sizeof(note_buf_t)), PAGE_SIZE); | ||
1021 | |||
1022 | /* | ||
1023 | * Break the build if size is bigger than PAGE_SIZE, since crash_notes | ||
1024 | * would then definitely span 2 pages. | ||
1025 | */ | ||
1026 | BUILD_BUG_ON(size > PAGE_SIZE); | ||
1027 | |||
1028 | crash_notes = __alloc_percpu(size, align); | ||
1029 | if (!crash_notes) { | ||
1030 | pr_warn("Kexec: Memory allocation for saving cpu register states failed\n"); | ||
1031 | return -ENOMEM; | ||
1032 | } | ||
1033 | return 0; | ||
1034 | } | ||
1035 | subsys_initcall(crash_notes_memory_init); | ||
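To illustrate why the power-of-two alignment above keeps the per-cpu buffer inside one page, here is a small standalone sketch; the 424-byte size is a hypothetical stand-in for sizeof(note_buf_t), not a value taken from any particular architecture:

    #include <stdio.h>

    /* Round v up to the next power of two (stand-in for the kernel's
     * roundup_pow_of_two()). */
    static unsigned long pow2_roundup(unsigned long v)
    {
            unsigned long p = 1;

            while (p < v)
                    p <<= 1;
            return p;
    }

    int main(void)
    {
            unsigned long size  = 424;   /* hypothetical sizeof(note_buf_t) */
            unsigned long page  = 4096;
            unsigned long align = pow2_roundup(size);

            if (align > page)
                    align = page;
            /* Since 4096 is a multiple of 512, a 424-byte object aligned to
             * 512 bytes can never straddle a page boundary. */
            printf("size=%lu align=%lu\n", size, align);
            return 0;
    }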
1036 | |||
1037 | |||
1038 | /* | ||
1039 | * Parsing of the "crashkernel" command line | ||
1040 | * | ||
1041 | * This code is intended to be called from architecture-specific code | ||
1042 | */ | ||
1043 | |||
1044 | |||
1045 | /* | ||
1046 | * This function parses command lines in the format | ||
1047 | * | ||
1048 | * crashkernel=ramsize-range:size[,...][@offset] | ||
1049 | * | ||
1050 | * The function returns 0 on success and -EINVAL on failure. | ||
1051 | */ | ||
1052 | static int __init parse_crashkernel_mem(char *cmdline, | ||
1053 | unsigned long long system_ram, | ||
1054 | unsigned long long *crash_size, | ||
1055 | unsigned long long *crash_base) | ||
1056 | { | ||
1057 | char *cur = cmdline, *tmp; | ||
1058 | |||
1059 | /* for each entry of the comma-separated list */ | ||
1060 | do { | ||
1061 | unsigned long long start, end = ULLONG_MAX, size; | ||
1062 | |||
1063 | /* get the start of the range */ | ||
1064 | start = memparse(cur, &tmp); | ||
1065 | if (cur == tmp) { | ||
1066 | pr_warn("crashkernel: Memory value expected\n"); | ||
1067 | return -EINVAL; | ||
1068 | } | ||
1069 | cur = tmp; | ||
1070 | if (*cur != '-') { | ||
1071 | pr_warn("crashkernel: '-' expected\n"); | ||
1072 | return -EINVAL; | ||
1073 | } | ||
1074 | cur++; | ||
1075 | |||
1076 | /* if no ':' is here, then we read the end */ | ||
1077 | if (*cur != ':') { | ||
1078 | end = memparse(cur, &tmp); | ||
1079 | if (cur == tmp) { | ||
1080 | pr_warn("crashkernel: Memory value expected\n"); | ||
1081 | return -EINVAL; | ||
1082 | } | ||
1083 | cur = tmp; | ||
1084 | if (end <= start) { | ||
1085 | pr_warn("crashkernel: end <= start\n"); | ||
1086 | return -EINVAL; | ||
1087 | } | ||
1088 | } | ||
1089 | |||
1090 | if (*cur != ':') { | ||
1091 | pr_warn("crashkernel: ':' expected\n"); | ||
1092 | return -EINVAL; | ||
1093 | } | ||
1094 | cur++; | ||
1095 | |||
1096 | size = memparse(cur, &tmp); | ||
1097 | if (cur == tmp) { | ||
1098 | pr_warn("Memory value expected\n"); | ||
1099 | return -EINVAL; | ||
1100 | } | ||
1101 | cur = tmp; | ||
1102 | if (size >= system_ram) { | ||
1103 | pr_warn("crashkernel: invalid size\n"); | ||
1104 | return -EINVAL; | ||
1105 | } | ||
1106 | |||
1107 | /* match ? */ | ||
1108 | if (system_ram >= start && system_ram < end) { | ||
1109 | *crash_size = size; | ||
1110 | break; | ||
1111 | } | ||
1112 | } while (*cur++ == ','); | ||
1113 | |||
1114 | if (*crash_size > 0) { | ||
1115 | while (*cur && *cur != ' ' && *cur != '@') | ||
1116 | cur++; | ||
1117 | if (*cur == '@') { | ||
1118 | cur++; | ||
1119 | *crash_base = memparse(cur, &tmp); | ||
1120 | if (cur == tmp) { | ||
1121 | pr_warn("Memory value expected after '@'\n"); | ||
1122 | return -EINVAL; | ||
1123 | } | ||
1124 | } | ||
1125 | } | ||
1126 | |||
1127 | return 0; | ||
1128 | } | ||
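As a worked example of the range syntax parsed above, consider the hypothetical parameter crashkernel=512M-2G:64M,2G-:128M. The sketch below (plain userspace C, not kernel code) hard-codes that parameter as a table and applies the same matching rule, system_ram >= start && system_ram < end, that parse_crashkernel_mem() uses:

    #include <stdio.h>

    struct ck_range { unsigned long long start, end, size; };

    /* Table form of the hypothetical "crashkernel=512M-2G:64M,2G-:128M";
     * an open-ended range is represented by end == ~0ULL. */
    static const struct ck_range tbl[] = {
            { 512ULL << 20, 2ULL << 30,  64ULL << 20 },
            {   2ULL << 30, ~0ULL,      128ULL << 20 },
    };

    static unsigned long long crash_size_for(unsigned long long system_ram)
    {
            size_t i;

            for (i = 0; i < sizeof(tbl) / sizeof(tbl[0]); i++)
                    if (system_ram >= tbl[i].start && system_ram < tbl[i].end)
                            return tbl[i].size;
            return 0;       /* no range matched: nothing is reserved */
    }

    int main(void)
    {
            printf("1G RAM -> %llu MiB\n", crash_size_for(1ULL << 30) >> 20); /* 64  */
            printf("8G RAM -> %llu MiB\n", crash_size_for(8ULL << 30) >> 20); /* 128 */
            return 0;
    }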
1129 | |||
1130 | /* | ||
1131 | * This function parses "simple" (old) crashkernel command lines like | ||
1132 | * | ||
1133 | * crashkernel=size[@offset] | ||
1134 | * | ||
1135 | * It returns 0 on success and -EINVAL on failure. | ||
1136 | */ | ||
1137 | static int __init parse_crashkernel_simple(char *cmdline, | ||
1138 | unsigned long long *crash_size, | ||
1139 | unsigned long long *crash_base) | ||
1140 | { | ||
1141 | char *cur = cmdline; | ||
1142 | |||
1143 | *crash_size = memparse(cmdline, &cur); | ||
1144 | if (cmdline == cur) { | ||
1145 | pr_warn("crashkernel: memory value expected\n"); | ||
1146 | return -EINVAL; | ||
1147 | } | ||
1148 | |||
1149 | if (*cur == '@') | ||
1150 | *crash_base = memparse(cur+1, &cur); | ||
1151 | else if (*cur != ' ' && *cur != '\0') { | ||
1152 | pr_warn("crashkernel: unrecognized char\n"); | ||
1153 | return -EINVAL; | ||
1154 | } | ||
1155 | |||
1156 | return 0; | ||
1157 | } | ||
1158 | |||
1159 | #define SUFFIX_HIGH 0 | ||
1160 | #define SUFFIX_LOW 1 | ||
1161 | #define SUFFIX_NULL 2 | ||
1162 | static __initdata char *suffix_tbl[] = { | ||
1163 | [SUFFIX_HIGH] = ",high", | ||
1164 | [SUFFIX_LOW] = ",low", | ||
1165 | [SUFFIX_NULL] = NULL, | ||
1166 | }; | ||
1167 | |||
1168 | /* | ||
1169 | * This function parses "suffix" crashkernel command lines like | ||
1170 | * | ||
1171 | * crashkernel=size,[high|low] | ||
1172 | * | ||
1173 | * It returns 0 on success and -EINVAL on failure. | ||
1174 | */ | ||
1175 | static int __init parse_crashkernel_suffix(char *cmdline, | ||
1176 | unsigned long long *crash_size, | ||
1177 | const char *suffix) | ||
1178 | { | ||
1179 | char *cur = cmdline; | ||
1180 | |||
1181 | *crash_size = memparse(cmdline, &cur); | ||
1182 | if (cmdline == cur) { | ||
1183 | pr_warn("crashkernel: memory value expected\n"); | ||
1184 | return -EINVAL; | ||
1185 | } | ||
1186 | |||
1187 | /* check with suffix */ | ||
1188 | if (strncmp(cur, suffix, strlen(suffix))) { | ||
1189 | pr_warn("crashkernel: unrecognized char\n"); | ||
1190 | return -EINVAL; | ||
1191 | } | ||
1192 | cur += strlen(suffix); | ||
1193 | if (*cur != ' ' && *cur != '\0') { | ||
1194 | pr_warn("crashkernel: unrecognized char\n"); | ||
1195 | return -EINVAL; | ||
1196 | } | ||
1197 | |||
1198 | return 0; | ||
1199 | } | ||
1200 | |||
1201 | static __init char *get_last_crashkernel(char *cmdline, | ||
1202 | const char *name, | ||
1203 | const char *suffix) | ||
1204 | { | ||
1205 | char *p = cmdline, *ck_cmdline = NULL; | ||
1206 | |||
1207 | /* find crashkernel and use the last one if there is more than one */ | ||
1208 | p = strstr(p, name); | ||
1209 | while (p) { | ||
1210 | char *end_p = strchr(p, ' '); | ||
1211 | char *q; | ||
1212 | |||
1213 | if (!end_p) | ||
1214 | end_p = p + strlen(p); | ||
1215 | |||
1216 | if (!suffix) { | ||
1217 | int i; | ||
1218 | |||
1219 | /* skip the one with any known suffix */ | ||
1220 | for (i = 0; suffix_tbl[i]; i++) { | ||
1221 | q = end_p - strlen(suffix_tbl[i]); | ||
1222 | if (!strncmp(q, suffix_tbl[i], | ||
1223 | strlen(suffix_tbl[i]))) | ||
1224 | goto next; | ||
1225 | } | ||
1226 | ck_cmdline = p; | ||
1227 | } else { | ||
1228 | q = end_p - strlen(suffix); | ||
1229 | if (!strncmp(q, suffix, strlen(suffix))) | ||
1230 | ck_cmdline = p; | ||
1231 | } | ||
1232 | next: | ||
1233 | p = strstr(p+1, name); | ||
1234 | } | ||
1235 | |||
1236 | if (!ck_cmdline) | ||
1237 | return NULL; | ||
1238 | |||
1239 | return ck_cmdline; | ||
1240 | } | ||
1241 | |||
1242 | static int __init __parse_crashkernel(char *cmdline, | ||
1243 | unsigned long long system_ram, | ||
1244 | unsigned long long *crash_size, | ||
1245 | unsigned long long *crash_base, | ||
1246 | const char *name, | ||
1247 | const char *suffix) | ||
1248 | { | ||
1249 | char *first_colon, *first_space; | ||
1250 | char *ck_cmdline; | ||
1251 | |||
1252 | BUG_ON(!crash_size || !crash_base); | ||
1253 | *crash_size = 0; | ||
1254 | *crash_base = 0; | ||
1255 | |||
1256 | ck_cmdline = get_last_crashkernel(cmdline, name, suffix); | ||
1257 | |||
1258 | if (!ck_cmdline) | ||
1259 | return -EINVAL; | ||
1260 | |||
1261 | ck_cmdline += strlen(name); | ||
1262 | |||
1263 | if (suffix) | ||
1264 | return parse_crashkernel_suffix(ck_cmdline, crash_size, | ||
1265 | suffix); | ||
1266 | /* | ||
1267 | * if the commandline contains a ':', then that's the extended | ||
1268 | * syntax -- if not, it must be the classic syntax | ||
1269 | */ | ||
1270 | first_colon = strchr(ck_cmdline, ':'); | ||
1271 | first_space = strchr(ck_cmdline, ' '); | ||
1272 | if (first_colon && (!first_space || first_colon < first_space)) | ||
1273 | return parse_crashkernel_mem(ck_cmdline, system_ram, | ||
1274 | crash_size, crash_base); | ||
1275 | |||
1276 | return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base); | ||
1277 | } | ||
1278 | |||
1279 | /* | ||
1280 | * This function is the entry point for command line parsing and should be | ||
1281 | * called from the arch-specific code. | ||
1282 | */ | ||
1283 | int __init parse_crashkernel(char *cmdline, | ||
1284 | unsigned long long system_ram, | ||
1285 | unsigned long long *crash_size, | ||
1286 | unsigned long long *crash_base) | ||
1287 | { | ||
1288 | return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, | ||
1289 | "crashkernel=", NULL); | ||
1290 | } | ||
1291 | |||
1292 | int __init parse_crashkernel_high(char *cmdline, | ||
1293 | unsigned long long system_ram, | ||
1294 | unsigned long long *crash_size, | ||
1295 | unsigned long long *crash_base) | ||
1296 | { | ||
1297 | return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, | ||
1298 | "crashkernel=", suffix_tbl[SUFFIX_HIGH]); | ||
1299 | } | ||
1300 | |||
1301 | int __init parse_crashkernel_low(char *cmdline, | ||
1302 | unsigned long long system_ram, | ||
1303 | unsigned long long *crash_size, | ||
1304 | unsigned long long *crash_base) | ||
1305 | { | ||
1306 | return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, | ||
1307 | "crashkernel=", suffix_tbl[SUFFIX_LOW]); | ||
1308 | } | ||
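For context, the sketch below shows roughly how architecture setup code consumes these entry points; it is modelled loosely on the x86 reserve_crashkernel() flow and is not the literal kernel code. The search window (SZ_4G), the 1 MiB alignment, and the absence of any ,high/,low handling are simplifications, and the usual header includes are elided.

    /* Simplified arch-side caller (illustrative only). */
    static void __init reserve_crashkernel_sketch(void)
    {
            unsigned long long crash_size, crash_base;
            int ret;

            ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
                                    &crash_size, &crash_base);
            if (ret || !crash_size)
                    return;                 /* nothing requested on the command line */

            if (!crash_base) {
                    /* No @offset given: let memblock pick a suitable range. */
                    crash_base = memblock_find_in_range(0, SZ_4G,
                                                        crash_size, SZ_1M);
                    if (!crash_base)
                            return;
            }

            memblock_reserve(crash_base, crash_size);
            crashk_res.start = crash_base;
            crashk_res.end   = crash_base + crash_size - 1;
    }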
1309 | |||
1310 | static void update_vmcoreinfo_note(void) | ||
1311 | { | ||
1312 | u32 *buf = vmcoreinfo_note; | ||
1313 | |||
1314 | if (!vmcoreinfo_size) | ||
1315 | return; | ||
1316 | buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data, | ||
1317 | vmcoreinfo_size); | ||
1318 | final_note(buf); | ||
1319 | } | ||
1320 | |||
1321 | void crash_save_vmcoreinfo(void) | ||
1322 | { | ||
1323 | vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds()); | ||
1324 | update_vmcoreinfo_note(); | ||
1325 | } | ||
1326 | |||
1327 | void vmcoreinfo_append_str(const char *fmt, ...) | ||
1328 | { | ||
1329 | va_list args; | ||
1330 | char buf[0x50]; | ||
1331 | size_t r; | ||
1332 | |||
1333 | va_start(args, fmt); | ||
1334 | r = vscnprintf(buf, sizeof(buf), fmt, args); | ||
1335 | va_end(args); | ||
1336 | |||
1337 | r = min(r, vmcoreinfo_max_size - vmcoreinfo_size); | ||
1338 | |||
1339 | memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); | ||
1340 | |||
1341 | vmcoreinfo_size += r; | ||
1342 | } | ||
1343 | |||
1344 | /* | ||
1345 | * provide an empty default implementation here -- architecture | ||
1346 | * code may override this | ||
1347 | */ | ||
1348 | void __weak arch_crash_save_vmcoreinfo(void) | ||
1349 | {} | ||
1350 | |||
1351 | unsigned long __weak paddr_vmcoreinfo_note(void) | ||
1352 | { | ||
1353 | return __pa((unsigned long)(char *)&vmcoreinfo_note); | ||
1354 | } | ||
1355 | |||
1356 | static int __init crash_save_vmcoreinfo_init(void) | ||
1357 | { | ||
1358 | VMCOREINFO_OSRELEASE(init_uts_ns.name.release); | ||
1359 | VMCOREINFO_PAGESIZE(PAGE_SIZE); | ||
1360 | |||
1361 | VMCOREINFO_SYMBOL(init_uts_ns); | ||
1362 | VMCOREINFO_SYMBOL(node_online_map); | ||
1363 | #ifdef CONFIG_MMU | ||
1364 | VMCOREINFO_SYMBOL(swapper_pg_dir); | ||
1365 | #endif | ||
1366 | VMCOREINFO_SYMBOL(_stext); | ||
1367 | VMCOREINFO_SYMBOL(vmap_area_list); | ||
1368 | |||
1369 | #ifndef CONFIG_NEED_MULTIPLE_NODES | ||
1370 | VMCOREINFO_SYMBOL(mem_map); | ||
1371 | VMCOREINFO_SYMBOL(contig_page_data); | ||
1372 | #endif | ||
1373 | #ifdef CONFIG_SPARSEMEM | ||
1374 | VMCOREINFO_SYMBOL(mem_section); | ||
1375 | VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); | ||
1376 | VMCOREINFO_STRUCT_SIZE(mem_section); | ||
1377 | VMCOREINFO_OFFSET(mem_section, section_mem_map); | ||
1378 | #endif | ||
1379 | VMCOREINFO_STRUCT_SIZE(page); | ||
1380 | VMCOREINFO_STRUCT_SIZE(pglist_data); | ||
1381 | VMCOREINFO_STRUCT_SIZE(zone); | ||
1382 | VMCOREINFO_STRUCT_SIZE(free_area); | ||
1383 | VMCOREINFO_STRUCT_SIZE(list_head); | ||
1384 | VMCOREINFO_SIZE(nodemask_t); | ||
1385 | VMCOREINFO_OFFSET(page, flags); | ||
1386 | VMCOREINFO_OFFSET(page, _count); | ||
1387 | VMCOREINFO_OFFSET(page, mapping); | ||
1388 | VMCOREINFO_OFFSET(page, lru); | ||
1389 | VMCOREINFO_OFFSET(page, _mapcount); | ||
1390 | VMCOREINFO_OFFSET(page, private); | ||
1391 | VMCOREINFO_OFFSET(pglist_data, node_zones); | ||
1392 | VMCOREINFO_OFFSET(pglist_data, nr_zones); | ||
1393 | #ifdef CONFIG_FLAT_NODE_MEM_MAP | ||
1394 | VMCOREINFO_OFFSET(pglist_data, node_mem_map); | ||
1395 | #endif | ||
1396 | VMCOREINFO_OFFSET(pglist_data, node_start_pfn); | ||
1397 | VMCOREINFO_OFFSET(pglist_data, node_spanned_pages); | ||
1398 | VMCOREINFO_OFFSET(pglist_data, node_id); | ||
1399 | VMCOREINFO_OFFSET(zone, free_area); | ||
1400 | VMCOREINFO_OFFSET(zone, vm_stat); | ||
1401 | VMCOREINFO_OFFSET(zone, spanned_pages); | ||
1402 | VMCOREINFO_OFFSET(free_area, free_list); | ||
1403 | VMCOREINFO_OFFSET(list_head, next); | ||
1404 | VMCOREINFO_OFFSET(list_head, prev); | ||
1405 | VMCOREINFO_OFFSET(vmap_area, va_start); | ||
1406 | VMCOREINFO_OFFSET(vmap_area, list); | ||
1407 | VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); | ||
1408 | log_buf_kexec_setup(); | ||
1409 | VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); | ||
1410 | VMCOREINFO_NUMBER(NR_FREE_PAGES); | ||
1411 | VMCOREINFO_NUMBER(PG_lru); | ||
1412 | VMCOREINFO_NUMBER(PG_private); | ||
1413 | VMCOREINFO_NUMBER(PG_swapcache); | ||
1414 | VMCOREINFO_NUMBER(PG_slab); | ||
1415 | #ifdef CONFIG_MEMORY_FAILURE | ||
1416 | VMCOREINFO_NUMBER(PG_hwpoison); | ||
1417 | #endif | ||
1418 | VMCOREINFO_NUMBER(PG_head_mask); | ||
1419 | VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); | ||
1420 | #ifdef CONFIG_X86 | ||
1421 | VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE); | ||
1422 | #endif | ||
1423 | #ifdef CONFIG_HUGETLBFS | ||
1424 | VMCOREINFO_SYMBOL(free_huge_page); | ||
1425 | #endif | ||
1426 | |||
1427 | arch_crash_save_vmcoreinfo(); | ||
1428 | update_vmcoreinfo_note(); | ||
1429 | |||
1430 | return 0; | ||
1431 | } | ||
1432 | |||
1433 | subsys_initcall(crash_save_vmcoreinfo_init); | ||
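An architecture that wants extra data in the vmcoreinfo note overrides the weak arch_crash_save_vmcoreinfo() hook above. The sketch below is hypothetical: the macros are the real ones used in crash_save_vmcoreinfo_init(), but arch_pgd_table and ARCH_VA_BITS are placeholder names rather than symbols from any real port.

    /* Hypothetical architecture override (placeholder symbols). */
    void arch_crash_save_vmcoreinfo(void)
    {
            VMCOREINFO_SYMBOL(arch_pgd_table);           /* placeholder symbol   */
            VMCOREINFO_NUMBER(ARCH_VA_BITS);             /* placeholder constant */
            vmcoreinfo_append_str("CONFIG_ARCH_FOO=y\n"); /* free-form key=value */
    }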
1434 | |||
1435 | /* | ||
1436 | * Move into place and start executing a preloaded standalone | ||
1437 | * executable. If nothing was preloaded return an error. | ||
1438 | */ | ||
1439 | int kernel_kexec(void) | ||
1440 | { | ||
1441 | int error = 0; | ||
1442 | |||
1443 | if (!mutex_trylock(&kexec_mutex)) | ||
1444 | return -EBUSY; | ||
1445 | if (!kexec_image) { | ||
1446 | error = -EINVAL; | ||
1447 | goto Unlock; | ||
1448 | } | ||
1449 | |||
1450 | #ifdef CONFIG_KEXEC_JUMP | ||
1451 | if (kexec_image->preserve_context) { | ||
1452 | lock_system_sleep(); | ||
1453 | pm_prepare_console(); | ||
1454 | error = freeze_processes(); | ||
1455 | if (error) { | ||
1456 | error = -EBUSY; | ||
1457 | goto Restore_console; | ||
1458 | } | ||
1459 | suspend_console(); | ||
1460 | error = dpm_suspend_start(PMSG_FREEZE); | ||
1461 | if (error) | ||
1462 | goto Resume_console; | ||
1463 | /* At this point, dpm_suspend_start() has been called, | ||
1464 | * but *not* dpm_suspend_end(). We *must* call | ||
1465 | * dpm_suspend_end() now. Otherwise, drivers for | ||
1466 | * some devices (e.g. interrupt controllers) become | ||
1467 | * desynchronized with the actual state of the | ||
1468 | * hardware at resume time, and evil weirdness ensues. | ||
1469 | */ | ||
1470 | error = dpm_suspend_end(PMSG_FREEZE); | ||
1471 | if (error) | ||
1472 | goto Resume_devices; | ||
1473 | error = disable_nonboot_cpus(); | ||
1474 | if (error) | ||
1475 | goto Enable_cpus; | ||
1476 | local_irq_disable(); | ||
1477 | error = syscore_suspend(); | ||
1478 | if (error) | ||
1479 | goto Enable_irqs; | ||
1480 | } else | ||
1481 | #endif | ||
1482 | { | ||
1483 | kexec_in_progress = true; | ||
1484 | kernel_restart_prepare(NULL); | ||
1485 | migrate_to_reboot_cpu(); | ||
1486 | |||
1487 | /* | ||
1488 | * migrate_to_reboot_cpu() disables CPU hotplug assuming that | ||
1489 | * no further code needs to use CPU hotplug (which is true in | ||
1490 | * the reboot case). However, the kexec path depends on using | ||
1491 | * CPU hotplug again; so re-enable it here. | ||
1492 | */ | ||
1493 | cpu_hotplug_enable(); | ||
1494 | pr_emerg("Starting new kernel\n"); | ||
1495 | machine_shutdown(); | ||
1496 | } | ||
1497 | |||
1498 | machine_kexec(kexec_image); | ||
1499 | |||
1500 | #ifdef CONFIG_KEXEC_JUMP | ||
1501 | if (kexec_image->preserve_context) { | ||
1502 | syscore_resume(); | ||
1503 | Enable_irqs: | ||
1504 | local_irq_enable(); | ||
1505 | Enable_cpus: | ||
1506 | enable_nonboot_cpus(); | ||
1507 | dpm_resume_start(PMSG_RESTORE); | ||
1508 | Resume_devices: | ||
1509 | dpm_resume_end(PMSG_RESTORE); | ||
1510 | Resume_console: | ||
1511 | resume_console(); | ||
1512 | thaw_processes(); | ||
1513 | Restore_console: | ||
1514 | pm_restore_console(); | ||
1515 | unlock_system_sleep(); | ||
1516 | } | ||
1517 | #endif | ||
1518 | |||
1519 | Unlock: | ||
1520 | mutex_unlock(&kexec_mutex); | ||
1521 | return error; | ||
1522 | } | ||
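On the non-crash path, kernel_kexec() is reached from the reboot(2) system call once an image has been loaded. A minimal userspace trigger might look like the sketch below; error handling is elided and it assumes an image was loaded beforehand (e.g. via kexec_load or kexec_file_load):

    #include <unistd.h>
    #include <sys/reboot.h>
    #include <linux/reboot.h>

    int main(void)
    {
            sync();                                   /* flush filesystems first    */
            return reboot(LINUX_REBOOT_CMD_KEXEC);    /* does not return on success */
    }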
1523 | |||
1524 | /* | ||
1525 | * Add and remove page tables for crashkernel memory | ||
1526 | * | ||
1527 | * Provide an empty default implementation here -- architecture | ||
1528 | * code may override this | ||
1529 | */ | ||
1530 | void __weak crash_map_reserved_pages(void) | ||
1531 | {} | ||
1532 | |||
1533 | void __weak crash_unmap_reserved_pages(void) | ||
1534 | {} | ||
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c new file mode 100644 index 000000000000..6a9a3f2a0e8e --- /dev/null +++ b/kernel/kexec_file.c | |||
@@ -0,0 +1,1045 @@ | |||
1 | /* | ||
2 | * kexec: kexec_file_load system call | ||
3 | * | ||
4 | * Copyright (C) 2014 Red Hat Inc. | ||
5 | * Authors: | ||
6 | * Vivek Goyal <vgoyal@redhat.com> | ||
7 | * | ||
8 | * This source code is licensed under the GNU General Public License, | ||
9 | * Version 2. See the file COPYING for more details. | ||
10 | */ | ||
11 | |||
12 | #include <linux/capability.h> | ||
13 | #include <linux/mm.h> | ||
14 | #include <linux/file.h> | ||
15 | #include <linux/slab.h> | ||
16 | #include <linux/kexec.h> | ||
17 | #include <linux/mutex.h> | ||
18 | #include <linux/list.h> | ||
19 | #include <crypto/hash.h> | ||
20 | #include <crypto/sha.h> | ||
21 | #include <linux/syscalls.h> | ||
22 | #include <linux/vmalloc.h> | ||
23 | #include "kexec_internal.h" | ||
24 | |||
25 | /* | ||
26 | * Declare these symbols weak so that if the architecture provides a purgatory, | ||
27 | * they will be overridden. | ||
28 | */ | ||
29 | char __weak kexec_purgatory[0]; | ||
30 | size_t __weak kexec_purgatory_size = 0; | ||
31 | |||
32 | static int kexec_calculate_store_digests(struct kimage *image); | ||
33 | |||
34 | static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len) | ||
35 | { | ||
36 | struct fd f = fdget(fd); | ||
37 | int ret; | ||
38 | struct kstat stat; | ||
39 | loff_t pos; | ||
40 | ssize_t bytes = 0; | ||
41 | |||
42 | if (!f.file) | ||
43 | return -EBADF; | ||
44 | |||
45 | ret = vfs_getattr(&f.file->f_path, &stat); | ||
46 | if (ret) | ||
47 | goto out; | ||
48 | |||
49 | if (stat.size > INT_MAX) { | ||
50 | ret = -EFBIG; | ||
51 | goto out; | ||
52 | } | ||
53 | |||
54 | /* Don't hand 0 to vmalloc, it whines. */ | ||
55 | if (stat.size == 0) { | ||
56 | ret = -EINVAL; | ||
57 | goto out; | ||
58 | } | ||
59 | |||
60 | *buf = vmalloc(stat.size); | ||
61 | if (!*buf) { | ||
62 | ret = -ENOMEM; | ||
63 | goto out; | ||
64 | } | ||
65 | |||
66 | pos = 0; | ||
67 | while (pos < stat.size) { | ||
68 | bytes = kernel_read(f.file, pos, (char *)(*buf) + pos, | ||
69 | stat.size - pos); | ||
70 | if (bytes < 0) { | ||
71 | vfree(*buf); | ||
72 | ret = bytes; | ||
73 | goto out; | ||
74 | } | ||
75 | |||
76 | if (bytes == 0) | ||
77 | break; | ||
78 | pos += bytes; | ||
79 | } | ||
80 | |||
81 | if (pos != stat.size) { | ||
82 | ret = -EBADF; | ||
83 | vfree(*buf); | ||
84 | goto out; | ||
85 | } | ||
86 | |||
87 | *buf_len = pos; | ||
88 | out: | ||
89 | fdput(f); | ||
90 | return ret; | ||
91 | } | ||
92 | |||
93 | /* Architectures can provide this probe function */ | ||
94 | int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf, | ||
95 | unsigned long buf_len) | ||
96 | { | ||
97 | return -ENOEXEC; | ||
98 | } | ||
99 | |||
100 | void * __weak arch_kexec_kernel_image_load(struct kimage *image) | ||
101 | { | ||
102 | return ERR_PTR(-ENOEXEC); | ||
103 | } | ||
104 | |||
105 | int __weak arch_kimage_file_post_load_cleanup(struct kimage *image) | ||
106 | { | ||
107 | return -EINVAL; | ||
108 | } | ||
109 | |||
110 | int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, | ||
111 | unsigned long buf_len) | ||
112 | { | ||
113 | return -EKEYREJECTED; | ||
114 | } | ||
115 | |||
116 | /* Apply relocations of type RELA */ | ||
117 | int __weak | ||
118 | arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, | ||
119 | unsigned int relsec) | ||
120 | { | ||
121 | pr_err("RELA relocation unsupported.\n"); | ||
122 | return -ENOEXEC; | ||
123 | } | ||
124 | |||
125 | /* Apply relocations of type REL */ | ||
126 | int __weak | ||
127 | arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, | ||
128 | unsigned int relsec) | ||
129 | { | ||
130 | pr_err("REL relocation unsupported.\n"); | ||
131 | return -ENOEXEC; | ||
132 | } | ||
133 | |||
134 | /* | ||
135 | * Free up memory used by the kernel, initrd, and command line. These are | ||
136 | * temporary allocations which are no longer needed once the buffers have | ||
137 | * been loaded into separate segments and copied elsewhere. | ||
138 | */ | ||
139 | void kimage_file_post_load_cleanup(struct kimage *image) | ||
140 | { | ||
141 | struct purgatory_info *pi = &image->purgatory_info; | ||
142 | |||
143 | vfree(image->kernel_buf); | ||
144 | image->kernel_buf = NULL; | ||
145 | |||
146 | vfree(image->initrd_buf); | ||
147 | image->initrd_buf = NULL; | ||
148 | |||
149 | kfree(image->cmdline_buf); | ||
150 | image->cmdline_buf = NULL; | ||
151 | |||
152 | vfree(pi->purgatory_buf); | ||
153 | pi->purgatory_buf = NULL; | ||
154 | |||
155 | vfree(pi->sechdrs); | ||
156 | pi->sechdrs = NULL; | ||
157 | |||
158 | /* See if architecture has anything to cleanup post load */ | ||
159 | arch_kimage_file_post_load_cleanup(image); | ||
160 | |||
161 | /* | ||
162 | * The above call should have given the image loader a chance to free | ||
163 | * any data stored in kimage->image_loader_data, so it should now be | ||
164 | * safe to free it here. | ||
165 | */ | ||
166 | kfree(image->image_loader_data); | ||
167 | image->image_loader_data = NULL; | ||
168 | } | ||
169 | |||
170 | /* | ||
171 | * In file mode the list of segments is prepared by the kernel. Copy the | ||
172 | * relevant data from user space, do error checking, and prepare the segment list. | ||
173 | */ | ||
174 | static int | ||
175 | kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, | ||
176 | const char __user *cmdline_ptr, | ||
177 | unsigned long cmdline_len, unsigned flags) | ||
178 | { | ||
179 | int ret = 0; | ||
180 | void *ldata; | ||
181 | |||
182 | ret = copy_file_from_fd(kernel_fd, &image->kernel_buf, | ||
183 | &image->kernel_buf_len); | ||
184 | if (ret) | ||
185 | return ret; | ||
186 | |||
187 | /* Call arch image probe handlers */ | ||
188 | ret = arch_kexec_kernel_image_probe(image, image->kernel_buf, | ||
189 | image->kernel_buf_len); | ||
190 | |||
191 | if (ret) | ||
192 | goto out; | ||
193 | |||
194 | #ifdef CONFIG_KEXEC_VERIFY_SIG | ||
195 | ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf, | ||
196 | image->kernel_buf_len); | ||
197 | if (ret) { | ||
198 | pr_debug("kernel signature verification failed.\n"); | ||
199 | goto out; | ||
200 | } | ||
201 | pr_debug("kernel signature verification successful.\n"); | ||
202 | #endif | ||
203 | /* It is possible that no initramfs is being loaded */ | ||
204 | if (!(flags & KEXEC_FILE_NO_INITRAMFS)) { | ||
205 | ret = copy_file_from_fd(initrd_fd, &image->initrd_buf, | ||
206 | &image->initrd_buf_len); | ||
207 | if (ret) | ||
208 | goto out; | ||
209 | } | ||
210 | |||
211 | if (cmdline_len) { | ||
212 | image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL); | ||
213 | if (!image->cmdline_buf) { | ||
214 | ret = -ENOMEM; | ||
215 | goto out; | ||
216 | } | ||
217 | |||
218 | ret = copy_from_user(image->cmdline_buf, cmdline_ptr, | ||
219 | cmdline_len); | ||
220 | if (ret) { | ||
221 | ret = -EFAULT; | ||
222 | goto out; | ||
223 | } | ||
224 | |||
225 | image->cmdline_buf_len = cmdline_len; | ||
226 | |||
227 | /* the command line must be a NUL-terminated string */ | ||
228 | if (image->cmdline_buf[cmdline_len - 1] != '\0') { | ||
229 | ret = -EINVAL; | ||
230 | goto out; | ||
231 | } | ||
232 | } | ||
233 | |||
234 | /* Call arch image load handlers */ | ||
235 | ldata = arch_kexec_kernel_image_load(image); | ||
236 | |||
237 | if (IS_ERR(ldata)) { | ||
238 | ret = PTR_ERR(ldata); | ||
239 | goto out; | ||
240 | } | ||
241 | |||
242 | image->image_loader_data = ldata; | ||
243 | out: | ||
244 | /* In case of error, free up all allocated memory in this function */ | ||
245 | if (ret) | ||
246 | kimage_file_post_load_cleanup(image); | ||
247 | return ret; | ||
248 | } | ||
249 | |||
250 | static int | ||
251 | kimage_file_alloc_init(struct kimage **rimage, int kernel_fd, | ||
252 | int initrd_fd, const char __user *cmdline_ptr, | ||
253 | unsigned long cmdline_len, unsigned long flags) | ||
254 | { | ||
255 | int ret; | ||
256 | struct kimage *image; | ||
257 | bool kexec_on_panic = flags & KEXEC_FILE_ON_CRASH; | ||
258 | |||
259 | image = do_kimage_alloc_init(); | ||
260 | if (!image) | ||
261 | return -ENOMEM; | ||
262 | |||
263 | image->file_mode = 1; | ||
264 | |||
265 | if (kexec_on_panic) { | ||
266 | /* Enable special crash kernel control page alloc policy. */ | ||
267 | image->control_page = crashk_res.start; | ||
268 | image->type = KEXEC_TYPE_CRASH; | ||
269 | } | ||
270 | |||
271 | ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd, | ||
272 | cmdline_ptr, cmdline_len, flags); | ||
273 | if (ret) | ||
274 | goto out_free_image; | ||
275 | |||
276 | ret = sanity_check_segment_list(image); | ||
277 | if (ret) | ||
278 | goto out_free_post_load_bufs; | ||
279 | |||
280 | ret = -ENOMEM; | ||
281 | image->control_code_page = kimage_alloc_control_pages(image, | ||
282 | get_order(KEXEC_CONTROL_PAGE_SIZE)); | ||
283 | if (!image->control_code_page) { | ||
284 | pr_err("Could not allocate control_code_buffer\n"); | ||
285 | goto out_free_post_load_bufs; | ||
286 | } | ||
287 | |||
288 | if (!kexec_on_panic) { | ||
289 | image->swap_page = kimage_alloc_control_pages(image, 0); | ||
290 | if (!image->swap_page) { | ||
291 | pr_err("Could not allocate swap buffer\n"); | ||
292 | goto out_free_control_pages; | ||
293 | } | ||
294 | } | ||
295 | |||
296 | *rimage = image; | ||
297 | return 0; | ||
298 | out_free_control_pages: | ||
299 | kimage_free_page_list(&image->control_pages); | ||
300 | out_free_post_load_bufs: | ||
301 | kimage_file_post_load_cleanup(image); | ||
302 | out_free_image: | ||
303 | kfree(image); | ||
304 | return ret; | ||
305 | } | ||
306 | |||
307 | SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, | ||
308 | unsigned long, cmdline_len, const char __user *, cmdline_ptr, | ||
309 | unsigned long, flags) | ||
310 | { | ||
311 | int ret = 0, i; | ||
312 | struct kimage **dest_image, *image; | ||
313 | |||
314 | /* We only trust the superuser with rebooting the system. */ | ||
315 | if (!capable(CAP_SYS_BOOT) || kexec_load_disabled) | ||
316 | return -EPERM; | ||
317 | |||
318 | /* Make sure we have a legal set of flags */ | ||
319 | if (flags != (flags & KEXEC_FILE_FLAGS)) | ||
320 | return -EINVAL; | ||
321 | |||
322 | image = NULL; | ||
323 | |||
324 | if (!mutex_trylock(&kexec_mutex)) | ||
325 | return -EBUSY; | ||
326 | |||
327 | dest_image = &kexec_image; | ||
328 | if (flags & KEXEC_FILE_ON_CRASH) | ||
329 | dest_image = &kexec_crash_image; | ||
330 | |||
331 | if (flags & KEXEC_FILE_UNLOAD) | ||
332 | goto exchange; | ||
333 | |||
334 | /* | ||
335 | * In the crash case, the new kernel is loaded into the reserved region, | ||
336 | * i.e. the same memory where an old crash kernel might already be loaded. | ||
337 | * Free any current crash dump kernel before we corrupt it. | ||
338 | */ | ||
339 | if (flags & KEXEC_FILE_ON_CRASH) | ||
340 | kimage_free(xchg(&kexec_crash_image, NULL)); | ||
341 | |||
342 | ret = kimage_file_alloc_init(&image, kernel_fd, initrd_fd, cmdline_ptr, | ||
343 | cmdline_len, flags); | ||
344 | if (ret) | ||
345 | goto out; | ||
346 | |||
347 | ret = machine_kexec_prepare(image); | ||
348 | if (ret) | ||
349 | goto out; | ||
350 | |||
351 | ret = kexec_calculate_store_digests(image); | ||
352 | if (ret) | ||
353 | goto out; | ||
354 | |||
355 | for (i = 0; i < image->nr_segments; i++) { | ||
356 | struct kexec_segment *ksegment; | ||
357 | |||
358 | ksegment = &image->segment[i]; | ||
359 | pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n", | ||
360 | i, ksegment->buf, ksegment->bufsz, ksegment->mem, | ||
361 | ksegment->memsz); | ||
362 | |||
363 | ret = kimage_load_segment(image, &image->segment[i]); | ||
364 | if (ret) | ||
365 | goto out; | ||
366 | } | ||
367 | |||
368 | kimage_terminate(image); | ||
369 | |||
370 | /* | ||
371 | * Free up any temporary buffers allocated which are not needed | ||
372 | * after image has been loaded | ||
373 | */ | ||
374 | kimage_file_post_load_cleanup(image); | ||
375 | exchange: | ||
376 | image = xchg(dest_image, image); | ||
377 | out: | ||
378 | mutex_unlock(&kexec_mutex); | ||
379 | kimage_free(image); | ||
380 | return ret; | ||
381 | } | ||
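For reference, a minimal userspace caller of the syscall defined above might look like this sketch. The file paths and the KEXEC_FILE_ON_CRASH flag are examples only; the caller needs CAP_SYS_BOOT, and SYS_kexec_file_load is only available once the installed kernel headers define __NR_kexec_file_load.

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <linux/kexec.h>

    int main(void)
    {
            const char cmdline[] = "console=ttyS0 maxcpus=1";
            int kfd = open("/boot/vmlinuz", O_RDONLY);        /* example path */
            int ifd = open("/boot/initrd.img", O_RDONLY);     /* example path */
            long ret;

            if (kfd < 0 || ifd < 0)
                    return 1;

            /* cmdline_len must include the trailing NUL; the kernel checks it. */
            ret = syscall(SYS_kexec_file_load, kfd, ifd,
                          sizeof(cmdline), cmdline,
                          (unsigned long)KEXEC_FILE_ON_CRASH);
            if (ret)
                    perror("kexec_file_load");
            return ret ? 1 : 0;
    }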
382 | |||
383 | static int locate_mem_hole_top_down(unsigned long start, unsigned long end, | ||
384 | struct kexec_buf *kbuf) | ||
385 | { | ||
386 | struct kimage *image = kbuf->image; | ||
387 | unsigned long temp_start, temp_end; | ||
388 | |||
389 | temp_end = min(end, kbuf->buf_max); | ||
390 | temp_start = temp_end - kbuf->memsz; | ||
391 | |||
392 | do { | ||
393 | /* align down start */ | ||
394 | temp_start = temp_start & (~(kbuf->buf_align - 1)); | ||
395 | |||
396 | if (temp_start < start || temp_start < kbuf->buf_min) | ||
397 | return 0; | ||
398 | |||
399 | temp_end = temp_start + kbuf->memsz - 1; | ||
400 | |||
401 | /* | ||
402 | * Make sure this does not conflict with any existing | ||
403 | * segments | ||
404 | */ | ||
405 | if (kimage_is_destination_range(image, temp_start, temp_end)) { | ||
406 | temp_start = temp_start - PAGE_SIZE; | ||
407 | continue; | ||
408 | } | ||
409 | |||
410 | /* We found a suitable memory range */ | ||
411 | break; | ||
412 | } while (1); | ||
413 | |||
414 | /* If we are here, we found a suitable memory range */ | ||
415 | kbuf->mem = temp_start; | ||
416 | |||
417 | /* Success, stop navigating through remaining System RAM ranges */ | ||
418 | return 1; | ||
419 | } | ||
420 | |||
421 | static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end, | ||
422 | struct kexec_buf *kbuf) | ||
423 | { | ||
424 | struct kimage *image = kbuf->image; | ||
425 | unsigned long temp_start, temp_end; | ||
426 | |||
427 | temp_start = max(start, kbuf->buf_min); | ||
428 | |||
429 | do { | ||
430 | temp_start = ALIGN(temp_start, kbuf->buf_align); | ||
431 | temp_end = temp_start + kbuf->memsz - 1; | ||
432 | |||
433 | if (temp_end > end || temp_end > kbuf->buf_max) | ||
434 | return 0; | ||
435 | /* | ||
436 | * Make sure this does not conflict with any existing | ||
437 | * segments | ||
438 | */ | ||
439 | if (kimage_is_destination_range(image, temp_start, temp_end)) { | ||
440 | temp_start = temp_start + PAGE_SIZE; | ||
441 | continue; | ||
442 | } | ||
443 | |||
444 | /* We found a suitable memory range */ | ||
445 | break; | ||
446 | } while (1); | ||
447 | |||
448 | /* If we are here, we found a suitable memory range */ | ||
449 | kbuf->mem = temp_start; | ||
450 | |||
451 | /* Success, stop navigating through remaining System RAM ranges */ | ||
452 | return 1; | ||
453 | } | ||
454 | |||
455 | static int locate_mem_hole_callback(u64 start, u64 end, void *arg) | ||
456 | { | ||
457 | struct kexec_buf *kbuf = (struct kexec_buf *)arg; | ||
458 | unsigned long sz = end - start + 1; | ||
459 | |||
460 | /* Returning 0 moves on to the next memory range */ | ||
461 | if (sz < kbuf->memsz) | ||
462 | return 0; | ||
463 | |||
464 | if (end < kbuf->buf_min || start > kbuf->buf_max) | ||
465 | return 0; | ||
466 | |||
467 | /* | ||
468 | * Allocate memory top-down within the RAM range; otherwise allocate | ||
469 | * bottom-up. | ||
470 | */ | ||
471 | if (kbuf->top_down) | ||
472 | return locate_mem_hole_top_down(start, end, kbuf); | ||
473 | return locate_mem_hole_bottom_up(start, end, kbuf); | ||
474 | } | ||
475 | |||
476 | /* | ||
477 | * Helper function for placing a buffer in a kexec segment. This assumes | ||
478 | * that kexec_mutex is held. | ||
479 | */ | ||
480 | int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz, | ||
481 | unsigned long memsz, unsigned long buf_align, | ||
482 | unsigned long buf_min, unsigned long buf_max, | ||
483 | bool top_down, unsigned long *load_addr) | ||
484 | { | ||
485 | |||
486 | struct kexec_segment *ksegment; | ||
487 | struct kexec_buf buf, *kbuf; | ||
488 | int ret; | ||
489 | |||
490 | /* Currently, adding a segment this way is allowed only in file mode */ | ||
491 | if (!image->file_mode) | ||
492 | return -EINVAL; | ||
493 | |||
494 | if (image->nr_segments >= KEXEC_SEGMENT_MAX) | ||
495 | return -EINVAL; | ||
496 | |||
497 | /* | ||
498 | * Make sure we are not trying to add a buffer after control pages | ||
499 | * have been allocated. All segments need to be placed before any | ||
500 | * control pages are allocated, since the control page allocation | ||
501 | * logic walks the list of segments to make sure there are | ||
502 | * no destination overlaps. | ||
503 | */ | ||
504 | if (!list_empty(&image->control_pages)) { | ||
505 | WARN_ON(1); | ||
506 | return -EINVAL; | ||
507 | } | ||
508 | |||
509 | memset(&buf, 0, sizeof(struct kexec_buf)); | ||
510 | kbuf = &buf; | ||
511 | kbuf->image = image; | ||
512 | kbuf->buffer = buffer; | ||
513 | kbuf->bufsz = bufsz; | ||
514 | |||
515 | kbuf->memsz = ALIGN(memsz, PAGE_SIZE); | ||
516 | kbuf->buf_align = max(buf_align, PAGE_SIZE); | ||
517 | kbuf->buf_min = buf_min; | ||
518 | kbuf->buf_max = buf_max; | ||
519 | kbuf->top_down = top_down; | ||
520 | |||
521 | /* Walk the RAM ranges and allocate a suitable range for the buffer */ | ||
522 | if (image->type == KEXEC_TYPE_CRASH) | ||
523 | ret = walk_iomem_res("Crash kernel", | ||
524 | IORESOURCE_MEM | IORESOURCE_BUSY, | ||
525 | crashk_res.start, crashk_res.end, kbuf, | ||
526 | locate_mem_hole_callback); | ||
527 | else | ||
528 | ret = walk_system_ram_res(0, -1, kbuf, | ||
529 | locate_mem_hole_callback); | ||
530 | if (ret != 1) { | ||
531 | /* A suitable memory range could not be found for buffer */ | ||
532 | return -EADDRNOTAVAIL; | ||
533 | } | ||
534 | |||
535 | /* Found a suitable memory range */ | ||
536 | ksegment = &image->segment[image->nr_segments]; | ||
537 | ksegment->kbuf = kbuf->buffer; | ||
538 | ksegment->bufsz = kbuf->bufsz; | ||
539 | ksegment->mem = kbuf->mem; | ||
540 | ksegment->memsz = kbuf->memsz; | ||
541 | image->nr_segments++; | ||
542 | *load_addr = ksegment->mem; | ||
543 | return 0; | ||
544 | } | ||
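A typical consumer of kexec_add_buffer() is an architecture image loader invoked through arch_kexec_kernel_image_load(). The sketch below is only illustrative (loosely modelled on the x86 bzImage loader); the alignment and minimum address values are arbitrary examples.

    /* Illustrative placement of the kernel and initrd buffers. */
    static int load_segments_sketch(struct kimage *image)
    {
            unsigned long kernel_load_addr, initrd_load_addr;
            int ret;

            ret = kexec_add_buffer(image, image->kernel_buf,
                                   image->kernel_buf_len, image->kernel_buf_len,
                                   16 * 1024 * 1024,       /* example alignment */
                                   0x100000, ULONG_MAX,    /* example min/max   */
                                   true /* top_down */, &kernel_load_addr);
            if (ret)
                    return ret;

            return kexec_add_buffer(image, image->initrd_buf,
                                    image->initrd_buf_len, image->initrd_buf_len,
                                    PAGE_SIZE, 0x100000, ULONG_MAX,
                                    true, &initrd_load_addr);
    }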
545 | |||
546 | /* Calculate and store the digest of segments */ | ||
547 | static int kexec_calculate_store_digests(struct kimage *image) | ||
548 | { | ||
549 | struct crypto_shash *tfm; | ||
550 | struct shash_desc *desc; | ||
551 | int ret = 0, i, j, zero_buf_sz, sha_region_sz; | ||
552 | size_t desc_size, nullsz; | ||
553 | char *digest; | ||
554 | void *zero_buf; | ||
555 | struct kexec_sha_region *sha_regions; | ||
556 | struct purgatory_info *pi = &image->purgatory_info; | ||
557 | |||
558 | zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT); | ||
559 | zero_buf_sz = PAGE_SIZE; | ||
560 | |||
561 | tfm = crypto_alloc_shash("sha256", 0, 0); | ||
562 | if (IS_ERR(tfm)) { | ||
563 | ret = PTR_ERR(tfm); | ||
564 | goto out; | ||
565 | } | ||
566 | |||
567 | desc_size = crypto_shash_descsize(tfm) + sizeof(*desc); | ||
568 | desc = kzalloc(desc_size, GFP_KERNEL); | ||
569 | if (!desc) { | ||
570 | ret = -ENOMEM; | ||
571 | goto out_free_tfm; | ||
572 | } | ||
573 | |||
574 | sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region); | ||
575 | sha_regions = vzalloc(sha_region_sz); | ||
576 | if (!sha_regions) | ||
577 | goto out_free_desc; | ||
578 | |||
579 | desc->tfm = tfm; | ||
580 | desc->flags = 0; | ||
581 | |||
582 | ret = crypto_shash_init(desc); | ||
583 | if (ret < 0) | ||
584 | goto out_free_sha_regions; | ||
585 | |||
586 | digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL); | ||
587 | if (!digest) { | ||
588 | ret = -ENOMEM; | ||
589 | goto out_free_sha_regions; | ||
590 | } | ||
591 | |||
592 | for (j = i = 0; i < image->nr_segments; i++) { | ||
593 | struct kexec_segment *ksegment; | ||
594 | |||
595 | ksegment = &image->segment[i]; | ||
596 | /* | ||
597 | * Skip purgatory as it will be modified once we put digest | ||
598 | * info in purgatory. | ||
599 | */ | ||
600 | if (ksegment->kbuf == pi->purgatory_buf) | ||
601 | continue; | ||
602 | |||
603 | ret = crypto_shash_update(desc, ksegment->kbuf, | ||
604 | ksegment->bufsz); | ||
605 | if (ret) | ||
606 | break; | ||
607 | |||
608 | /* | ||
609 | * Assume the rest of the buffer is filled with zeroes and | ||
610 | * update the digest accordingly. | ||
611 | */ | ||
612 | nullsz = ksegment->memsz - ksegment->bufsz; | ||
613 | while (nullsz) { | ||
614 | unsigned long bytes = nullsz; | ||
615 | |||
616 | if (bytes > zero_buf_sz) | ||
617 | bytes = zero_buf_sz; | ||
618 | ret = crypto_shash_update(desc, zero_buf, bytes); | ||
619 | if (ret) | ||
620 | break; | ||
621 | nullsz -= bytes; | ||
622 | } | ||
623 | |||
624 | if (ret) | ||
625 | break; | ||
626 | |||
627 | sha_regions[j].start = ksegment->mem; | ||
628 | sha_regions[j].len = ksegment->memsz; | ||
629 | j++; | ||
630 | } | ||
631 | |||
632 | if (!ret) { | ||
633 | ret = crypto_shash_final(desc, digest); | ||
634 | if (ret) | ||
635 | goto out_free_digest; | ||
636 | ret = kexec_purgatory_get_set_symbol(image, "sha_regions", | ||
637 | sha_regions, sha_region_sz, 0); | ||
638 | if (ret) | ||
639 | goto out_free_digest; | ||
640 | |||
641 | ret = kexec_purgatory_get_set_symbol(image, "sha256_digest", | ||
642 | digest, SHA256_DIGEST_SIZE, 0); | ||
643 | if (ret) | ||
644 | goto out_free_digest; | ||
645 | } | ||
646 | |||
647 | out_free_digest: | ||
648 | kfree(digest); | ||
649 | out_free_sha_regions: | ||
650 | vfree(sha_regions); | ||
651 | out_free_desc: | ||
652 | kfree(desc); | ||
653 | out_free_tfm: | ||
654 | kfree(tfm); | ||
655 | out: | ||
656 | return ret; | ||
657 | } | ||
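The sha_regions and sha256_digest symbols stored above are consumed inside purgatory before it jumps to the new kernel. The following is only a conceptual sketch of that consumer side (in the spirit of arch/x86/purgatory); the sha256_* helper signatures and the extern declarations are assumptions, not the literal purgatory code.

    /* Assumed to be provided by the purgatory blob and patched by
     * kexec_purgatory_get_set_symbol() above. */
    extern struct kexec_sha_region sha_regions[KEXEC_SEGMENT_MAX];
    extern u8 sha256_digest[SHA256_DIGEST_SIZE];

    static int verify_sha256_digest_sketch(void)
    {
            struct sha256_state ss;
            u8 digest[SHA256_DIGEST_SIZE];
            size_t i;

            sha256_init(&ss);
            for (i = 0; i < KEXEC_SEGMENT_MAX; i++) {
                    if (!sha_regions[i].len)
                            continue;
                    sha256_update(&ss, (void *)sha_regions[i].start,
                                  sha_regions[i].len);
            }
            sha256_final(&ss, digest);

            /* Refuse to boot the loaded segments if anything was modified. */
            return memcmp(digest, sha256_digest, sizeof(digest)) ? -1 : 0;
    }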
658 | |||
659 | /* Actually load purgatory. A lot of code is taken from kexec-tools */ | ||
660 | static int __kexec_load_purgatory(struct kimage *image, unsigned long min, | ||
661 | unsigned long max, int top_down) | ||
662 | { | ||
663 | struct purgatory_info *pi = &image->purgatory_info; | ||
664 | unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad; | ||
665 | unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset; | ||
666 | unsigned char *buf_addr, *src; | ||
667 | int i, ret = 0, entry_sidx = -1; | ||
668 | const Elf_Shdr *sechdrs_c; | ||
669 | Elf_Shdr *sechdrs = NULL; | ||
670 | void *purgatory_buf = NULL; | ||
671 | |||
672 | /* | ||
673 | * sechdrs_c points to the section headers in purgatory, which are | ||
674 | * read-only. No modifications are allowed. | ||
675 | */ | ||
676 | sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff; | ||
677 | |||
678 | /* | ||
679 | * We cannot modify sechdrs_c[] or its fields; it is read-only. | ||
680 | * Copy it over to a local copy where we can store some temporary | ||
681 | * data and free it at the end. We need to modify the ->sh_addr and | ||
682 | * ->sh_offset fields to keep track of the permanent and temporary | ||
683 | * locations of sections. | ||
684 | */ | ||
685 | sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr)); | ||
686 | if (!sechdrs) | ||
687 | return -ENOMEM; | ||
688 | |||
689 | memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr)); | ||
690 | |||
691 | /* | ||
692 | * There are multiple copies of the sections. The first copy is the one | ||
693 | * embedded in the kernel's read-only section. Some of these sections | ||
694 | * will be copied to a temporary buffer and relocated, and these | ||
695 | * sections will finally be copied to their destination at | ||
696 | * segment load time. | ||
697 | * | ||
698 | * Use ->sh_offset to reflect section address in memory. It will | ||
699 | * point to original read only copy if section is not allocatable. | ||
700 | * Otherwise it will point to temporary copy which will be relocated. | ||
701 | * | ||
702 | * Use ->sh_addr to contain final address of the section where it | ||
703 | * will go during execution time. | ||
704 | */ | ||
705 | for (i = 0; i < pi->ehdr->e_shnum; i++) { | ||
706 | if (sechdrs[i].sh_type == SHT_NOBITS) | ||
707 | continue; | ||
708 | |||
709 | sechdrs[i].sh_offset = (unsigned long)pi->ehdr + | ||
710 | sechdrs[i].sh_offset; | ||
711 | } | ||
712 | |||
713 | /* | ||
714 | * Identify entry point section and make entry relative to section | ||
715 | * start. | ||
716 | */ | ||
717 | entry = pi->ehdr->e_entry; | ||
718 | for (i = 0; i < pi->ehdr->e_shnum; i++) { | ||
719 | if (!(sechdrs[i].sh_flags & SHF_ALLOC)) | ||
720 | continue; | ||
721 | |||
722 | if (!(sechdrs[i].sh_flags & SHF_EXECINSTR)) | ||
723 | continue; | ||
724 | |||
725 | /* Make entry section relative */ | ||
726 | if (sechdrs[i].sh_addr <= pi->ehdr->e_entry && | ||
727 | ((sechdrs[i].sh_addr + sechdrs[i].sh_size) > | ||
728 | pi->ehdr->e_entry)) { | ||
729 | entry_sidx = i; | ||
730 | entry -= sechdrs[i].sh_addr; | ||
731 | break; | ||
732 | } | ||
733 | } | ||
734 | |||
735 | /* Determine how much memory is needed to load relocatable object. */ | ||
736 | buf_align = 1; | ||
737 | bss_align = 1; | ||
738 | buf_sz = 0; | ||
739 | bss_sz = 0; | ||
740 | |||
741 | for (i = 0; i < pi->ehdr->e_shnum; i++) { | ||
742 | if (!(sechdrs[i].sh_flags & SHF_ALLOC)) | ||
743 | continue; | ||
744 | |||
745 | align = sechdrs[i].sh_addralign; | ||
746 | if (sechdrs[i].sh_type != SHT_NOBITS) { | ||
747 | if (buf_align < align) | ||
748 | buf_align = align; | ||
749 | buf_sz = ALIGN(buf_sz, align); | ||
750 | buf_sz += sechdrs[i].sh_size; | ||
751 | } else { | ||
752 | /* bss section */ | ||
753 | if (bss_align < align) | ||
754 | bss_align = align; | ||
755 | bss_sz = ALIGN(bss_sz, align); | ||
756 | bss_sz += sechdrs[i].sh_size; | ||
757 | } | ||
758 | } | ||
759 | |||
760 | /* Determine the bss padding required to align bss properly */ | ||
761 | bss_pad = 0; | ||
762 | if (buf_sz & (bss_align - 1)) | ||
763 | bss_pad = bss_align - (buf_sz & (bss_align - 1)); | ||
764 | |||
765 | memsz = buf_sz + bss_pad + bss_sz; | ||
766 | |||
767 | /* Allocate buffer for purgatory */ | ||
768 | purgatory_buf = vzalloc(buf_sz); | ||
769 | if (!purgatory_buf) { | ||
770 | ret = -ENOMEM; | ||
771 | goto out; | ||
772 | } | ||
773 | |||
774 | if (buf_align < bss_align) | ||
775 | buf_align = bss_align; | ||
776 | |||
777 | /* Add buffer to segment list */ | ||
778 | ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz, | ||
779 | buf_align, min, max, top_down, | ||
780 | &pi->purgatory_load_addr); | ||
781 | if (ret) | ||
782 | goto out; | ||
783 | |||
784 | /* Load SHF_ALLOC sections */ | ||
785 | buf_addr = purgatory_buf; | ||
786 | load_addr = curr_load_addr = pi->purgatory_load_addr; | ||
787 | bss_addr = load_addr + buf_sz + bss_pad; | ||
788 | |||
789 | for (i = 0; i < pi->ehdr->e_shnum; i++) { | ||
790 | if (!(sechdrs[i].sh_flags & SHF_ALLOC)) | ||
791 | continue; | ||
792 | |||
793 | align = sechdrs[i].sh_addralign; | ||
794 | if (sechdrs[i].sh_type != SHT_NOBITS) { | ||
795 | curr_load_addr = ALIGN(curr_load_addr, align); | ||
796 | offset = curr_load_addr - load_addr; | ||
797 | /* We already modified ->sh_offset to keep the source address */ | ||
798 | src = (char *) sechdrs[i].sh_offset; | ||
799 | memcpy(buf_addr + offset, src, sechdrs[i].sh_size); | ||
800 | |||
801 | /* Store load address and source address of section */ | ||
802 | sechdrs[i].sh_addr = curr_load_addr; | ||
803 | |||
804 | /* | ||
805 | * This section got copied to temporary buffer. Update | ||
806 | * ->sh_offset accordingly. | ||
807 | */ | ||
808 | sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset); | ||
809 | |||
810 | /* Advance to the next address */ | ||
811 | curr_load_addr += sechdrs[i].sh_size; | ||
812 | } else { | ||
813 | bss_addr = ALIGN(bss_addr, align); | ||
814 | sechdrs[i].sh_addr = bss_addr; | ||
815 | bss_addr += sechdrs[i].sh_size; | ||
816 | } | ||
817 | } | ||
818 | |||
819 | /* Update entry point based on load address of text section */ | ||
820 | if (entry_sidx >= 0) | ||
821 | entry += sechdrs[entry_sidx].sh_addr; | ||
822 | |||
823 | /* Make kernel jump to purgatory after shutdown */ | ||
824 | image->start = entry; | ||
825 | |||
826 | /* Used later to get/set symbol values */ | ||
827 | pi->sechdrs = sechdrs; | ||
828 | |||
829 | /* | ||
830 | * Used later to identify which buffer is purgatory and skip it | ||
831 | * when checksumming. | ||
832 | */ | ||
833 | pi->purgatory_buf = purgatory_buf; | ||
834 | return ret; | ||
835 | out: | ||
836 | vfree(sechdrs); | ||
837 | vfree(purgatory_buf); | ||
838 | return ret; | ||
839 | } | ||
840 | |||
841 | static int kexec_apply_relocations(struct kimage *image) | ||
842 | { | ||
843 | int i, ret; | ||
844 | struct purgatory_info *pi = &image->purgatory_info; | ||
845 | Elf_Shdr *sechdrs = pi->sechdrs; | ||
846 | |||
847 | /* Apply relocations */ | ||
848 | for (i = 0; i < pi->ehdr->e_shnum; i++) { | ||
849 | Elf_Shdr *section, *symtab; | ||
850 | |||
851 | if (sechdrs[i].sh_type != SHT_RELA && | ||
852 | sechdrs[i].sh_type != SHT_REL) | ||
853 | continue; | ||
854 | |||
855 | /* | ||
856 | * For a section of type SHT_RELA/SHT_REL, | ||
857 | * ->sh_link contains the section header index of the associated | ||
858 | * symbol table, and ->sh_info contains the section header | ||
859 | * index of the section to which the relocations apply. | ||
860 | */ | ||
861 | if (sechdrs[i].sh_info >= pi->ehdr->e_shnum || | ||
862 | sechdrs[i].sh_link >= pi->ehdr->e_shnum) | ||
863 | return -ENOEXEC; | ||
864 | |||
865 | section = &sechdrs[sechdrs[i].sh_info]; | ||
866 | symtab = &sechdrs[sechdrs[i].sh_link]; | ||
867 | |||
868 | if (!(section->sh_flags & SHF_ALLOC)) | ||
869 | continue; | ||
870 | |||
871 | /* | ||
872 | * symtab->sh_link contains the section header index of the associated | ||
873 | * string table. | ||
874 | */ | ||
875 | if (symtab->sh_link >= pi->ehdr->e_shnum) | ||
876 | /* Invalid section number? */ | ||
877 | continue; | ||
878 | |||
879 | /* | ||
880 | * The respective architecture needs to provide support for applying | ||
881 | * relocations of type SHT_RELA/SHT_REL. | ||
882 | */ | ||
883 | if (sechdrs[i].sh_type == SHT_RELA) | ||
884 | ret = arch_kexec_apply_relocations_add(pi->ehdr, | ||
885 | sechdrs, i); | ||
886 | else if (sechdrs[i].sh_type == SHT_REL) | ||
887 | ret = arch_kexec_apply_relocations(pi->ehdr, | ||
888 | sechdrs, i); | ||
889 | if (ret) | ||
890 | return ret; | ||
891 | } | ||
892 | |||
893 | return 0; | ||
894 | } | ||
895 | |||
896 | /* Load relocatable purgatory object and relocate it appropriately */ | ||
897 | int kexec_load_purgatory(struct kimage *image, unsigned long min, | ||
898 | unsigned long max, int top_down, | ||
899 | unsigned long *load_addr) | ||
900 | { | ||
901 | struct purgatory_info *pi = &image->purgatory_info; | ||
902 | int ret; | ||
903 | |||
904 | if (kexec_purgatory_size <= 0) | ||
905 | return -EINVAL; | ||
906 | |||
907 | if (kexec_purgatory_size < sizeof(Elf_Ehdr)) | ||
908 | return -ENOEXEC; | ||
909 | |||
910 | pi->ehdr = (Elf_Ehdr *)kexec_purgatory; | ||
911 | |||
912 | if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0 | ||
913 | || pi->ehdr->e_type != ET_REL | ||
914 | || !elf_check_arch(pi->ehdr) | ||
915 | || pi->ehdr->e_shentsize != sizeof(Elf_Shdr)) | ||
916 | return -ENOEXEC; | ||
917 | |||
918 | if (pi->ehdr->e_shoff >= kexec_purgatory_size | ||
919 | || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) > | ||
920 | kexec_purgatory_size - pi->ehdr->e_shoff)) | ||
921 | return -ENOEXEC; | ||
922 | |||
923 | ret = __kexec_load_purgatory(image, min, max, top_down); | ||
924 | if (ret) | ||
925 | return ret; | ||
926 | |||
927 | ret = kexec_apply_relocations(image); | ||
928 | if (ret) | ||
929 | goto out; | ||
930 | |||
931 | *load_addr = pi->purgatory_load_addr; | ||
932 | return 0; | ||
933 | out: | ||
934 | vfree(pi->sechdrs); | ||
935 | vfree(pi->purgatory_buf); | ||
936 | return ret; | ||
937 | } | ||
938 | |||
939 | static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi, | ||
940 | const char *name) | ||
941 | { | ||
942 | Elf_Sym *syms; | ||
943 | Elf_Shdr *sechdrs; | ||
944 | Elf_Ehdr *ehdr; | ||
945 | int i, k; | ||
946 | const char *strtab; | ||
947 | |||
948 | if (!pi->sechdrs || !pi->ehdr) | ||
949 | return NULL; | ||
950 | |||
951 | sechdrs = pi->sechdrs; | ||
952 | ehdr = pi->ehdr; | ||
953 | |||
954 | for (i = 0; i < ehdr->e_shnum; i++) { | ||
955 | if (sechdrs[i].sh_type != SHT_SYMTAB) | ||
956 | continue; | ||
957 | |||
958 | if (sechdrs[i].sh_link >= ehdr->e_shnum) | ||
959 | /* Invalid strtab section number */ | ||
960 | continue; | ||
961 | strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset; | ||
962 | syms = (Elf_Sym *)sechdrs[i].sh_offset; | ||
963 | |||
964 | /* Go through symbols for a match */ | ||
965 | for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) { | ||
966 | if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL) | ||
967 | continue; | ||
968 | |||
969 | if (strcmp(strtab + syms[k].st_name, name) != 0) | ||
970 | continue; | ||
971 | |||
972 | if (syms[k].st_shndx == SHN_UNDEF || | ||
973 | syms[k].st_shndx >= ehdr->e_shnum) { | ||
974 | pr_debug("Symbol: %s has bad section index %d.\n", | ||
975 | name, syms[k].st_shndx); | ||
976 | return NULL; | ||
977 | } | ||
978 | |||
979 | /* Found the symbol we are looking for */ | ||
980 | return &syms[k]; | ||
981 | } | ||
982 | } | ||
983 | |||
984 | return NULL; | ||
985 | } | ||
986 | |||
987 | void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name) | ||
988 | { | ||
989 | struct purgatory_info *pi = &image->purgatory_info; | ||
990 | Elf_Sym *sym; | ||
991 | Elf_Shdr *sechdr; | ||
992 | |||
993 | sym = kexec_purgatory_find_symbol(pi, name); | ||
994 | if (!sym) | ||
995 | return ERR_PTR(-EINVAL); | ||
996 | |||
997 | sechdr = &pi->sechdrs[sym->st_shndx]; | ||
998 | |||
999 | /* | ||
1000 | * Returns the address where symbol will finally be loaded after | ||
1001 | * kexec_load_segment() | ||
1002 | */ | ||
1003 | return (void *)(sechdr->sh_addr + sym->st_value); | ||
1004 | } | ||
1005 | |||
1006 | /* | ||
1007 | * Get or set the value of a symbol. If "get_value" is true, the symbol value | ||
1008 | * is returned in buf; otherwise the symbol value is set from the value in buf. | ||
1009 | */ | ||
1010 | int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, | ||
1011 | void *buf, unsigned int size, bool get_value) | ||
1012 | { | ||
1013 | Elf_Sym *sym; | ||
1014 | Elf_Shdr *sechdrs; | ||
1015 | struct purgatory_info *pi = &image->purgatory_info; | ||
1016 | char *sym_buf; | ||
1017 | |||
1018 | sym = kexec_purgatory_find_symbol(pi, name); | ||
1019 | if (!sym) | ||
1020 | return -EINVAL; | ||
1021 | |||
1022 | if (sym->st_size != size) { | ||
1023 | pr_err("symbol %s size mismatch: expected %lu actual %u\n", | ||
1024 | name, (unsigned long)sym->st_size, size); | ||
1025 | return -EINVAL; | ||
1026 | } | ||
1027 | |||
1028 | sechdrs = pi->sechdrs; | ||
1029 | |||
1030 | if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) { | ||
1031 | pr_err("symbol %s is in a bss section. Cannot %s\n", name, | ||
1032 | get_value ? "get" : "set"); | ||
1033 | return -EINVAL; | ||
1034 | } | ||
1035 | |||
1036 | sym_buf = (unsigned char *)sechdrs[sym->st_shndx].sh_offset + | ||
1037 | sym->st_value; | ||
1038 | |||
1039 | if (get_value) | ||
1040 | memcpy((void *)buf, sym_buf, size); | ||
1041 | else | ||
1042 | memcpy((void *)sym_buf, buf, size); | ||
1043 | |||
1044 | return 0; | ||
1045 | } | ||
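Putting the purgatory pieces together, an architecture loader would typically load purgatory and then patch values into it by name. The sketch below is illustrative only; the "stack" symbol name and the value passed in are placeholders, and the real x86 loader passes a different set of symbols.

    /* Illustrative wiring of a value into the purgatory blob. */
    static int setup_purgatory_sketch(struct kimage *image,
                                      unsigned long min, unsigned long max)
    {
            unsigned long purgatory_load_addr;
            unsigned long stack_addr = 0;   /* hypothetical value to pass in */
            int ret;

            ret = kexec_load_purgatory(image, min, max, 1 /* top_down */,
                                       &purgatory_load_addr);
            if (ret)
                    return ret;

            /* get_value == 0: copy stack_addr into purgatory's "stack" symbol */
            return kexec_purgatory_get_set_symbol(image, "stack", &stack_addr,
                                                  sizeof(stack_addr), 0);
    }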
diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h new file mode 100644 index 000000000000..e4392a698ad4 --- /dev/null +++ b/kernel/kexec_internal.h | |||
@@ -0,0 +1,22 @@ | |||
1 | #ifndef LINUX_KEXEC_INTERNAL_H | ||
2 | #define LINUX_KEXEC_INTERNAL_H | ||
3 | |||
4 | #include <linux/kexec.h> | ||
5 | |||
6 | struct kimage *do_kimage_alloc_init(void); | ||
7 | int sanity_check_segment_list(struct kimage *image); | ||
8 | void kimage_free_page_list(struct list_head *list); | ||
9 | void kimage_free(struct kimage *image); | ||
10 | int kimage_load_segment(struct kimage *image, struct kexec_segment *segment); | ||
11 | void kimage_terminate(struct kimage *image); | ||
12 | int kimage_is_destination_range(struct kimage *image, | ||
13 | unsigned long start, unsigned long end); | ||
14 | |||
15 | extern struct mutex kexec_mutex; | ||
16 | |||
17 | #ifdef CONFIG_KEXEC_FILE | ||
18 | void kimage_file_post_load_cleanup(struct kimage *image); | ||
19 | #else /* CONFIG_KEXEC_FILE */ | ||
20 | static inline void kimage_file_post_load_cleanup(struct kimage *image) { } | ||
21 | #endif /* CONFIG_KEXEC_FILE */ | ||
22 | #endif /* LINUX_KEXEC_INTERNAL_H */ | ||
diff --git a/kernel/kmod.c b/kernel/kmod.c index 2777f40a9c7b..da98d0593de2 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c | |||
@@ -45,8 +45,6 @@ | |||
45 | 45 | ||
46 | extern int max_threads; | 46 | extern int max_threads; |
47 | 47 | ||
48 | static struct workqueue_struct *khelper_wq; | ||
49 | |||
50 | #define CAP_BSET (void *)1 | 48 | #define CAP_BSET (void *)1 |
51 | #define CAP_PI (void *)2 | 49 | #define CAP_PI (void *)2 |
52 | 50 | ||
@@ -114,10 +112,11 @@ out: | |||
114 | * @...: arguments as specified in the format string | 112 | * @...: arguments as specified in the format string |
115 | * | 113 | * |
116 | * Load a module using the user mode module loader. The function returns | 114 | * Load a module using the user mode module loader. The function returns |
117 | * zero on success or a negative errno code on failure. Note that a | 115 | * zero on success or a negative errno code or positive exit code from |
118 | * successful module load does not mean the module did not then unload | 116 | * "modprobe" on failure. Note that a successful module load does not mean |
119 | * and exit on an error of its own. Callers must check that the service | 117 | * the module did not then unload and exit on an error of its own. Callers |
120 | * they requested is now available not blindly invoke it. | 118 | * must check that the service they requested is now available not blindly |
119 | * invoke it. | ||
121 | * | 120 | * |
122 | * If module auto-loading support is disabled then this function | 121 | * If module auto-loading support is disabled then this function |
123 | * becomes a no-operation. | 122 | * becomes a no-operation. |
@@ -213,7 +212,7 @@ static void umh_complete(struct subprocess_info *sub_info) | |||
213 | /* | 212 | /* |
214 | * This is the task which runs the usermode application | 213 | * This is the task which runs the usermode application |
215 | */ | 214 | */ |
216 | static int ____call_usermodehelper(void *data) | 215 | static int call_usermodehelper_exec_async(void *data) |
217 | { | 216 | { |
218 | struct subprocess_info *sub_info = data; | 217 | struct subprocess_info *sub_info = data; |
219 | struct cred *new; | 218 | struct cred *new; |
@@ -223,12 +222,9 @@ static int ____call_usermodehelper(void *data) | |||
223 | flush_signal_handlers(current, 1); | 222 | flush_signal_handlers(current, 1); |
224 | spin_unlock_irq(¤t->sighand->siglock); | 223 | spin_unlock_irq(¤t->sighand->siglock); |
225 | 224 | ||
226 | /* We can run anywhere, unlike our parent keventd(). */ | ||
227 | set_cpus_allowed_ptr(current, cpu_all_mask); | ||
228 | |||
229 | /* | 225 | /* |
230 | * Our parent is keventd, which runs with elevated scheduling priority. | 226 | * Our parent (unbound workqueue) runs with elevated scheduling |
231 | * Avoid propagating that into the userspace child. | 227 | * priority. Avoid propagating that into the userspace child. |
232 | */ | 228 | */ |
233 | set_user_nice(current, 0); | 229 | set_user_nice(current, 0); |
234 | 230 | ||
@@ -258,7 +254,10 @@ static int ____call_usermodehelper(void *data) | |||
258 | (const char __user *const __user *)sub_info->envp); | 254 | (const char __user *const __user *)sub_info->envp); |
259 | out: | 255 | out: |
260 | sub_info->retval = retval; | 256 | sub_info->retval = retval; |
261 | /* wait_for_helper() will call umh_complete if UHM_WAIT_PROC. */ | 257 | /* |
258 | * call_usermodehelper_exec_sync() will call umh_complete | ||
259 | * if UMH_WAIT_PROC. | ||
260 | */ | ||
262 | if (!(sub_info->wait & UMH_WAIT_PROC)) | 261 | if (!(sub_info->wait & UMH_WAIT_PROC)) |
263 | umh_complete(sub_info); | 262 | umh_complete(sub_info); |
264 | if (!retval) | 263 | if (!retval) |
@@ -266,15 +265,14 @@ out: | |||
266 | do_exit(0); | 265 | do_exit(0); |
267 | } | 266 | } |
268 | 267 | ||
269 | /* Keventd can't block, but this (a child) can. */ | 268 | /* Handles UMH_WAIT_PROC. */ |
270 | static int wait_for_helper(void *data) | 269 | static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info) |
271 | { | 270 | { |
272 | struct subprocess_info *sub_info = data; | ||
273 | pid_t pid; | 271 | pid_t pid; |
274 | 272 | ||
275 | /* If SIGCLD is ignored sys_wait4 won't populate the status. */ | 273 | /* If SIGCLD is ignored sys_wait4 won't populate the status. */ |
276 | kernel_sigaction(SIGCHLD, SIG_DFL); | 274 | kernel_sigaction(SIGCHLD, SIG_DFL); |
277 | pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD); | 275 | pid = kernel_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD); |
278 | if (pid < 0) { | 276 | if (pid < 0) { |
279 | sub_info->retval = pid; | 277 | sub_info->retval = pid; |
280 | } else { | 278 | } else { |
@@ -282,44 +280,60 @@ static int wait_for_helper(void *data) | |||
282 | /* | 280 | /* |
283 | * Normally it is bogus to call wait4() from in-kernel because | 281 | * Normally it is bogus to call wait4() from in-kernel because |
284 | * wait4() wants to write the exit code to a userspace address. | 282 | * wait4() wants to write the exit code to a userspace address. |
285 | * But wait_for_helper() always runs as keventd, and put_user() | 283 | * But call_usermodehelper_exec_sync() always runs as kernel |
286 | * to a kernel address works OK for kernel threads, due to their | 284 | * thread (workqueue) and put_user() to a kernel address works |
287 | * having an mm_segment_t which spans the entire address space. | 285 | * OK for kernel threads, due to their having an mm_segment_t |
286 | * which spans the entire address space. | ||
288 | * | 287 | * |
289 | * Thus the __user pointer cast is valid here. | 288 | * Thus the __user pointer cast is valid here. |
290 | */ | 289 | */ |
291 | sys_wait4(pid, (int __user *)&ret, 0, NULL); | 290 | sys_wait4(pid, (int __user *)&ret, 0, NULL); |
292 | 291 | ||
293 | /* | 292 | /* |
294 | * If ret is 0, either ____call_usermodehelper failed and the | 293 | * If ret is 0, either call_usermodehelper_exec_async failed and |
295 | * real error code is already in sub_info->retval or | 294 | * the real error code is already in sub_info->retval or |
296 | * sub_info->retval is 0 anyway, so don't mess with it then. | 295 | * sub_info->retval is 0 anyway, so don't mess with it then. |
297 | */ | 296 | */ |
298 | if (ret) | 297 | if (ret) |
299 | sub_info->retval = ret; | 298 | sub_info->retval = ret; |
300 | } | 299 | } |
301 | 300 | ||
301 | /* Restore default kernel sig handler */ | ||
302 | kernel_sigaction(SIGCHLD, SIG_IGN); | ||
303 | |||
302 | umh_complete(sub_info); | 304 | umh_complete(sub_info); |
303 | do_exit(0); | ||
304 | } | 305 | } |
305 | 306 | ||
306 | /* This is run by khelper thread */ | 307 | /* |
307 | static void __call_usermodehelper(struct work_struct *work) | 308 | * We need to create the usermodehelper kernel thread from a task that is affine |
309 | * to an optimized set of CPUs (or nohz housekeeping ones) such that they | ||
310 | * inherit the widest affinity irrespective of call_usermodehelper() callers with | ||
311 | * possibly reduced affinity (eg: per-cpu workqueues). We don't want | ||
312 | * usermodehelper targets to contend for a busy CPU. | ||
313 | * | ||
314 | * Unbound workqueues provide such wide affinity and allow to block on | ||
315 | * UMH_WAIT_PROC requests without blocking pending request (up to some limit). | ||
316 | * | ||
317 | * Besides, workqueues provide the privilege level that the caller might not have | ||
318 | * to perform the usermodehelper request. | ||
319 | * | ||
320 | */ | ||
321 | static void call_usermodehelper_exec_work(struct work_struct *work) | ||
308 | { | 322 | { |
309 | struct subprocess_info *sub_info = | 323 | struct subprocess_info *sub_info = |
310 | container_of(work, struct subprocess_info, work); | 324 | container_of(work, struct subprocess_info, work); |
311 | pid_t pid; | ||
312 | 325 | ||
313 | if (sub_info->wait & UMH_WAIT_PROC) | 326 | if (sub_info->wait & UMH_WAIT_PROC) { |
314 | pid = kernel_thread(wait_for_helper, sub_info, | 327 | call_usermodehelper_exec_sync(sub_info); |
315 | CLONE_FS | CLONE_FILES | SIGCHLD); | 328 | } else { |
316 | else | 329 | pid_t pid; |
317 | pid = kernel_thread(____call_usermodehelper, sub_info, | ||
318 | SIGCHLD); | ||
319 | 330 | ||
320 | if (pid < 0) { | 331 | pid = kernel_thread(call_usermodehelper_exec_async, sub_info, |
321 | sub_info->retval = pid; | 332 | SIGCHLD); |
322 | umh_complete(sub_info); | 333 | if (pid < 0) { |
334 | sub_info->retval = pid; | ||
335 | umh_complete(sub_info); | ||
336 | } | ||
323 | } | 337 | } |
324 | } | 338 | } |
325 | 339 | ||
@@ -509,7 +523,7 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, | |||
509 | if (!sub_info) | 523 | if (!sub_info) |
510 | goto out; | 524 | goto out; |
511 | 525 | ||
512 | INIT_WORK(&sub_info->work, __call_usermodehelper); | 526 | INIT_WORK(&sub_info->work, call_usermodehelper_exec_work); |
513 | sub_info->path = path; | 527 | sub_info->path = path; |
514 | sub_info->argv = argv; | 528 | sub_info->argv = argv; |
515 | sub_info->envp = envp; | 529 | sub_info->envp = envp; |
@@ -531,8 +545,8 @@ EXPORT_SYMBOL(call_usermodehelper_setup); | |||
531 | * from interrupt context. | 545 | * from interrupt context. |
532 | * | 546 | * |
533 | * Runs a user-space application. The application is started | 547 | * Runs a user-space application. The application is started |
534 | * asynchronously if wait is not set, and runs as a child of keventd. | 548 | * asynchronously if wait is not set, and runs as a child of system workqueues. |
535 | * (ie. it runs with full root capabilities). | 549 | * (ie. it runs with full root capabilities and optimized affinity). |
536 | */ | 550 | */ |
537 | int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) | 551 | int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) |
538 | { | 552 | { |
@@ -544,7 +558,7 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) | |||
544 | return -EINVAL; | 558 | return -EINVAL; |
545 | } | 559 | } |
546 | helper_lock(); | 560 | helper_lock(); |
547 | if (!khelper_wq || usermodehelper_disabled) { | 561 | if (usermodehelper_disabled) { |
548 | retval = -EBUSY; | 562 | retval = -EBUSY; |
549 | goto out; | 563 | goto out; |
550 | } | 564 | } |
@@ -556,7 +570,7 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) | |||
556 | sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done; | 570 | sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done; |
557 | sub_info->wait = wait; | 571 | sub_info->wait = wait; |
558 | 572 | ||
559 | queue_work(khelper_wq, &sub_info->work); | 573 | queue_work(system_unbound_wq, &sub_info->work); |
560 | if (wait == UMH_NO_WAIT) /* task has freed sub_info */ | 574 | if (wait == UMH_NO_WAIT) /* task has freed sub_info */ |
561 | goto unlock; | 575 | goto unlock; |
562 | 576 | ||
@@ -686,9 +700,3 @@ struct ctl_table usermodehelper_table[] = { | |||
686 | }, | 700 | }, |
687 | { } | 701 | { } |
688 | }; | 702 | }; |
689 | |||
690 | void __init usermodehelper_init(void) | ||
691 | { | ||
692 | khelper_wq = create_singlethread_workqueue("khelper"); | ||
693 | BUG_ON(!khelper_wq); | ||
694 | } | ||
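The kmod.c hunks drop the dedicated khelper singlethread workqueue: usermodehelper work items now go to system_unbound_wq, the workers are renamed to call_usermodehelper_exec_work()/_async()/_sync(), and usermodehelper_init() disappears. Callers are unaffected; a minimal sketch (not from this patch) of driving a helper from kernel code:

#include <linux/kmod.h>
#include <linux/module.h>

/* Minimal sketch, not from this patch: run /bin/true as a usermode helper
 * and wait for it.  After the change above the work item is queued on
 * system_unbound_wq rather than the removed khelper_wq, but the calling
 * convention is unchanged. */
static int __init umh_demo_init(void)
{
	char *argv[] = { "/bin/true", NULL };
	char *envp[] = { "HOME=/", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };
	int ret;

	/* UMH_WAIT_PROC: sleep until the helper process has exited;
	 * ret is a negative errno or the wait4()-style exit status. */
	ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
	pr_info("umh_demo: helper returned %d\n", ret);
	return 0;
}

static void __exit umh_demo_exit(void)
{
}

module_init(umh_demo_init);
module_exit(umh_demo_exit);
MODULE_LICENSE("GPL");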
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index c90e417bb963..d10ab6b9b5e0 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
@@ -1332,7 +1332,7 @@ bool __weak arch_within_kprobe_blacklist(unsigned long addr) | |||
1332 | addr < (unsigned long)__kprobes_text_end; | 1332 | addr < (unsigned long)__kprobes_text_end; |
1333 | } | 1333 | } |
1334 | 1334 | ||
1335 | static bool within_kprobe_blacklist(unsigned long addr) | 1335 | bool within_kprobe_blacklist(unsigned long addr) |
1336 | { | 1336 | { |
1337 | struct kprobe_blacklist_entry *ent; | 1337 | struct kprobe_blacklist_entry *ent; |
1338 | 1338 | ||
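within_kprobe_blacklist() loses its static qualifier so other built-in code can query the blacklist directly. Module authors keep using register_kprobe(), which already performs this check internally; a minimal sketch (not from this patch, probing "_do_fork" purely as an example symbol):

#include <linux/kprobes.h>
#include <linux/module.h>

/* Minimal sketch, not from this patch.  register_kprobe() consults
 * within_kprobe_blacklist() internally and returns -EINVAL for addresses
 * that may not be probed, so callers only need to check its return value. */
static int demo_pre(struct kprobe *p, struct pt_regs *regs)
{
	return 0;	/* continue with the probed instruction */
}

static struct kprobe demo_kp = {
	.symbol_name	= "_do_fork",
	.pre_handler	= demo_pre,
};

static int __init kp_demo_init(void)
{
	int ret = register_kprobe(&demo_kp);

	if (ret)
		pr_err("kp_demo: register_kprobe failed: %d\n", ret);
	return ret;
}

static void __exit kp_demo_exit(void)
{
	unregister_kprobe(&demo_kp);
}

module_init(kp_demo_init);
module_exit(kp_demo_exit);
MODULE_LICENSE("GPL");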
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 6683ccef9fff..e83b26464061 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c | |||
@@ -90,7 +90,7 @@ static ssize_t profiling_store(struct kobject *kobj, | |||
90 | KERNEL_ATTR_RW(profiling); | 90 | KERNEL_ATTR_RW(profiling); |
91 | #endif | 91 | #endif |
92 | 92 | ||
93 | #ifdef CONFIG_KEXEC | 93 | #ifdef CONFIG_KEXEC_CORE |
94 | static ssize_t kexec_loaded_show(struct kobject *kobj, | 94 | static ssize_t kexec_loaded_show(struct kobject *kobj, |
95 | struct kobj_attribute *attr, char *buf) | 95 | struct kobj_attribute *attr, char *buf) |
96 | { | 96 | { |
@@ -134,7 +134,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj, | |||
134 | } | 134 | } |
135 | KERNEL_ATTR_RO(vmcoreinfo); | 135 | KERNEL_ATTR_RO(vmcoreinfo); |
136 | 136 | ||
137 | #endif /* CONFIG_KEXEC */ | 137 | #endif /* CONFIG_KEXEC_CORE */ |
138 | 138 | ||
139 | /* whether file capabilities are enabled */ | 139 | /* whether file capabilities are enabled */ |
140 | static ssize_t fscaps_show(struct kobject *kobj, | 140 | static ssize_t fscaps_show(struct kobject *kobj, |
@@ -196,7 +196,7 @@ static struct attribute * kernel_attrs[] = { | |||
196 | #ifdef CONFIG_PROFILING | 196 | #ifdef CONFIG_PROFILING |
197 | &profiling_attr.attr, | 197 | &profiling_attr.attr, |
198 | #endif | 198 | #endif |
199 | #ifdef CONFIG_KEXEC | 199 | #ifdef CONFIG_KEXEC_CORE |
200 | &kexec_loaded_attr.attr, | 200 | &kexec_loaded_attr.attr, |
201 | &kexec_crash_loaded_attr.attr, | 201 | &kexec_crash_loaded_attr.attr, |
202 | &kexec_crash_size_attr.attr, | 202 | &kexec_crash_size_attr.attr, |
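Switching the #ifdef from CONFIG_KEXEC to CONFIG_KEXEC_CORE keeps /sys/kernel/kexec_loaded, kexec_crash_loaded, kexec_crash_size and vmcoreinfo available when only the file-based loader (CONFIG_KEXEC_FILE) is built. A minimal user-space sketch (not from this patch) that reads one of them:

#include <stdio.h>

/* Minimal user-space sketch, not from this patch: with CONFIG_KEXEC_CORE
 * the attribute exists for both the kexec_load and kexec_file_load based
 * configurations. */
int main(void)
{
	FILE *f = fopen("/sys/kernel/kexec_loaded", "r");
	int loaded = 0;

	if (!f) {
		perror("/sys/kernel/kexec_loaded");
		return 1;
	}
	if (fscanf(f, "%d", &loaded) != 1)
		loaded = 0;
	fclose(f);
	printf("kexec image %s\n", loaded ? "loaded" : "not loaded");
	return 0;
}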
diff --git a/kernel/kthread.c b/kernel/kthread.c index 10e489c448fe..9ff173dca1ae 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
@@ -97,6 +97,7 @@ bool kthread_should_park(void) | |||
97 | { | 97 | { |
98 | return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(current)->flags); | 98 | return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(current)->flags); |
99 | } | 99 | } |
100 | EXPORT_SYMBOL_GPL(kthread_should_park); | ||
100 | 101 | ||
101 | /** | 102 | /** |
102 | * kthread_freezable_should_stop - should this freezable kthread return now? | 103 | * kthread_freezable_should_stop - should this freezable kthread return now? |
@@ -171,6 +172,7 @@ void kthread_parkme(void) | |||
171 | { | 172 | { |
172 | __kthread_parkme(to_kthread(current)); | 173 | __kthread_parkme(to_kthread(current)); |
173 | } | 174 | } |
175 | EXPORT_SYMBOL_GPL(kthread_parkme); | ||
174 | 176 | ||
175 | static int kthread(void *_create) | 177 | static int kthread(void *_create) |
176 | { | 178 | { |
@@ -246,15 +248,16 @@ static void create_kthread(struct kthread_create_info *create) | |||
246 | * kthread_create_on_node - create a kthread. | 248 | * kthread_create_on_node - create a kthread. |
247 | * @threadfn: the function to run until signal_pending(current). | 249 | * @threadfn: the function to run until signal_pending(current). |
248 | * @data: data ptr for @threadfn. | 250 | * @data: data ptr for @threadfn. |
249 | * @node: memory node number. | 251 | * @node: task and thread structures for the thread are allocated on this node |
250 | * @namefmt: printf-style name for the thread. | 252 | * @namefmt: printf-style name for the thread. |
251 | * | 253 | * |
252 | * Description: This helper function creates and names a kernel | 254 | * Description: This helper function creates and names a kernel |
253 | * thread. The thread will be stopped: use wake_up_process() to start | 255 | * thread. The thread will be stopped: use wake_up_process() to start |
254 | * it. See also kthread_run(). | 256 | * it. See also kthread_run(). The new thread has SCHED_NORMAL policy and |
257 | * is affine to all CPUs. | ||
255 | * | 258 | * |
256 | * If thread is going to be bound on a particular cpu, give its node | 259 | * If thread is going to be bound on a particular cpu, give its node |
257 | * in @node, to get NUMA affinity for kthread stack, or else give -1. | 260 | * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE. |
258 | * When woken, the thread will run @threadfn() with @data as its | 261 | * When woken, the thread will run @threadfn() with @data as its |
259 | * argument. @threadfn() can either call do_exit() directly if it is a | 262 | * argument. @threadfn() can either call do_exit() directly if it is a |
260 | * standalone thread for which no one will call kthread_stop(), or | 263 | * standalone thread for which no one will call kthread_stop(), or |
@@ -325,16 +328,30 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), | |||
325 | } | 328 | } |
326 | EXPORT_SYMBOL(kthread_create_on_node); | 329 | EXPORT_SYMBOL(kthread_create_on_node); |
327 | 330 | ||
328 | static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state) | 331 | static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, long state) |
329 | { | 332 | { |
330 | /* Must have done schedule() in kthread() before we set_task_cpu */ | 333 | unsigned long flags; |
334 | |||
331 | if (!wait_task_inactive(p, state)) { | 335 | if (!wait_task_inactive(p, state)) { |
332 | WARN_ON(1); | 336 | WARN_ON(1); |
333 | return; | 337 | return; |
334 | } | 338 | } |
339 | |||
335 | /* It's safe because the task is inactive. */ | 340 | /* It's safe because the task is inactive. */ |
336 | do_set_cpus_allowed(p, cpumask_of(cpu)); | 341 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
342 | do_set_cpus_allowed(p, mask); | ||
337 | p->flags |= PF_NO_SETAFFINITY; | 343 | p->flags |= PF_NO_SETAFFINITY; |
344 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | ||
345 | } | ||
346 | |||
347 | static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state) | ||
348 | { | ||
349 | __kthread_bind_mask(p, cpumask_of(cpu), state); | ||
350 | } | ||
351 | |||
352 | void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask) | ||
353 | { | ||
354 | __kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE); | ||
338 | } | 355 | } |
339 | 356 | ||
340 | /** | 357 | /** |
@@ -411,6 +428,7 @@ void kthread_unpark(struct task_struct *k) | |||
411 | if (kthread) | 428 | if (kthread) |
412 | __kthread_unpark(k, kthread); | 429 | __kthread_unpark(k, kthread); |
413 | } | 430 | } |
431 | EXPORT_SYMBOL_GPL(kthread_unpark); | ||
414 | 432 | ||
415 | /** | 433 | /** |
416 | * kthread_park - park a thread created by kthread_create(). | 434 | * kthread_park - park a thread created by kthread_create(). |
@@ -441,6 +459,7 @@ int kthread_park(struct task_struct *k) | |||
441 | } | 459 | } |
442 | return ret; | 460 | return ret; |
443 | } | 461 | } |
462 | EXPORT_SYMBOL_GPL(kthread_park); | ||
444 | 463 | ||
445 | /** | 464 | /** |
446 | * kthread_stop - stop a thread created by kthread_create(). | 465 | * kthread_stop - stop a thread created by kthread_create(). |
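kthread.c gains kthread_bind_mask(), a cpumask variant of kthread_bind() that also takes pi_lock around do_set_cpus_allowed(), and exports the park/unpark helpers for modular users. A minimal sketch of the new helper (assuming a built-in caller, since kthread_bind_mask() itself is not exported here):

#include <linux/kthread.h>
#include <linux/cpumask.h>
#include <linux/sched.h>
#include <linux/err.h>

/* Minimal sketch, not from this patch.  The binding must happen while the
 * thread is still inactive, i.e. before wake_up_process(), exactly as
 * with kthread_bind(). */
static int demo_thread_fn(void *data)
{
	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);
	return 0;
}

static struct task_struct *start_pinned_thread(const struct cpumask *mask)
{
	struct task_struct *tsk;

	tsk = kthread_create(demo_thread_fn, NULL, "pinned-demo");
	if (IS_ERR(tsk))
		return tsk;

	/* Sets the affinity under pi_lock and marks PF_NO_SETAFFINITY,
	 * so user space cannot move the thread afterwards. */
	kthread_bind_mask(tsk, mask);
	wake_up_process(tsk);
	return tsk;
}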
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index c40ebcca0495..6e5344112419 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c | |||
@@ -348,8 +348,10 @@ static void klp_disable_func(struct klp_func *func) | |||
348 | { | 348 | { |
349 | struct klp_ops *ops; | 349 | struct klp_ops *ops; |
350 | 350 | ||
351 | WARN_ON(func->state != KLP_ENABLED); | 351 | if (WARN_ON(func->state != KLP_ENABLED)) |
352 | WARN_ON(!func->old_addr); | 352 | return; |
353 | if (WARN_ON(!func->old_addr)) | ||
354 | return; | ||
353 | 355 | ||
354 | ops = klp_find_ops(func->old_addr); | 356 | ops = klp_find_ops(func->old_addr); |
355 | if (WARN_ON(!ops)) | 357 | if (WARN_ON(!ops)) |
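klp_disable_func() now bails out early on an inconsistent state instead of warning and carrying on. The pattern relies on WARN_ON(cond) evaluating to cond; a minimal sketch with a hypothetical struct widget:

#include <linux/bug.h>

/* Illustrative only, hypothetical struct widget: WARN_ON(cond) evaluates
 * to cond, so it can print the backtrace and gate an early return at the
 * same time, as klp_disable_func() now does above. */
enum widget_state { WIDGET_DISABLED, WIDGET_ENABLED };

struct widget {
	enum widget_state state;
};

static void disable_widget(struct widget *w)
{
	if (WARN_ON(w->state != WIDGET_ENABLED))
		return;		/* inconsistent state: warn and bail out */

	w->state = WIDGET_DISABLED;
}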
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 7dd5c9918e4c..8e96f6cc2a4a 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile | |||
@@ -1,5 +1,5 @@ | |||
1 | 1 | ||
2 | obj-y += mutex.o semaphore.o rwsem.o | 2 | obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o |
3 | 3 | ||
4 | ifdef CONFIG_FUNCTION_TRACER | 4 | ifdef CONFIG_FUNCTION_TRACER |
5 | CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE) | 5 | CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE) |
@@ -20,11 +20,9 @@ obj-$(CONFIG_PROVE_LOCKING) += spinlock.o | |||
20 | obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o | 20 | obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o |
21 | obj-$(CONFIG_RT_MUTEXES) += rtmutex.o | 21 | obj-$(CONFIG_RT_MUTEXES) += rtmutex.o |
22 | obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o | 22 | obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o |
23 | obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o | ||
24 | obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o | 23 | obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o |
25 | obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o | 24 | obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o |
26 | obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o | 25 | obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o |
27 | obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o | 26 | obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o |
28 | obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o | ||
29 | obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o | 27 | obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o |
30 | obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o | 28 | obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o |
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index 652a8ee8efe9..f32567254867 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c | |||
@@ -88,6 +88,19 @@ void percpu_down_read(struct percpu_rw_semaphore *brw) | |||
88 | __up_read(&brw->rw_sem); | 88 | __up_read(&brw->rw_sem); |
89 | } | 89 | } |
90 | 90 | ||
91 | int percpu_down_read_trylock(struct percpu_rw_semaphore *brw) | ||
92 | { | ||
93 | if (unlikely(!update_fast_ctr(brw, +1))) { | ||
94 | if (!__down_read_trylock(&brw->rw_sem)) | ||
95 | return 0; | ||
96 | atomic_inc(&brw->slow_read_ctr); | ||
97 | __up_read(&brw->rw_sem); | ||
98 | } | ||
99 | |||
100 | rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 1, _RET_IP_); | ||
101 | return 1; | ||
102 | } | ||
103 | |||
91 | void percpu_up_read(struct percpu_rw_semaphore *brw) | 104 | void percpu_up_read(struct percpu_rw_semaphore *brw) |
92 | { | 105 | { |
93 | rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_); | 106 | rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_); |
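percpu-rwsem is now built unconditionally (see the Makefile hunk above) and grows percpu_down_read_trylock(), which takes the per-cpu fast path when possible and otherwise makes a non-blocking attempt on the slow-path rwsem. A minimal usage sketch (not from this patch):

#include <linux/percpu-rwsem.h>

/* Minimal usage sketch, not from this patch. */
static struct percpu_rw_semaphore demo_rwsem;

static int __init demo_rwsem_setup(void)
{
	return percpu_init_rwsem(&demo_rwsem);
}

static int demo_read_side(void)
{
	/* Returns 1 on success, 0 when it would have to block on a writer. */
	if (!percpu_down_read_trylock(&demo_rwsem))
		return -EBUSY;		/* back off instead of sleeping */

	/* ... read-side critical section ... */

	percpu_up_read(&demo_rwsem);
	return 0;
}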
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index 6c5da483966b..f17a3e3b3550 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c | |||
@@ -55,27 +55,29 @@ rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts) | |||
55 | { | 55 | { |
56 | while ((cnts & _QW_WMASK) == _QW_LOCKED) { | 56 | while ((cnts & _QW_WMASK) == _QW_LOCKED) { |
57 | cpu_relax_lowlatency(); | 57 | cpu_relax_lowlatency(); |
58 | cnts = smp_load_acquire((u32 *)&lock->cnts); | 58 | cnts = atomic_read_acquire(&lock->cnts); |
59 | } | 59 | } |
60 | } | 60 | } |
61 | 61 | ||
62 | /** | 62 | /** |
63 | * queue_read_lock_slowpath - acquire read lock of a queue rwlock | 63 | * queued_read_lock_slowpath - acquire read lock of a queue rwlock |
64 | * @lock: Pointer to queue rwlock structure | 64 | * @lock: Pointer to queue rwlock structure |
65 | * @cnts: Current qrwlock lock value | ||
65 | */ | 66 | */ |
66 | void queue_read_lock_slowpath(struct qrwlock *lock) | 67 | void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts) |
67 | { | 68 | { |
68 | u32 cnts; | ||
69 | |||
70 | /* | 69 | /* |
71 | * Readers come here when they cannot get the lock without waiting | 70 | * Readers come here when they cannot get the lock without waiting |
72 | */ | 71 | */ |
73 | if (unlikely(in_interrupt())) { | 72 | if (unlikely(in_interrupt())) { |
74 | /* | 73 | /* |
75 | * Readers in interrupt context will spin until the lock is | 74 | * Readers in interrupt context will get the lock immediately |
76 | * available without waiting in the queue. | 75 | * if the writer is just waiting (not holding the lock yet). |
76 | * The rspin_until_writer_unlock() function returns immediately | ||
77 | * in this case. Otherwise, they will spin (with ACQUIRE | ||
78 | * semantics) until the lock is available without waiting in | ||
79 | * the queue. | ||
77 | */ | 80 | */ |
78 | cnts = smp_load_acquire((u32 *)&lock->cnts); | ||
79 | rspin_until_writer_unlock(lock, cnts); | 81 | rspin_until_writer_unlock(lock, cnts); |
80 | return; | 82 | return; |
81 | } | 83 | } |
@@ -87,16 +89,11 @@ void queue_read_lock_slowpath(struct qrwlock *lock) | |||
87 | arch_spin_lock(&lock->lock); | 89 | arch_spin_lock(&lock->lock); |
88 | 90 | ||
89 | /* | 91 | /* |
90 | * At the head of the wait queue now, wait until the writer state | 92 | * The ACQUIRE semantics of the following spinning code ensure |
91 | * goes to 0 and then try to increment the reader count and get | 93 | * that accesses can't leak upwards out of our subsequent critical |
92 | * the lock. It is possible that an incoming writer may steal the | 94 | * section in the case that the lock is currently held for write. |
93 | * lock in the interim, so it is necessary to check the writer byte | ||
94 | * to make sure that the write lock isn't taken. | ||
95 | */ | 95 | */ |
96 | while (atomic_read(&lock->cnts) & _QW_WMASK) | 96 | cnts = atomic_add_return_acquire(_QR_BIAS, &lock->cnts) - _QR_BIAS; |
97 | cpu_relax_lowlatency(); | ||
98 | |||
99 | cnts = atomic_add_return(_QR_BIAS, &lock->cnts) - _QR_BIAS; | ||
100 | rspin_until_writer_unlock(lock, cnts); | 97 | rspin_until_writer_unlock(lock, cnts); |
101 | 98 | ||
102 | /* | 99 | /* |
@@ -104,13 +101,13 @@ void queue_read_lock_slowpath(struct qrwlock *lock) | |||
104 | */ | 101 | */ |
105 | arch_spin_unlock(&lock->lock); | 102 | arch_spin_unlock(&lock->lock); |
106 | } | 103 | } |
107 | EXPORT_SYMBOL(queue_read_lock_slowpath); | 104 | EXPORT_SYMBOL(queued_read_lock_slowpath); |
108 | 105 | ||
109 | /** | 106 | /** |
110 | * queue_write_lock_slowpath - acquire write lock of a queue rwlock | 107 | * queued_write_lock_slowpath - acquire write lock of a queue rwlock |
111 | * @lock : Pointer to queue rwlock structure | 108 | * @lock : Pointer to queue rwlock structure |
112 | */ | 109 | */ |
113 | void queue_write_lock_slowpath(struct qrwlock *lock) | 110 | void queued_write_lock_slowpath(struct qrwlock *lock) |
114 | { | 111 | { |
115 | u32 cnts; | 112 | u32 cnts; |
116 | 113 | ||
@@ -119,7 +116,7 @@ void queue_write_lock_slowpath(struct qrwlock *lock) | |||
119 | 116 | ||
120 | /* Try to acquire the lock directly if no reader is present */ | 117 | /* Try to acquire the lock directly if no reader is present */ |
121 | if (!atomic_read(&lock->cnts) && | 118 | if (!atomic_read(&lock->cnts) && |
122 | (atomic_cmpxchg(&lock->cnts, 0, _QW_LOCKED) == 0)) | 119 | (atomic_cmpxchg_acquire(&lock->cnts, 0, _QW_LOCKED) == 0)) |
123 | goto unlock; | 120 | goto unlock; |
124 | 121 | ||
125 | /* | 122 | /* |
@@ -130,7 +127,7 @@ void queue_write_lock_slowpath(struct qrwlock *lock) | |||
130 | struct __qrwlock *l = (struct __qrwlock *)lock; | 127 | struct __qrwlock *l = (struct __qrwlock *)lock; |
131 | 128 | ||
132 | if (!READ_ONCE(l->wmode) && | 129 | if (!READ_ONCE(l->wmode) && |
133 | (cmpxchg(&l->wmode, 0, _QW_WAITING) == 0)) | 130 | (cmpxchg_relaxed(&l->wmode, 0, _QW_WAITING) == 0)) |
134 | break; | 131 | break; |
135 | 132 | ||
136 | cpu_relax_lowlatency(); | 133 | cpu_relax_lowlatency(); |
@@ -140,8 +137,8 @@ void queue_write_lock_slowpath(struct qrwlock *lock) | |||
140 | for (;;) { | 137 | for (;;) { |
141 | cnts = atomic_read(&lock->cnts); | 138 | cnts = atomic_read(&lock->cnts); |
142 | if ((cnts == _QW_WAITING) && | 139 | if ((cnts == _QW_WAITING) && |
143 | (atomic_cmpxchg(&lock->cnts, _QW_WAITING, | 140 | (atomic_cmpxchg_acquire(&lock->cnts, _QW_WAITING, |
144 | _QW_LOCKED) == _QW_WAITING)) | 141 | _QW_LOCKED) == _QW_WAITING)) |
145 | break; | 142 | break; |
146 | 143 | ||
147 | cpu_relax_lowlatency(); | 144 | cpu_relax_lowlatency(); |
@@ -149,4 +146,4 @@ void queue_write_lock_slowpath(struct qrwlock *lock) | |||
149 | unlock: | 146 | unlock: |
150 | arch_spin_unlock(&lock->lock); | 147 | arch_spin_unlock(&lock->lock); |
151 | } | 148 | } |
152 | EXPORT_SYMBOL(queue_write_lock_slowpath); | 149 | EXPORT_SYMBOL(queued_write_lock_slowpath); |
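The queue rwlock slowpaths are renamed to queued_read_lock_slowpath()/queued_write_lock_slowpath(), the read side now receives the current lock value as an argument, and the atomics switch to _acquire/_relaxed variants. rwlock_t users are unaffected; the slowpaths are only entered on contention when the architecture selects CONFIG_QUEUED_RWLOCKS. A minimal sketch (not from this patch):

#include <linux/spinlock.h>

/* Minimal sketch, not from this patch: ordinary rwlock_t callers are
 * unchanged by the rename. */
static DEFINE_RWLOCK(demo_lock);
static int demo_value;

static int demo_read(void)
{
	int v;

	read_lock(&demo_lock);		/* readers may run concurrently */
	v = demo_value;
	read_unlock(&demo_lock);
	return v;
}

static void demo_write(int v)
{
	write_lock(&demo_lock);		/* excludes readers and other writers */
	demo_value = v;
	write_unlock(&demo_lock);
}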
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 38c49202d532..337c8818541d 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c | |||
@@ -239,8 +239,8 @@ static __always_inline void set_locked(struct qspinlock *lock) | |||
239 | 239 | ||
240 | static __always_inline void __pv_init_node(struct mcs_spinlock *node) { } | 240 | static __always_inline void __pv_init_node(struct mcs_spinlock *node) { } |
241 | static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { } | 241 | static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { } |
242 | static __always_inline void __pv_kick_node(struct mcs_spinlock *node) { } | 242 | static __always_inline void __pv_kick_node(struct qspinlock *lock, |
243 | 243 | struct mcs_spinlock *node) { } | |
244 | static __always_inline void __pv_wait_head(struct qspinlock *lock, | 244 | static __always_inline void __pv_wait_head(struct qspinlock *lock, |
245 | struct mcs_spinlock *node) { } | 245 | struct mcs_spinlock *node) { } |
246 | 246 | ||
@@ -440,7 +440,7 @@ queue: | |||
440 | cpu_relax(); | 440 | cpu_relax(); |
441 | 441 | ||
442 | arch_mcs_spin_unlock_contended(&next->locked); | 442 | arch_mcs_spin_unlock_contended(&next->locked); |
443 | pv_kick_node(next); | 443 | pv_kick_node(lock, next); |
444 | 444 | ||
445 | release: | 445 | release: |
446 | /* | 446 | /* |
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 04ab18151cc8..c8e6e9a596f5 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h | |||
@@ -4,6 +4,7 @@ | |||
4 | 4 | ||
5 | #include <linux/hash.h> | 5 | #include <linux/hash.h> |
6 | #include <linux/bootmem.h> | 6 | #include <linux/bootmem.h> |
7 | #include <linux/debug_locks.h> | ||
7 | 8 | ||
8 | /* | 9 | /* |
9 | * Implement paravirt qspinlocks; the general idea is to halt the vcpus instead | 10 | * Implement paravirt qspinlocks; the general idea is to halt the vcpus instead |
@@ -21,9 +22,14 @@ | |||
21 | 22 | ||
22 | #define _Q_SLOW_VAL (3U << _Q_LOCKED_OFFSET) | 23 | #define _Q_SLOW_VAL (3U << _Q_LOCKED_OFFSET) |
23 | 24 | ||
25 | /* | ||
26 | * Queue node uses: vcpu_running & vcpu_halted. | ||
27 | * Queue head uses: vcpu_running & vcpu_hashed. | ||
28 | */ | ||
24 | enum vcpu_state { | 29 | enum vcpu_state { |
25 | vcpu_running = 0, | 30 | vcpu_running = 0, |
26 | vcpu_halted, | 31 | vcpu_halted, /* Used only in pv_wait_node */ |
32 | vcpu_hashed, /* = pv_hash'ed + vcpu_halted */ | ||
27 | }; | 33 | }; |
28 | 34 | ||
29 | struct pv_node { | 35 | struct pv_node { |
@@ -152,7 +158,8 @@ static void pv_init_node(struct mcs_spinlock *node) | |||
152 | 158 | ||
153 | /* | 159 | /* |
154 | * Wait for node->locked to become true, halt the vcpu after a short spin. | 160 | * Wait for node->locked to become true, halt the vcpu after a short spin. |
155 | * pv_kick_node() is used to wake the vcpu again. | 161 | * pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its |
162 | * behalf. | ||
156 | */ | 163 | */ |
157 | static void pv_wait_node(struct mcs_spinlock *node) | 164 | static void pv_wait_node(struct mcs_spinlock *node) |
158 | { | 165 | { |
@@ -171,9 +178,9 @@ static void pv_wait_node(struct mcs_spinlock *node) | |||
171 | * | 178 | * |
172 | * [S] pn->state = vcpu_halted [S] next->locked = 1 | 179 | * [S] pn->state = vcpu_halted [S] next->locked = 1 |
173 | * MB MB | 180 | * MB MB |
174 | * [L] pn->locked [RmW] pn->state = vcpu_running | 181 | * [L] pn->locked [RmW] pn->state = vcpu_hashed |
175 | * | 182 | * |
176 | * Matches the xchg() from pv_kick_node(). | 183 | * Matches the cmpxchg() from pv_kick_node(). |
177 | */ | 184 | */ |
178 | smp_store_mb(pn->state, vcpu_halted); | 185 | smp_store_mb(pn->state, vcpu_halted); |
179 | 186 | ||
@@ -181,9 +188,10 @@ static void pv_wait_node(struct mcs_spinlock *node) | |||
181 | pv_wait(&pn->state, vcpu_halted); | 188 | pv_wait(&pn->state, vcpu_halted); |
182 | 189 | ||
183 | /* | 190 | /* |
184 | * Reset the vCPU state to avoid unncessary CPU kicking | 191 | * If pv_kick_node() changed us to vcpu_hashed, retain that value |
192 | * so that pv_wait_head() knows to not also try to hash this lock. | ||
185 | */ | 193 | */ |
186 | WRITE_ONCE(pn->state, vcpu_running); | 194 | cmpxchg(&pn->state, vcpu_halted, vcpu_running); |
187 | 195 | ||
188 | /* | 196 | /* |
189 | * If the locked flag is still not set after wakeup, it is a | 197 | * If the locked flag is still not set after wakeup, it is a |
@@ -193,6 +201,7 @@ static void pv_wait_node(struct mcs_spinlock *node) | |||
193 | * MCS lock will be released soon. | 201 | * MCS lock will be released soon. |
194 | */ | 202 | */ |
195 | } | 203 | } |
204 | |||
196 | /* | 205 | /* |
197 | * By now our node->locked should be 1 and our caller will not actually | 206 | * By now our node->locked should be 1 and our caller will not actually |
198 | * spin-wait for it. We do however rely on our caller to do a | 207 | * spin-wait for it. We do however rely on our caller to do a |
@@ -201,24 +210,35 @@ static void pv_wait_node(struct mcs_spinlock *node) | |||
201 | } | 210 | } |
202 | 211 | ||
203 | /* | 212 | /* |
204 | * Called after setting next->locked = 1, used to wake those stuck in | 213 | * Called after setting next->locked = 1 when we're the lock owner. |
205 | * pv_wait_node(). | 214 | * |
215 | * Instead of waking the waiters stuck in pv_wait_node() advance their state such | ||
216 | * that they're waiting in pv_wait_head(), this avoids a wake/sleep cycle. | ||
206 | */ | 217 | */ |
207 | static void pv_kick_node(struct mcs_spinlock *node) | 218 | static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) |
208 | { | 219 | { |
209 | struct pv_node *pn = (struct pv_node *)node; | 220 | struct pv_node *pn = (struct pv_node *)node; |
221 | struct __qspinlock *l = (void *)lock; | ||
210 | 222 | ||
211 | /* | 223 | /* |
212 | * Note that because node->locked is already set, this actual | 224 | * If the vCPU is indeed halted, advance its state to match that of |
213 | * mcs_spinlock entry could be re-used already. | 225 | * pv_wait_node(). If OTOH this fails, the vCPU was running and will |
226 | * observe its next->locked value and advance itself. | ||
214 | * | 227 | * |
215 | * This should be fine however, kicking people for no reason is | 228 | * Matches with smp_store_mb() and cmpxchg() in pv_wait_node() |
216 | * harmless. | 229 | */ |
230 | if (cmpxchg(&pn->state, vcpu_halted, vcpu_hashed) != vcpu_halted) | ||
231 | return; | ||
232 | |||
233 | /* | ||
234 | * Put the lock into the hash table and set the _Q_SLOW_VAL. | ||
217 | * | 235 | * |
218 | * See the comment in pv_wait_node(). | 236 | * As this is the same vCPU that will check the _Q_SLOW_VAL value and |
237 | * the hash table later on at unlock time, no atomic instruction is | ||
238 | * needed. | ||
219 | */ | 239 | */ |
220 | if (xchg(&pn->state, vcpu_running) == vcpu_halted) | 240 | WRITE_ONCE(l->locked, _Q_SLOW_VAL); |
221 | pv_kick(pn->cpu); | 241 | (void)pv_hash(lock, pn); |
222 | } | 242 | } |
223 | 243 | ||
224 | /* | 244 | /* |
@@ -232,6 +252,13 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) | |||
232 | struct qspinlock **lp = NULL; | 252 | struct qspinlock **lp = NULL; |
233 | int loop; | 253 | int loop; |
234 | 254 | ||
255 | /* | ||
256 | * If pv_kick_node() already advanced our state, we don't need to | ||
257 | * insert ourselves into the hash table anymore. | ||
258 | */ | ||
259 | if (READ_ONCE(pn->state) == vcpu_hashed) | ||
260 | lp = (struct qspinlock **)1; | ||
261 | |||
235 | for (;;) { | 262 | for (;;) { |
236 | for (loop = SPIN_THRESHOLD; loop; loop--) { | 263 | for (loop = SPIN_THRESHOLD; loop; loop--) { |
237 | if (!READ_ONCE(l->locked)) | 264 | if (!READ_ONCE(l->locked)) |
@@ -239,17 +266,22 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) | |||
239 | cpu_relax(); | 266 | cpu_relax(); |
240 | } | 267 | } |
241 | 268 | ||
242 | WRITE_ONCE(pn->state, vcpu_halted); | ||
243 | if (!lp) { /* ONCE */ | 269 | if (!lp) { /* ONCE */ |
270 | WRITE_ONCE(pn->state, vcpu_hashed); | ||
244 | lp = pv_hash(lock, pn); | 271 | lp = pv_hash(lock, pn); |
272 | |||
245 | /* | 273 | /* |
246 | * lp must be set before setting _Q_SLOW_VAL | 274 | * We must hash before setting _Q_SLOW_VAL, such that |
275 | * when we observe _Q_SLOW_VAL in __pv_queued_spin_unlock() | ||
276 | * we'll be sure to be able to observe our hash entry. | ||
247 | * | 277 | * |
248 | * [S] lp = lock [RmW] l = l->locked = 0 | 278 | * [S] pn->state |
249 | * MB MB | 279 | * [S] <hash> [Rmw] l->locked == _Q_SLOW_VAL |
250 | * [S] l->locked = _Q_SLOW_VAL [L] lp | 280 | * MB RMB |
281 | * [RmW] l->locked = _Q_SLOW_VAL [L] <unhash> | ||
282 | * [L] pn->state | ||
251 | * | 283 | * |
252 | * Matches the cmpxchg() in __pv_queued_spin_unlock(). | 284 | * Matches the smp_rmb() in __pv_queued_spin_unlock(). |
253 | */ | 285 | */ |
254 | if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) { | 286 | if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) { |
255 | /* | 287 | /* |
@@ -286,14 +318,32 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock) | |||
286 | { | 318 | { |
287 | struct __qspinlock *l = (void *)lock; | 319 | struct __qspinlock *l = (void *)lock; |
288 | struct pv_node *node; | 320 | struct pv_node *node; |
321 | u8 locked; | ||
289 | 322 | ||
290 | /* | 323 | /* |
291 | * We must not unlock if SLOW, because in that case we must first | 324 | * We must not unlock if SLOW, because in that case we must first |
292 | * unhash. Otherwise it would be possible to have multiple @lock | 325 | * unhash. Otherwise it would be possible to have multiple @lock |
293 | * entries, which would be BAD. | 326 | * entries, which would be BAD. |
294 | */ | 327 | */ |
295 | if (likely(cmpxchg(&l->locked, _Q_LOCKED_VAL, 0) == _Q_LOCKED_VAL)) | 328 | locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0); |
329 | if (likely(locked == _Q_LOCKED_VAL)) | ||
330 | return; | ||
331 | |||
332 | if (unlikely(locked != _Q_SLOW_VAL)) { | ||
333 | WARN(!debug_locks_silent, | ||
334 | "pvqspinlock: lock 0x%lx has corrupted value 0x%x!\n", | ||
335 | (unsigned long)lock, atomic_read(&lock->val)); | ||
296 | return; | 336 | return; |
337 | } | ||
338 | |||
339 | /* | ||
340 | * A failed cmpxchg doesn't provide any memory-ordering guarantees, | ||
341 | * so we need a barrier to order the read of the node data in | ||
342 | * pv_unhash *after* we've read the lock being _Q_SLOW_VAL. | ||
343 | * | ||
344 | * Matches the cmpxchg() in pv_wait_head() setting _Q_SLOW_VAL. | ||
345 | */ | ||
346 | smp_rmb(); | ||
297 | 347 | ||
298 | /* | 348 | /* |
299 | * Since the above failed to release, this must be the SLOW path. | 349 | * Since the above failed to release, this must be the SLOW path. |
@@ -310,8 +360,11 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock) | |||
310 | /* | 360 | /* |
311 | * At this point the memory pointed at by lock can be freed/reused, | 361 | * At this point the memory pointed at by lock can be freed/reused, |
312 | * however we can still use the pv_node to kick the CPU. | 362 | * however we can still use the pv_node to kick the CPU. |
363 | * The other vCPU may not really be halted, but kicking an active | ||
364 | * vCPU is harmless other than the additional latency in completing | ||
365 | * the unlock. | ||
313 | */ | 366 | */ |
314 | if (READ_ONCE(node->state) == vcpu_halted) | 367 | if (READ_ONCE(node->state) == vcpu_hashed) |
315 | pv_kick(node->cpu); | 368 | pv_kick(node->cpu); |
316 | } | 369 | } |
317 | /* | 370 | /* |
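The paravirt qspinlock change adds a vcpu_hashed state: instead of kicking a halted waiter, pv_kick_node() advances it to vcpu_hashed with cmpxchg(), hashes the lock on its behalf and sets _Q_SLOW_VAL, so the waiter skips a wake/sleep cycle and pv_wait_head() knows not to hash again. A user-space analogue of that cmpxchg hand-off, illustrative only and using C11 atomics rather than the kernel primitives:

#include <stdatomic.h>
#include <stdio.h>

/* Illustrative user-space analogue of the cmpxchg() state advance in
 * pv_kick_node()/pv_wait_node(): the kicker moves a waiter from HALTED to
 * HASHED only if it is still halted; a running waiter keeps RUNNING and
 * simply observes its ->locked flag. */
enum vcpu_state { VCPU_RUNNING, VCPU_HALTED, VCPU_HASHED };

static _Atomic enum vcpu_state state = VCPU_RUNNING;

static void kicker_side(void)
{
	enum vcpu_state expected = VCPU_HALTED;

	/* Equivalent of: cmpxchg(&pn->state, vcpu_halted, vcpu_hashed) */
	if (atomic_compare_exchange_strong(&state, &expected, VCPU_HASHED))
		printf("waiter was halted: hash the lock on its behalf\n");
	else
		printf("waiter is running: it will notice ->locked itself\n");
}

static void waiter_side(void)
{
	enum vcpu_state expected = VCPU_HALTED;

	/* Equivalent of: cmpxchg(&pn->state, vcpu_halted, vcpu_running);
	 * VCPU_HASHED is retained if the kicker already advanced us. */
	atomic_compare_exchange_strong(&state, &expected, VCPU_RUNNING);
}

int main(void)
{
	atomic_store(&state, VCPU_HALTED);
	kicker_side();
	waiter_side();
	printf("final state: %d\n", (int)atomic_load(&state));
	return 0;
}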
diff --git a/kernel/locking/rtmutex-tester.c b/kernel/locking/rtmutex-tester.c deleted file mode 100644 index 1d96dd0d93c1..000000000000 --- a/kernel/locking/rtmutex-tester.c +++ /dev/null | |||
@@ -1,420 +0,0 @@ | |||
1 | /* | ||
2 | * RT-Mutex-tester: scriptable tester for rt mutexes | ||
3 | * | ||
4 | * started by Thomas Gleixner: | ||
5 | * | ||
6 | * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> | ||
7 | * | ||
8 | */ | ||
9 | #include <linux/device.h> | ||
10 | #include <linux/kthread.h> | ||
11 | #include <linux/export.h> | ||
12 | #include <linux/sched.h> | ||
13 | #include <linux/sched/rt.h> | ||
14 | #include <linux/spinlock.h> | ||
15 | #include <linux/timer.h> | ||
16 | #include <linux/freezer.h> | ||
17 | #include <linux/stat.h> | ||
18 | |||
19 | #include "rtmutex.h" | ||
20 | |||
21 | #define MAX_RT_TEST_THREADS 8 | ||
22 | #define MAX_RT_TEST_MUTEXES 8 | ||
23 | |||
24 | static spinlock_t rttest_lock; | ||
25 | static atomic_t rttest_event; | ||
26 | |||
27 | struct test_thread_data { | ||
28 | int opcode; | ||
29 | int opdata; | ||
30 | int mutexes[MAX_RT_TEST_MUTEXES]; | ||
31 | int event; | ||
32 | struct device dev; | ||
33 | }; | ||
34 | |||
35 | static struct test_thread_data thread_data[MAX_RT_TEST_THREADS]; | ||
36 | static struct task_struct *threads[MAX_RT_TEST_THREADS]; | ||
37 | static struct rt_mutex mutexes[MAX_RT_TEST_MUTEXES]; | ||
38 | |||
39 | enum test_opcodes { | ||
40 | RTTEST_NOP = 0, | ||
41 | RTTEST_SCHEDOT, /* 1 Sched other, data = nice */ | ||
42 | RTTEST_SCHEDRT, /* 2 Sched fifo, data = prio */ | ||
43 | RTTEST_LOCK, /* 3 Lock uninterruptible, data = lockindex */ | ||
44 | RTTEST_LOCKNOWAIT, /* 4 Lock uninterruptible no wait in wakeup, data = lockindex */ | ||
45 | RTTEST_LOCKINT, /* 5 Lock interruptible, data = lockindex */ | ||
46 | RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */ | ||
47 | RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */ | ||
48 | RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */ | ||
49 | /* 9, 10 - reserved for BKL commemoration */ | ||
50 | RTTEST_SIGNAL = 11, /* 11 Signal other test thread, data = thread id */ | ||
51 | RTTEST_RESETEVENT = 98, /* 98 Reset event counter */ | ||
52 | RTTEST_RESET = 99, /* 99 Reset all pending operations */ | ||
53 | }; | ||
54 | |||
55 | static int handle_op(struct test_thread_data *td, int lockwakeup) | ||
56 | { | ||
57 | int i, id, ret = -EINVAL; | ||
58 | |||
59 | switch(td->opcode) { | ||
60 | |||
61 | case RTTEST_NOP: | ||
62 | return 0; | ||
63 | |||
64 | case RTTEST_LOCKCONT: | ||
65 | td->mutexes[td->opdata] = 1; | ||
66 | td->event = atomic_add_return(1, &rttest_event); | ||
67 | return 0; | ||
68 | |||
69 | case RTTEST_RESET: | ||
70 | for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) { | ||
71 | if (td->mutexes[i] == 4) { | ||
72 | rt_mutex_unlock(&mutexes[i]); | ||
73 | td->mutexes[i] = 0; | ||
74 | } | ||
75 | } | ||
76 | return 0; | ||
77 | |||
78 | case RTTEST_RESETEVENT: | ||
79 | atomic_set(&rttest_event, 0); | ||
80 | return 0; | ||
81 | |||
82 | default: | ||
83 | if (lockwakeup) | ||
84 | return ret; | ||
85 | } | ||
86 | |||
87 | switch(td->opcode) { | ||
88 | |||
89 | case RTTEST_LOCK: | ||
90 | case RTTEST_LOCKNOWAIT: | ||
91 | id = td->opdata; | ||
92 | if (id < 0 || id >= MAX_RT_TEST_MUTEXES) | ||
93 | return ret; | ||
94 | |||
95 | td->mutexes[id] = 1; | ||
96 | td->event = atomic_add_return(1, &rttest_event); | ||
97 | rt_mutex_lock(&mutexes[id]); | ||
98 | td->event = atomic_add_return(1, &rttest_event); | ||
99 | td->mutexes[id] = 4; | ||
100 | return 0; | ||
101 | |||
102 | case RTTEST_LOCKINT: | ||
103 | case RTTEST_LOCKINTNOWAIT: | ||
104 | id = td->opdata; | ||
105 | if (id < 0 || id >= MAX_RT_TEST_MUTEXES) | ||
106 | return ret; | ||
107 | |||
108 | td->mutexes[id] = 1; | ||
109 | td->event = atomic_add_return(1, &rttest_event); | ||
110 | ret = rt_mutex_lock_interruptible(&mutexes[id], 0); | ||
111 | td->event = atomic_add_return(1, &rttest_event); | ||
112 | td->mutexes[id] = ret ? 0 : 4; | ||
113 | return ret ? -EINTR : 0; | ||
114 | |||
115 | case RTTEST_UNLOCK: | ||
116 | id = td->opdata; | ||
117 | if (id < 0 || id >= MAX_RT_TEST_MUTEXES || td->mutexes[id] != 4) | ||
118 | return ret; | ||
119 | |||
120 | td->event = atomic_add_return(1, &rttest_event); | ||
121 | rt_mutex_unlock(&mutexes[id]); | ||
122 | td->event = atomic_add_return(1, &rttest_event); | ||
123 | td->mutexes[id] = 0; | ||
124 | return 0; | ||
125 | |||
126 | default: | ||
127 | break; | ||
128 | } | ||
129 | return ret; | ||
130 | } | ||
131 | |||
132 | /* | ||
133 | * Schedule replacement for rtsem_down(). Only called for threads with | ||
134 | * PF_MUTEX_TESTER set. | ||
135 | * | ||
136 | * This allows us to have finegrained control over the event flow. | ||
137 | * | ||
138 | */ | ||
139 | void schedule_rt_mutex_test(struct rt_mutex *mutex) | ||
140 | { | ||
141 | int tid, op, dat; | ||
142 | struct test_thread_data *td; | ||
143 | |||
144 | /* We have to lookup the task */ | ||
145 | for (tid = 0; tid < MAX_RT_TEST_THREADS; tid++) { | ||
146 | if (threads[tid] == current) | ||
147 | break; | ||
148 | } | ||
149 | |||
150 | BUG_ON(tid == MAX_RT_TEST_THREADS); | ||
151 | |||
152 | td = &thread_data[tid]; | ||
153 | |||
154 | op = td->opcode; | ||
155 | dat = td->opdata; | ||
156 | |||
157 | switch (op) { | ||
158 | case RTTEST_LOCK: | ||
159 | case RTTEST_LOCKINT: | ||
160 | case RTTEST_LOCKNOWAIT: | ||
161 | case RTTEST_LOCKINTNOWAIT: | ||
162 | if (mutex != &mutexes[dat]) | ||
163 | break; | ||
164 | |||
165 | if (td->mutexes[dat] != 1) | ||
166 | break; | ||
167 | |||
168 | td->mutexes[dat] = 2; | ||
169 | td->event = atomic_add_return(1, &rttest_event); | ||
170 | break; | ||
171 | |||
172 | default: | ||
173 | break; | ||
174 | } | ||
175 | |||
176 | schedule(); | ||
177 | |||
178 | |||
179 | switch (op) { | ||
180 | case RTTEST_LOCK: | ||
181 | case RTTEST_LOCKINT: | ||
182 | if (mutex != &mutexes[dat]) | ||
183 | return; | ||
184 | |||
185 | if (td->mutexes[dat] != 2) | ||
186 | return; | ||
187 | |||
188 | td->mutexes[dat] = 3; | ||
189 | td->event = atomic_add_return(1, &rttest_event); | ||
190 | break; | ||
191 | |||
192 | case RTTEST_LOCKNOWAIT: | ||
193 | case RTTEST_LOCKINTNOWAIT: | ||
194 | if (mutex != &mutexes[dat]) | ||
195 | return; | ||
196 | |||
197 | if (td->mutexes[dat] != 2) | ||
198 | return; | ||
199 | |||
200 | td->mutexes[dat] = 1; | ||
201 | td->event = atomic_add_return(1, &rttest_event); | ||
202 | return; | ||
203 | |||
204 | default: | ||
205 | return; | ||
206 | } | ||
207 | |||
208 | td->opcode = 0; | ||
209 | |||
210 | for (;;) { | ||
211 | set_current_state(TASK_INTERRUPTIBLE); | ||
212 | |||
213 | if (td->opcode > 0) { | ||
214 | int ret; | ||
215 | |||
216 | set_current_state(TASK_RUNNING); | ||
217 | ret = handle_op(td, 1); | ||
218 | set_current_state(TASK_INTERRUPTIBLE); | ||
219 | if (td->opcode == RTTEST_LOCKCONT) | ||
220 | break; | ||
221 | td->opcode = ret; | ||
222 | } | ||
223 | |||
224 | /* Wait for the next command to be executed */ | ||
225 | schedule(); | ||
226 | } | ||
227 | |||
228 | /* Restore previous command and data */ | ||
229 | td->opcode = op; | ||
230 | td->opdata = dat; | ||
231 | } | ||
232 | |||
233 | static int test_func(void *data) | ||
234 | { | ||
235 | struct test_thread_data *td = data; | ||
236 | int ret; | ||
237 | |||
238 | current->flags |= PF_MUTEX_TESTER; | ||
239 | set_freezable(); | ||
240 | allow_signal(SIGHUP); | ||
241 | |||
242 | for(;;) { | ||
243 | |||
244 | set_current_state(TASK_INTERRUPTIBLE); | ||
245 | |||
246 | if (td->opcode > 0) { | ||
247 | set_current_state(TASK_RUNNING); | ||
248 | ret = handle_op(td, 0); | ||
249 | set_current_state(TASK_INTERRUPTIBLE); | ||
250 | td->opcode = ret; | ||
251 | } | ||
252 | |||
253 | /* Wait for the next command to be executed */ | ||
254 | schedule(); | ||
255 | try_to_freeze(); | ||
256 | |||
257 | if (signal_pending(current)) | ||
258 | flush_signals(current); | ||
259 | |||
260 | if(kthread_should_stop()) | ||
261 | break; | ||
262 | } | ||
263 | return 0; | ||
264 | } | ||
265 | |||
266 | /** | ||
267 | * sysfs_test_command - interface for test commands | ||
268 | * @dev: thread reference | ||
269 | * @buf: command for actual step | ||
270 | * @count: length of buffer | ||
271 | * | ||
272 | * command syntax: | ||
273 | * | ||
274 | * opcode:data | ||
275 | */ | ||
276 | static ssize_t sysfs_test_command(struct device *dev, struct device_attribute *attr, | ||
277 | const char *buf, size_t count) | ||
278 | { | ||
279 | struct sched_param schedpar; | ||
280 | struct test_thread_data *td; | ||
281 | char cmdbuf[32]; | ||
282 | int op, dat, tid, ret; | ||
283 | |||
284 | td = container_of(dev, struct test_thread_data, dev); | ||
285 | tid = td->dev.id; | ||
286 | |||
287 | /* strings from sysfs write are not 0 terminated! */ | ||
288 | if (count >= sizeof(cmdbuf)) | ||
289 | return -EINVAL; | ||
290 | |||
291 | /* strip of \n: */ | ||
292 | if (buf[count-1] == '\n') | ||
293 | count--; | ||
294 | if (count < 1) | ||
295 | return -EINVAL; | ||
296 | |||
297 | memcpy(cmdbuf, buf, count); | ||
298 | cmdbuf[count] = 0; | ||
299 | |||
300 | if (sscanf(cmdbuf, "%d:%d", &op, &dat) != 2) | ||
301 | return -EINVAL; | ||
302 | |||
303 | switch (op) { | ||
304 | case RTTEST_SCHEDOT: | ||
305 | schedpar.sched_priority = 0; | ||
306 | ret = sched_setscheduler(threads[tid], SCHED_NORMAL, &schedpar); | ||
307 | if (ret) | ||
308 | return ret; | ||
309 | set_user_nice(current, 0); | ||
310 | break; | ||
311 | |||
312 | case RTTEST_SCHEDRT: | ||
313 | schedpar.sched_priority = dat; | ||
314 | ret = sched_setscheduler(threads[tid], SCHED_FIFO, &schedpar); | ||
315 | if (ret) | ||
316 | return ret; | ||
317 | break; | ||
318 | |||
319 | case RTTEST_SIGNAL: | ||
320 | send_sig(SIGHUP, threads[tid], 0); | ||
321 | break; | ||
322 | |||
323 | default: | ||
324 | if (td->opcode > 0) | ||
325 | return -EBUSY; | ||
326 | td->opdata = dat; | ||
327 | td->opcode = op; | ||
328 | wake_up_process(threads[tid]); | ||
329 | } | ||
330 | |||
331 | return count; | ||
332 | } | ||
333 | |||
334 | /** | ||
335 | * sysfs_test_status - sysfs interface for rt tester | ||
336 | * @dev: thread to query | ||
337 | * @buf: char buffer to be filled with thread status info | ||
338 | */ | ||
339 | static ssize_t sysfs_test_status(struct device *dev, struct device_attribute *attr, | ||
340 | char *buf) | ||
341 | { | ||
342 | struct test_thread_data *td; | ||
343 | struct task_struct *tsk; | ||
344 | char *curr = buf; | ||
345 | int i; | ||
346 | |||
347 | td = container_of(dev, struct test_thread_data, dev); | ||
348 | tsk = threads[td->dev.id]; | ||
349 | |||
350 | spin_lock(&rttest_lock); | ||
351 | |||
352 | curr += sprintf(curr, | ||
353 | "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, M:", | ||
354 | td->opcode, td->event, tsk->state, | ||
355 | (MAX_RT_PRIO - 1) - tsk->prio, | ||
356 | (MAX_RT_PRIO - 1) - tsk->normal_prio, | ||
357 | tsk->pi_blocked_on); | ||
358 | |||
359 | for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--) | ||
360 | curr += sprintf(curr, "%d", td->mutexes[i]); | ||
361 | |||
362 | spin_unlock(&rttest_lock); | ||
363 | |||
364 | curr += sprintf(curr, ", T: %p, R: %p\n", tsk, | ||
365 | mutexes[td->dev.id].owner); | ||
366 | |||
367 | return curr - buf; | ||
368 | } | ||
369 | |||
370 | static DEVICE_ATTR(status, S_IRUSR, sysfs_test_status, NULL); | ||
371 | static DEVICE_ATTR(command, S_IWUSR, NULL, sysfs_test_command); | ||
372 | |||
373 | static struct bus_type rttest_subsys = { | ||
374 | .name = "rttest", | ||
375 | .dev_name = "rttest", | ||
376 | }; | ||
377 | |||
378 | static int init_test_thread(int id) | ||
379 | { | ||
380 | thread_data[id].dev.bus = &rttest_subsys; | ||
381 | thread_data[id].dev.id = id; | ||
382 | |||
383 | threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id); | ||
384 | if (IS_ERR(threads[id])) | ||
385 | return PTR_ERR(threads[id]); | ||
386 | |||
387 | return device_register(&thread_data[id].dev); | ||
388 | } | ||
389 | |||
390 | static int init_rttest(void) | ||
391 | { | ||
392 | int ret, i; | ||
393 | |||
394 | spin_lock_init(&rttest_lock); | ||
395 | |||
396 | for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) | ||
397 | rt_mutex_init(&mutexes[i]); | ||
398 | |||
399 | ret = subsys_system_register(&rttest_subsys, NULL); | ||
400 | if (ret) | ||
401 | return ret; | ||
402 | |||
403 | for (i = 0; i < MAX_RT_TEST_THREADS; i++) { | ||
404 | ret = init_test_thread(i); | ||
405 | if (ret) | ||
406 | break; | ||
407 | ret = device_create_file(&thread_data[i].dev, &dev_attr_status); | ||
408 | if (ret) | ||
409 | break; | ||
410 | ret = device_create_file(&thread_data[i].dev, &dev_attr_command); | ||
411 | if (ret) | ||
412 | break; | ||
413 | } | ||
414 | |||
415 | printk("Initializing RT-Tester: %s\n", ret ? "Failed" : "OK" ); | ||
416 | |||
417 | return ret; | ||
418 | } | ||
419 | |||
420 | device_initcall(init_rttest); | ||
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 5674b073473c..7781d801212f 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c | |||
@@ -1120,7 +1120,7 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, | |||
1120 | 1120 | ||
1121 | debug_rt_mutex_print_deadlock(waiter); | 1121 | debug_rt_mutex_print_deadlock(waiter); |
1122 | 1122 | ||
1123 | schedule_rt_mutex(lock); | 1123 | schedule(); |
1124 | 1124 | ||
1125 | raw_spin_lock(&lock->wait_lock); | 1125 | raw_spin_lock(&lock->wait_lock); |
1126 | set_current_state(state); | 1126 | set_current_state(state); |
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 7844f8f0e639..4f5f83c7d2d3 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h | |||
@@ -15,28 +15,6 @@ | |||
15 | #include <linux/rtmutex.h> | 15 | #include <linux/rtmutex.h> |
16 | 16 | ||
17 | /* | 17 | /* |
18 | * The rtmutex in kernel tester is independent of rtmutex debugging. We | ||
19 | * call schedule_rt_mutex_test() instead of schedule() for the tasks which | ||
20 | * belong to the tester. That way we can delay the wakeup path of those | ||
21 | * threads to provoke lock stealing and testing of complex boosting scenarios. | ||
22 | */ | ||
23 | #ifdef CONFIG_RT_MUTEX_TESTER | ||
24 | |||
25 | extern void schedule_rt_mutex_test(struct rt_mutex *lock); | ||
26 | |||
27 | #define schedule_rt_mutex(_lock) \ | ||
28 | do { \ | ||
29 | if (!(current->flags & PF_MUTEX_TESTER)) \ | ||
30 | schedule(); \ | ||
31 | else \ | ||
32 | schedule_rt_mutex_test(_lock); \ | ||
33 | } while (0) | ||
34 | |||
35 | #else | ||
36 | # define schedule_rt_mutex(_lock) schedule() | ||
37 | #endif | ||
38 | |||
39 | /* | ||
40 | * This is the control structure for tasks blocked on a rt_mutex, | 18 | * This is the control structure for tasks blocked on a rt_mutex, |
41 | * which is allocated on the kernel stack on of the blocked task. | 19 | * which is allocated on the kernel stack on of the blocked task. |
42 | * | 20 | * |
diff --git a/kernel/membarrier.c b/kernel/membarrier.c new file mode 100644 index 000000000000..536c727a56e9 --- /dev/null +++ b/kernel/membarrier.c | |||
@@ -0,0 +1,66 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2010, 2015 Mathieu Desnoyers <mathieu.desnoyers@efficios.com> | ||
3 | * | ||
4 | * membarrier system call | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or | ||
9 | * (at your option) any later version. | ||
10 | * | ||
11 | * This program is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
14 | * GNU General Public License for more details. | ||
15 | */ | ||
16 | |||
17 | #include <linux/syscalls.h> | ||
18 | #include <linux/membarrier.h> | ||
19 | |||
20 | /* | ||
21 | * Bitmask made from a "or" of all commands within enum membarrier_cmd, | ||
22 | * except MEMBARRIER_CMD_QUERY. | ||
23 | */ | ||
24 | #define MEMBARRIER_CMD_BITMASK (MEMBARRIER_CMD_SHARED) | ||
25 | |||
26 | /** | ||
27 | * sys_membarrier - issue memory barriers on a set of threads | ||
28 | * @cmd: Takes command values defined in enum membarrier_cmd. | ||
29 | * @flags: Currently needs to be 0. For future extensions. | ||
30 | * | ||
31 | * If this system call is not implemented, -ENOSYS is returned. If the | ||
32 | * command specified does not exist, or if the command argument is invalid, | ||
33 | * this system call returns -EINVAL. For a given command, with flags argument | ||
34 | * set to 0, this system call is guaranteed to always return the same value | ||
35 | * until reboot. | ||
36 | * | ||
37 | * All memory accesses performed in program order from each targeted thread | ||
38 | * are guaranteed to be ordered with respect to sys_membarrier(). If we use | ||
39 | * the semantic "barrier()" to represent a compiler barrier forcing memory | ||
40 | * accesses to be performed in program order across the barrier, and | ||
41 | * smp_mb() to represent explicit memory barriers forcing full memory | ||
42 | * ordering across the barrier, we have the following ordering table for | ||
43 | * each pair of barrier(), sys_membarrier() and smp_mb(): | ||
44 | * | ||
45 | * The pair ordering is detailed as (O: ordered, X: not ordered): | ||
46 | * | ||
47 | * barrier() smp_mb() sys_membarrier() | ||
48 | * barrier() X X O | ||
49 | * smp_mb() X O O | ||
50 | * sys_membarrier() O O O | ||
51 | */ | ||
52 | SYSCALL_DEFINE2(membarrier, int, cmd, int, flags) | ||
53 | { | ||
54 | if (unlikely(flags)) | ||
55 | return -EINVAL; | ||
56 | switch (cmd) { | ||
57 | case MEMBARRIER_CMD_QUERY: | ||
58 | return MEMBARRIER_CMD_BITMASK; | ||
59 | case MEMBARRIER_CMD_SHARED: | ||
60 | if (num_online_cpus() > 1) | ||
61 | synchronize_sched(); | ||
62 | return 0; | ||
63 | default: | ||
64 | return -EINVAL; | ||
65 | } | ||
66 | } | ||
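The new sys_membarrier() supports two commands so far: MEMBARRIER_CMD_QUERY returns the mask of supported commands, and MEMBARRIER_CMD_SHARED uses synchronize_sched() so that every other running thread has executed the equivalent of a full memory barrier by the time the call returns. A minimal user-space sketch (assumes kernel headers that define __NR_membarrier and the MEMBARRIER_CMD_* constants; glibc ships no wrapper):

#include <linux/membarrier.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>

/* Minimal user-space sketch: query the supported commands, then issue a
 * shared barrier across all running threads. */
static int membarrier(int cmd, int flags)
{
	return syscall(__NR_membarrier, cmd, flags);
}

int main(void)
{
	int mask = membarrier(MEMBARRIER_CMD_QUERY, 0);

	if (mask < 0) {
		perror("membarrier");	/* ENOSYS on kernels without it */
		return 1;
	}
	if (mask & MEMBARRIER_CMD_SHARED) {
		/* After this returns, every other running thread has
		 * executed the equivalent of a full memory barrier. */
		membarrier(MEMBARRIER_CMD_SHARED, 0);
	}
	return 0;
}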
diff --git a/kernel/memremap.c b/kernel/memremap.c new file mode 100644 index 000000000000..72b0c66628b6 --- /dev/null +++ b/kernel/memremap.c | |||
@@ -0,0 +1,190 @@ | |||
1 | /* | ||
2 | * Copyright(c) 2015 Intel Corporation. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of version 2 of the GNU General Public License as | ||
6 | * published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, but | ||
9 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | */ | ||
13 | #include <linux/device.h> | ||
14 | #include <linux/types.h> | ||
15 | #include <linux/io.h> | ||
16 | #include <linux/mm.h> | ||
17 | #include <linux/memory_hotplug.h> | ||
18 | |||
19 | #ifndef ioremap_cache | ||
20 | /* temporary while we convert existing ioremap_cache users to memremap */ | ||
21 | __weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size) | ||
22 | { | ||
23 | return ioremap(offset, size); | ||
24 | } | ||
25 | #endif | ||
26 | |||
27 | /** | ||
28 | * memremap() - remap an iomem_resource as cacheable memory | ||
29 | * @offset: iomem resource start address | ||
30 | * @size: size of remap | ||
31 | * @flags: either MEMREMAP_WB or MEMREMAP_WT | ||
32 | * | ||
33 | * memremap() is "ioremap" for cases where it is known that the resource | ||
34 | * being mapped does not have i/o side effects and the __iomem | ||
35 | * annotation is not applicable. | ||
36 | * | ||
37 | * MEMREMAP_WB - matches the default mapping for "System RAM" on | ||
38 | * the architecture. This is usually a read-allocate write-back cache. | ||
39 | * Moreover, if MEMREMAP_WB is specified and the requested remap region is RAM, | ||
40 | * memremap() will bypass establishing a new mapping and instead return | ||
41 | * a pointer into the direct map. | ||
42 | * | ||
43 | * MEMREMAP_WT - establish a mapping whereby writes either bypass the | ||
44 | * cache or are written through to memory and never exist in a | ||
45 | * cache-dirty state with respect to program visibility. Attempts to | ||
46 | * map "System RAM" with this mapping type will fail. | ||
47 | */ | ||
48 | void *memremap(resource_size_t offset, size_t size, unsigned long flags) | ||
49 | { | ||
50 | int is_ram = region_intersects(offset, size, "System RAM"); | ||
51 | void *addr = NULL; | ||
52 | |||
53 | if (is_ram == REGION_MIXED) { | ||
54 | WARN_ONCE(1, "memremap attempted on mixed range %pa size: %#lx\n", | ||
55 | &offset, (unsigned long) size); | ||
56 | return NULL; | ||
57 | } | ||
58 | |||
59 | /* Try all mapping types requested until one returns non-NULL */ | ||
60 | if (flags & MEMREMAP_WB) { | ||
61 | flags &= ~MEMREMAP_WB; | ||
62 | /* | ||
63 | * MEMREMAP_WB is special in that it can be satisfied | ||
64 | * from the direct map. Some archs depend on the | ||
65 | * capability of memremap() to autodetect cases where | ||
66 | * the requested range is potentially in "System RAM" | ||
67 | */ | ||
68 | if (is_ram == REGION_INTERSECTS) | ||
69 | addr = __va(offset); | ||
70 | else | ||
71 | addr = ioremap_cache(offset, size); | ||
72 | } | ||
73 | |||
74 | /* | ||
75 | * If we don't have a mapping yet and more request flags are | ||
76 | * pending then we will be attempting to establish a new virtual | ||
77 | * address mapping. Enforce that this mapping is not aliasing | ||
78 | * "System RAM" | ||
79 | */ | ||
80 | if (!addr && is_ram == REGION_INTERSECTS && flags) { | ||
81 | WARN_ONCE(1, "memremap attempted on ram %pa size: %#lx\n", | ||
82 | &offset, (unsigned long) size); | ||
83 | return NULL; | ||
84 | } | ||
85 | |||
86 | if (!addr && (flags & MEMREMAP_WT)) { | ||
87 | flags &= ~MEMREMAP_WT; | ||
88 | addr = ioremap_wt(offset, size); | ||
89 | } | ||
90 | |||
91 | return addr; | ||
92 | } | ||
93 | EXPORT_SYMBOL(memremap); | ||
94 | |||
95 | void memunmap(void *addr) | ||
96 | { | ||
97 | if (is_vmalloc_addr(addr)) | ||
98 | iounmap((void __iomem *) addr); | ||
99 | } | ||
100 | EXPORT_SYMBOL(memunmap); | ||
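
A brief consumer sketch for the two exports above (illustrative only; the function names, the caller-supplied resource range, and the WB-then-WT fallback policy are assumptions, not part of this patch):

#include <linux/errno.h>
#include <linux/io.h>
#include <linux/printk.h>

static void *example_table;

static int example_map_table(resource_size_t start, size_t len)
{
	/* Ask for a cacheable mapping first, write-through as fallback. */
	example_table = memremap(start, len, MEMREMAP_WB | MEMREMAP_WT);
	if (!example_table)
		return -ENOMEM;
	pr_info("example table mapped at %p\n", example_table);
	return 0;
}

static void example_unmap_table(void)
{
	/* No-op for direct-map (RAM) addresses, iounmap() otherwise. */
	memunmap(example_table);
}
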
101 | |||
102 | static void devm_memremap_release(struct device *dev, void *res) | ||
103 | { | ||
104 | memunmap(res); | ||
105 | } | ||
106 | |||
107 | static int devm_memremap_match(struct device *dev, void *res, void *match_data) | ||
108 | { | ||
109 | return *(void **)res == match_data; | ||
110 | } | ||
111 | |||
112 | void *devm_memremap(struct device *dev, resource_size_t offset, | ||
113 | size_t size, unsigned long flags) | ||
114 | { | ||
115 | void **ptr, *addr; | ||
116 | |||
117 | ptr = devres_alloc(devm_memremap_release, sizeof(*ptr), GFP_KERNEL); | ||
118 | if (!ptr) | ||
119 | return NULL; | ||
120 | |||
121 | addr = memremap(offset, size, flags); | ||
122 | if (addr) { | ||
123 | *ptr = addr; | ||
124 | devres_add(dev, ptr); | ||
125 | } else | ||
126 | devres_free(ptr); | ||
127 | |||
128 | return addr; | ||
129 | } | ||
130 | EXPORT_SYMBOL(devm_memremap); | ||
131 | |||
132 | void devm_memunmap(struct device *dev, void *addr) | ||
133 | { | ||
134 | WARN_ON(devres_destroy(dev, devm_memremap_release, devm_memremap_match, | ||
135 | addr)); | ||
136 | memunmap(addr); | ||
137 | } | ||
138 | EXPORT_SYMBOL(devm_memunmap); | ||
139 | |||
140 | #ifdef CONFIG_ZONE_DEVICE | ||
141 | struct page_map { | ||
142 | struct resource res; | ||
143 | }; | ||
144 | |||
145 | static void devm_memremap_pages_release(struct device *dev, void *res) | ||
146 | { | ||
147 | struct page_map *page_map = res; | ||
148 | |||
149 | /* pages are dead and unused, undo the arch mapping */ | ||
150 | arch_remove_memory(page_map->res.start, resource_size(&page_map->res)); | ||
151 | } | ||
152 | |||
153 | void *devm_memremap_pages(struct device *dev, struct resource *res) | ||
154 | { | ||
155 | int is_ram = region_intersects(res->start, resource_size(res), | ||
156 | "System RAM"); | ||
157 | struct page_map *page_map; | ||
158 | int error, nid; | ||
159 | |||
160 | if (is_ram == REGION_MIXED) { | ||
161 | WARN_ONCE(1, "%s attempted on mixed region %pr\n", | ||
162 | __func__, res); | ||
163 | return ERR_PTR(-ENXIO); | ||
164 | } | ||
165 | |||
166 | if (is_ram == REGION_INTERSECTS) | ||
167 | return __va(res->start); | ||
168 | |||
169 | page_map = devres_alloc(devm_memremap_pages_release, | ||
170 | sizeof(*page_map), GFP_KERNEL); | ||
171 | if (!page_map) | ||
172 | return ERR_PTR(-ENOMEM); | ||
173 | |||
174 | memcpy(&page_map->res, res, sizeof(*res)); | ||
175 | |||
176 | nid = dev_to_node(dev); | ||
177 | if (nid < 0) | ||
178 | nid = 0; | ||
179 | |||
180 | error = arch_add_memory(nid, res->start, resource_size(res), true); | ||
181 | if (error) { | ||
182 | devres_free(page_map); | ||
183 | return ERR_PTR(error); | ||
184 | } | ||
185 | |||
186 | devres_add(dev, page_map); | ||
187 | return __va(res->start); | ||
188 | } | ||
189 | EXPORT_SYMBOL(devm_memremap_pages); | ||
190 | #endif /* CONFIG_ZONE_DEVICE */ | ||
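
A hedged sketch of a ZONE_DEVICE consumer (for example a pmem-style driver); the probe signature and the origin of the struct resource are assumptions, and the prototype is taken to live alongside the devm helpers above:

#include <linux/device.h>
#include <linux/err.h>
#include <linux/io.h>
#include <linux/ioport.h>

static int example_probe(struct device *dev, struct resource *res)
{
	void *base = devm_memremap_pages(dev, res);

	if (IS_ERR(base))
		return PTR_ERR(base);

	/*
	 * base lies in the kernel linear map and is backed by struct page,
	 * so it can be fed to DMA and block-I/O paths directly; teardown
	 * happens automatically via devres when the device goes away.
	 */
	dev_info(dev, "mapped %pR at %p\n", res, base);
	return 0;
}
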
diff --git a/kernel/module.c b/kernel/module.c index 4d2b82e610e2..b86b7bf1be38 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -602,13 +602,16 @@ const struct kernel_symbol *find_symbol(const char *name, | |||
602 | } | 602 | } |
603 | EXPORT_SYMBOL_GPL(find_symbol); | 603 | EXPORT_SYMBOL_GPL(find_symbol); |
604 | 604 | ||
605 | /* Search for module by name: must hold module_mutex. */ | 605 | /* |
606 | * Search for module by name: must hold module_mutex (or preempt disabled | ||
607 | * for read-only access). | ||
608 | */ | ||
606 | static struct module *find_module_all(const char *name, size_t len, | 609 | static struct module *find_module_all(const char *name, size_t len, |
607 | bool even_unformed) | 610 | bool even_unformed) |
608 | { | 611 | { |
609 | struct module *mod; | 612 | struct module *mod; |
610 | 613 | ||
611 | module_assert_mutex(); | 614 | module_assert_mutex_or_preempt(); |
612 | 615 | ||
613 | list_for_each_entry(mod, &modules, list) { | 616 | list_for_each_entry(mod, &modules, list) { |
614 | if (!even_unformed && mod->state == MODULE_STATE_UNFORMED) | 617 | if (!even_unformed && mod->state == MODULE_STATE_UNFORMED) |
@@ -621,6 +624,7 @@ static struct module *find_module_all(const char *name, size_t len, | |||
621 | 624 | ||
622 | struct module *find_module(const char *name) | 625 | struct module *find_module(const char *name) |
623 | { | 626 | { |
627 | module_assert_mutex(); | ||
624 | return find_module_all(name, strlen(name), false); | 628 | return find_module_all(name, strlen(name), false); |
625 | } | 629 | } |
626 | EXPORT_SYMBOL_GPL(find_module); | 630 | EXPORT_SYMBOL_GPL(find_module); |
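
The relaxed assertion allows read-only walkers that hold off module unloading with preemption disabled (module removal waits for an RCU-sched grace period). A sketch of such a caller, written as it would have to sit inside kernel/module.c since find_module_all() is static there; the function name is hypothetical:

/* Hypothetical read-only lookup relying on preempt-disable, not module_mutex. */
static bool example_module_is_live(const char *name)
{
	struct module *mod;
	bool live;

	preempt_disable();
	mod = find_module_all(name, strlen(name), false);
	live = mod && mod->state == MODULE_STATE_LIVE;
	preempt_enable();

	return live;
}
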
diff --git a/kernel/module_signing.c b/kernel/module_signing.c index be5b8fac4bd0..bd62f5cda746 100644 --- a/kernel/module_signing.c +++ b/kernel/module_signing.c | |||
@@ -10,11 +10,8 @@ | |||
10 | */ | 10 | */ |
11 | 11 | ||
12 | #include <linux/kernel.h> | 12 | #include <linux/kernel.h> |
13 | #include <linux/err.h> | ||
14 | #include <crypto/public_key.h> | ||
15 | #include <crypto/hash.h> | ||
16 | #include <keys/asymmetric-type.h> | ||
17 | #include <keys/system_keyring.h> | 13 | #include <keys/system_keyring.h> |
14 | #include <crypto/public_key.h> | ||
18 | #include "module-internal.h" | 15 | #include "module-internal.h" |
19 | 16 | ||
20 | /* | 17 | /* |
@@ -28,170 +25,22 @@ | |||
28 | * - Information block | 25 | * - Information block |
29 | */ | 26 | */ |
30 | struct module_signature { | 27 | struct module_signature { |
31 | u8 algo; /* Public-key crypto algorithm [enum pkey_algo] */ | 28 | u8 algo; /* Public-key crypto algorithm [0] */ |
32 | u8 hash; /* Digest algorithm [enum hash_algo] */ | 29 | u8 hash; /* Digest algorithm [0] */ |
33 | u8 id_type; /* Key identifier type [enum pkey_id_type] */ | 30 | u8 id_type; /* Key identifier type [PKEY_ID_PKCS7] */ |
34 | u8 signer_len; /* Length of signer's name */ | 31 | u8 signer_len; /* Length of signer's name [0] */ |
35 | u8 key_id_len; /* Length of key identifier */ | 32 | u8 key_id_len; /* Length of key identifier [0] */ |
36 | u8 __pad[3]; | 33 | u8 __pad[3]; |
37 | __be32 sig_len; /* Length of signature data */ | 34 | __be32 sig_len; /* Length of signature data */ |
38 | }; | 35 | }; |
39 | 36 | ||
40 | /* | 37 | /* |
41 | * Digest the module contents. | ||
42 | */ | ||
43 | static struct public_key_signature *mod_make_digest(enum hash_algo hash, | ||
44 | const void *mod, | ||
45 | unsigned long modlen) | ||
46 | { | ||
47 | struct public_key_signature *pks; | ||
48 | struct crypto_shash *tfm; | ||
49 | struct shash_desc *desc; | ||
50 | size_t digest_size, desc_size; | ||
51 | int ret; | ||
52 | |||
53 | pr_devel("==>%s()\n", __func__); | ||
54 | |||
55 | /* Allocate the hashing algorithm we're going to need and find out how | ||
56 | * big the hash operational data will be. | ||
57 | */ | ||
58 | tfm = crypto_alloc_shash(hash_algo_name[hash], 0, 0); | ||
59 | if (IS_ERR(tfm)) | ||
60 | return (PTR_ERR(tfm) == -ENOENT) ? ERR_PTR(-ENOPKG) : ERR_CAST(tfm); | ||
61 | |||
62 | desc_size = crypto_shash_descsize(tfm) + sizeof(*desc); | ||
63 | digest_size = crypto_shash_digestsize(tfm); | ||
64 | |||
65 | /* We allocate the hash operational data storage on the end of our | ||
66 | * context data and the digest output buffer on the end of that. | ||
67 | */ | ||
68 | ret = -ENOMEM; | ||
69 | pks = kzalloc(digest_size + sizeof(*pks) + desc_size, GFP_KERNEL); | ||
70 | if (!pks) | ||
71 | goto error_no_pks; | ||
72 | |||
73 | pks->pkey_hash_algo = hash; | ||
74 | pks->digest = (u8 *)pks + sizeof(*pks) + desc_size; | ||
75 | pks->digest_size = digest_size; | ||
76 | |||
77 | desc = (void *)pks + sizeof(*pks); | ||
78 | desc->tfm = tfm; | ||
79 | desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; | ||
80 | |||
81 | ret = crypto_shash_init(desc); | ||
82 | if (ret < 0) | ||
83 | goto error; | ||
84 | |||
85 | ret = crypto_shash_finup(desc, mod, modlen, pks->digest); | ||
86 | if (ret < 0) | ||
87 | goto error; | ||
88 | |||
89 | crypto_free_shash(tfm); | ||
90 | pr_devel("<==%s() = ok\n", __func__); | ||
91 | return pks; | ||
92 | |||
93 | error: | ||
94 | kfree(pks); | ||
95 | error_no_pks: | ||
96 | crypto_free_shash(tfm); | ||
97 | pr_devel("<==%s() = %d\n", __func__, ret); | ||
98 | return ERR_PTR(ret); | ||
99 | } | ||
100 | |||
101 | /* | ||
102 | * Extract an MPI array from the signature data. This represents the actual | ||
103 | * signature. Each raw MPI is prefaced by a BE 2-byte value indicating the | ||
104 | * size of the MPI in bytes. | ||
105 | * | ||
106 | * RSA signatures only have one MPI, so currently we only read one. | ||
107 | */ | ||
108 | static int mod_extract_mpi_array(struct public_key_signature *pks, | ||
109 | const void *data, size_t len) | ||
110 | { | ||
111 | size_t nbytes; | ||
112 | MPI mpi; | ||
113 | |||
114 | if (len < 3) | ||
115 | return -EBADMSG; | ||
116 | nbytes = ((const u8 *)data)[0] << 8 | ((const u8 *)data)[1]; | ||
117 | data += 2; | ||
118 | len -= 2; | ||
119 | if (len != nbytes) | ||
120 | return -EBADMSG; | ||
121 | |||
122 | mpi = mpi_read_raw_data(data, nbytes); | ||
123 | if (!mpi) | ||
124 | return -ENOMEM; | ||
125 | pks->mpi[0] = mpi; | ||
126 | pks->nr_mpi = 1; | ||
127 | return 0; | ||
128 | } | ||
129 | |||
130 | /* | ||
131 | * Request an asymmetric key. | ||
132 | */ | ||
133 | static struct key *request_asymmetric_key(const char *signer, size_t signer_len, | ||
134 | const u8 *key_id, size_t key_id_len) | ||
135 | { | ||
136 | key_ref_t key; | ||
137 | size_t i; | ||
138 | char *id, *q; | ||
139 | |||
140 | pr_devel("==>%s(,%zu,,%zu)\n", __func__, signer_len, key_id_len); | ||
141 | |||
142 | /* Construct an identifier. */ | ||
143 | id = kmalloc(signer_len + 2 + key_id_len * 2 + 1, GFP_KERNEL); | ||
144 | if (!id) | ||
145 | return ERR_PTR(-ENOKEY); | ||
146 | |||
147 | memcpy(id, signer, signer_len); | ||
148 | |||
149 | q = id + signer_len; | ||
150 | *q++ = ':'; | ||
151 | *q++ = ' '; | ||
152 | for (i = 0; i < key_id_len; i++) { | ||
153 | *q++ = hex_asc[*key_id >> 4]; | ||
154 | *q++ = hex_asc[*key_id++ & 0x0f]; | ||
155 | } | ||
156 | |||
157 | *q = 0; | ||
158 | |||
159 | pr_debug("Look up: \"%s\"\n", id); | ||
160 | |||
161 | key = keyring_search(make_key_ref(system_trusted_keyring, 1), | ||
162 | &key_type_asymmetric, id); | ||
163 | if (IS_ERR(key)) | ||
164 | pr_warn("Request for unknown module key '%s' err %ld\n", | ||
165 | id, PTR_ERR(key)); | ||
166 | kfree(id); | ||
167 | |||
168 | if (IS_ERR(key)) { | ||
169 | switch (PTR_ERR(key)) { | ||
170 | /* Hide some search errors */ | ||
171 | case -EACCES: | ||
172 | case -ENOTDIR: | ||
173 | case -EAGAIN: | ||
174 | return ERR_PTR(-ENOKEY); | ||
175 | default: | ||
176 | return ERR_CAST(key); | ||
177 | } | ||
178 | } | ||
179 | |||
180 | pr_devel("<==%s() = 0 [%x]\n", __func__, key_serial(key_ref_to_ptr(key))); | ||
181 | return key_ref_to_ptr(key); | ||
182 | } | ||
183 | |||
184 | /* | ||
185 | * Verify the signature on a module. | 38 | * Verify the signature on a module. |
186 | */ | 39 | */ |
187 | int mod_verify_sig(const void *mod, unsigned long *_modlen) | 40 | int mod_verify_sig(const void *mod, unsigned long *_modlen) |
188 | { | 41 | { |
189 | struct public_key_signature *pks; | ||
190 | struct module_signature ms; | 42 | struct module_signature ms; |
191 | struct key *key; | ||
192 | const void *sig; | ||
193 | size_t modlen = *_modlen, sig_len; | 43 | size_t modlen = *_modlen, sig_len; |
194 | int ret; | ||
195 | 44 | ||
196 | pr_devel("==>%s(,%zu)\n", __func__, modlen); | 45 | pr_devel("==>%s(,%zu)\n", __func__, modlen); |
197 | 46 | ||
@@ -205,46 +54,24 @@ int mod_verify_sig(const void *mod, unsigned long *_modlen) | |||
205 | if (sig_len >= modlen) | 54 | if (sig_len >= modlen) |
206 | return -EBADMSG; | 55 | return -EBADMSG; |
207 | modlen -= sig_len; | 56 | modlen -= sig_len; |
208 | if ((size_t)ms.signer_len + ms.key_id_len >= modlen) | ||
209 | return -EBADMSG; | ||
210 | modlen -= (size_t)ms.signer_len + ms.key_id_len; | ||
211 | |||
212 | *_modlen = modlen; | 57 | *_modlen = modlen; |
213 | sig = mod + modlen; | ||
214 | |||
215 | /* For the moment, only support RSA and X.509 identifiers */ | ||
216 | if (ms.algo != PKEY_ALGO_RSA || | ||
217 | ms.id_type != PKEY_ID_X509) | ||
218 | return -ENOPKG; | ||
219 | 58 | ||
220 | if (ms.hash >= PKEY_HASH__LAST || | 59 | if (ms.id_type != PKEY_ID_PKCS7) { |
221 | !hash_algo_name[ms.hash]) | 60 | pr_err("Module is not signed with expected PKCS#7 message\n"); |
222 | return -ENOPKG; | 61 | return -ENOPKG; |
223 | |||
224 | key = request_asymmetric_key(sig, ms.signer_len, | ||
225 | sig + ms.signer_len, ms.key_id_len); | ||
226 | if (IS_ERR(key)) | ||
227 | return PTR_ERR(key); | ||
228 | |||
229 | pks = mod_make_digest(ms.hash, mod, modlen); | ||
230 | if (IS_ERR(pks)) { | ||
231 | ret = PTR_ERR(pks); | ||
232 | goto error_put_key; | ||
233 | } | 62 | } |
234 | 63 | ||
235 | ret = mod_extract_mpi_array(pks, sig + ms.signer_len + ms.key_id_len, | 64 | if (ms.algo != 0 || |
236 | sig_len); | 65 | ms.hash != 0 || |
237 | if (ret < 0) | 66 | ms.signer_len != 0 || |
238 | goto error_free_pks; | 67 | ms.key_id_len != 0 || |
239 | 68 | ms.__pad[0] != 0 || | |
240 | ret = verify_signature(key, pks); | 69 | ms.__pad[1] != 0 || |
241 | pr_devel("verify_signature() = %d\n", ret); | 70 | ms.__pad[2] != 0) { |
71 | pr_err("PKCS#7 signature info has unexpected non-zero params\n"); | ||
72 | return -EBADMSG; | ||
73 | } | ||
242 | 74 | ||
243 | error_free_pks: | 75 | return system_verify_data(mod, modlen, mod + modlen, sig_len, |
244 | mpi_free(pks->rsa.s); | 76 | VERIFYING_MODULE_SIGNATURE); |
245 | kfree(pks); | ||
246 | error_put_key: | ||
247 | key_put(key); | ||
248 | pr_devel("<==%s() = %d\n", __func__, ret); | ||
249 | return ret; | ||
250 | } | 77 | } |
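
For reference, the trailer the new code expects is simply the PKCS#7 blob followed by the fixed-size struct module_signature above (the signature marker string is stripped by the caller in kernel/module.c before mod_verify_sig() runs). A hedged helper sketch, as it might sit in module_signing.c, that locates the blob the same way:

/* Layout assumed by the code above:
 *   [ module data ][ PKCS#7 blob, sig_len bytes ][ struct module_signature ]
 * Hypothetical helper mirroring mod_verify_sig()'s bounds checks. */
static int example_split_signed_module(const void *mod, unsigned long len,
					const void **pkcs7, size_t *pkcs7_len)
{
	struct module_signature ms;

	if (len <= sizeof(ms))
		return -EBADMSG;
	memcpy(&ms, mod + len - sizeof(ms), sizeof(ms));
	if (ms.id_type != PKEY_ID_PKCS7)
		return -ENOPKG;			/* not PKCS#7-signed */

	*pkcs7_len = be32_to_cpu(ms.sig_len);
	if (*pkcs7_len >= len - sizeof(ms))
		return -EBADMSG;
	*pkcs7 = mod + len - sizeof(ms) - *pkcs7_len;
	return 0;
}
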
diff --git a/kernel/notifier.c b/kernel/notifier.c index ae9fc7cc360e..fd2c9acbcc19 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c | |||
@@ -544,6 +544,8 @@ int notrace notify_die(enum die_val val, const char *str, | |||
544 | .signr = sig, | 544 | .signr = sig, |
545 | 545 | ||
546 | }; | 546 | }; |
547 | RCU_LOCKDEP_WARN(!rcu_is_watching(), | ||
548 | "notify_die called but RCU thinks we're quiescent"); | ||
547 | return atomic_notifier_call_chain(&die_chain, val, &args); | 549 | return atomic_notifier_call_chain(&die_chain, val, &args); |
548 | } | 550 | } |
549 | NOKPROBE_SYMBOL(notify_die); | 551 | NOKPROBE_SYMBOL(notify_die); |
diff --git a/kernel/pid.c b/kernel/pid.c index 4fd07d5b7baf..ca368793808e 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
@@ -451,9 +451,8 @@ EXPORT_SYMBOL(pid_task); | |||
451 | */ | 451 | */ |
452 | struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) | 452 | struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) |
453 | { | 453 | { |
454 | rcu_lockdep_assert(rcu_read_lock_held(), | 454 | RCU_LOCKDEP_WARN(!rcu_read_lock_held(), |
455 | "find_task_by_pid_ns() needs rcu_read_lock()" | 455 | "find_task_by_pid_ns() needs rcu_read_lock() protection"); |
456 | " protection"); | ||
457 | return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); | 456 | return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); |
458 | } | 457 | } |
459 | 458 | ||
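
Note that RCU_LOCKDEP_WARN() inverts the sense of the old rcu_lockdep_assert(): it takes the condition that should trigger the splat, not the one that must hold. A minimal sketch modeled on the check above (function name and placement hypothetical):

#include <linux/pid.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>

static struct task_struct *example_find_task(pid_t nr, struct pid_namespace *ns)
{
	/*
	 * Old:  rcu_lockdep_assert(rcu_read_lock_held(), "...");
	 * New:  warn when the *bad* condition is observed.
	 */
	RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
			 "example_find_task() needs rcu_read_lock() protection");
	return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
}
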
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 9e302315e33d..02e8dfaa1ce2 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
@@ -18,6 +18,16 @@ config SUSPEND_FREEZER | |||
18 | 18 | ||
19 | Turning OFF this setting is NOT recommended! If in doubt, say Y. | 19 | Turning OFF this setting is NOT recommended! If in doubt, say Y. |
20 | 20 | ||
21 | config SUSPEND_SKIP_SYNC | ||
22 | bool "Skip kernel's sys_sync() on suspend to RAM/standby" | ||
23 | depends on SUSPEND | ||
24 | depends on EXPERT | ||
25 | help | ||
26 | Skip the kernel sys_sync() before freezing user processes. | ||
27 | Some systems prefer not to pay this cost on every invocation | ||
28 | of suspend, or they are content with invoking sync() from | ||
29 | user-space before invoking suspend. Say Y if that's your case. | ||
30 | |||
21 | config HIBERNATE_CALLBACKS | 31 | config HIBERNATE_CALLBACKS |
22 | bool | 32 | bool |
23 | 33 | ||
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 53266b729fd9..7e4cda4a8dd9 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c | |||
@@ -484,11 +484,13 @@ static int enter_state(suspend_state_t state) | |||
484 | if (state == PM_SUSPEND_FREEZE) | 484 | if (state == PM_SUSPEND_FREEZE) |
485 | freeze_begin(); | 485 | freeze_begin(); |
486 | 486 | ||
487 | #ifndef CONFIG_SUSPEND_SKIP_SYNC | ||
487 | trace_suspend_resume(TPS("sync_filesystems"), 0, true); | 488 | trace_suspend_resume(TPS("sync_filesystems"), 0, true); |
488 | printk(KERN_INFO "PM: Syncing filesystems ... "); | 489 | printk(KERN_INFO "PM: Syncing filesystems ... "); |
489 | sys_sync(); | 490 | sys_sync(); |
490 | printk("done.\n"); | 491 | printk("done.\n"); |
491 | trace_suspend_resume(TPS("sync_filesystems"), 0, false); | 492 | trace_suspend_resume(TPS("sync_filesystems"), 0, false); |
493 | #endif | ||
492 | 494 | ||
493 | pr_debug("PM: Preparing system for sleep (%s)\n", pm_states[state]); | 495 | pr_debug("PM: Preparing system for sleep (%s)\n", pm_states[state]); |
494 | error = suspend_prepare(state); | 496 | error = suspend_prepare(state); |
diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 2f30ca91e4fa..b2066fb5b10f 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c | |||
@@ -227,27 +227,23 @@ static void hib_init_batch(struct hib_bio_batch *hb) | |||
227 | hb->error = 0; | 227 | hb->error = 0; |
228 | } | 228 | } |
229 | 229 | ||
230 | static void hib_end_io(struct bio *bio, int error) | 230 | static void hib_end_io(struct bio *bio) |
231 | { | 231 | { |
232 | struct hib_bio_batch *hb = bio->bi_private; | 232 | struct hib_bio_batch *hb = bio->bi_private; |
233 | const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | ||
234 | struct page *page = bio->bi_io_vec[0].bv_page; | 233 | struct page *page = bio->bi_io_vec[0].bv_page; |
235 | 234 | ||
236 | if (!uptodate || error) { | 235 | if (bio->bi_error) { |
237 | printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n", | 236 | printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n", |
238 | imajor(bio->bi_bdev->bd_inode), | 237 | imajor(bio->bi_bdev->bd_inode), |
239 | iminor(bio->bi_bdev->bd_inode), | 238 | iminor(bio->bi_bdev->bd_inode), |
240 | (unsigned long long)bio->bi_iter.bi_sector); | 239 | (unsigned long long)bio->bi_iter.bi_sector); |
241 | |||
242 | if (!error) | ||
243 | error = -EIO; | ||
244 | } | 240 | } |
245 | 241 | ||
246 | if (bio_data_dir(bio) == WRITE) | 242 | if (bio_data_dir(bio) == WRITE) |
247 | put_page(page); | 243 | put_page(page); |
248 | 244 | ||
249 | if (error && !hb->error) | 245 | if (bio->bi_error && !hb->error) |
250 | hb->error = error; | 246 | hb->error = bio->bi_error; |
251 | if (atomic_dec_and_test(&hb->count)) | 247 | if (atomic_dec_and_test(&hb->count)) |
252 | wake_up(&hb->wait); | 248 | wake_up(&hb->wait); |
253 | 249 | ||
diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c index 019069c84ff6..1896386e16bb 100644 --- a/kernel/power/wakelock.c +++ b/kernel/power/wakelock.c | |||
@@ -17,6 +17,7 @@ | |||
17 | #include <linux/list.h> | 17 | #include <linux/list.h> |
18 | #include <linux/rbtree.h> | 18 | #include <linux/rbtree.h> |
19 | #include <linux/slab.h> | 19 | #include <linux/slab.h> |
20 | #include <linux/workqueue.h> | ||
20 | 21 | ||
21 | #include "power.h" | 22 | #include "power.h" |
22 | 23 | ||
@@ -83,7 +84,9 @@ static inline void decrement_wakelocks_number(void) {} | |||
83 | #define WL_GC_COUNT_MAX 100 | 84 | #define WL_GC_COUNT_MAX 100 |
84 | #define WL_GC_TIME_SEC 300 | 85 | #define WL_GC_TIME_SEC 300 |
85 | 86 | ||
87 | static void __wakelocks_gc(struct work_struct *work); | ||
86 | static LIST_HEAD(wakelocks_lru_list); | 88 | static LIST_HEAD(wakelocks_lru_list); |
89 | static DECLARE_WORK(wakelock_work, __wakelocks_gc); | ||
87 | static unsigned int wakelocks_gc_count; | 90 | static unsigned int wakelocks_gc_count; |
88 | 91 | ||
89 | static inline void wakelocks_lru_add(struct wakelock *wl) | 92 | static inline void wakelocks_lru_add(struct wakelock *wl) |
@@ -96,13 +99,12 @@ static inline void wakelocks_lru_most_recent(struct wakelock *wl) | |||
96 | list_move(&wl->lru, &wakelocks_lru_list); | 99 | list_move(&wl->lru, &wakelocks_lru_list); |
97 | } | 100 | } |
98 | 101 | ||
99 | static void wakelocks_gc(void) | 102 | static void __wakelocks_gc(struct work_struct *work) |
100 | { | 103 | { |
101 | struct wakelock *wl, *aux; | 104 | struct wakelock *wl, *aux; |
102 | ktime_t now; | 105 | ktime_t now; |
103 | 106 | ||
104 | if (++wakelocks_gc_count <= WL_GC_COUNT_MAX) | 107 | mutex_lock(&wakelocks_lock); |
105 | return; | ||
106 | 108 | ||
107 | now = ktime_get(); | 109 | now = ktime_get(); |
108 | list_for_each_entry_safe_reverse(wl, aux, &wakelocks_lru_list, lru) { | 110 | list_for_each_entry_safe_reverse(wl, aux, &wakelocks_lru_list, lru) { |
@@ -127,6 +129,16 @@ static void wakelocks_gc(void) | |||
127 | } | 129 | } |
128 | } | 130 | } |
129 | wakelocks_gc_count = 0; | 131 | wakelocks_gc_count = 0; |
132 | |||
133 | mutex_unlock(&wakelocks_lock); | ||
134 | } | ||
135 | |||
136 | static void wakelocks_gc(void) | ||
137 | { | ||
138 | if (++wakelocks_gc_count <= WL_GC_COUNT_MAX) | ||
139 | return; | ||
140 | |||
141 | schedule_work(&wakelock_work); | ||
130 | } | 142 | } |
131 | #else /* !CONFIG_PM_WAKELOCKS_GC */ | 143 | #else /* !CONFIG_PM_WAKELOCKS_GC */ |
132 | static inline void wakelocks_lru_add(struct wakelock *wl) {} | 144 | static inline void wakelocks_lru_add(struct wakelock *wl) {} |
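
The change above defers garbage collection to a workqueue so that wakelocks_lock can be taken in sleepable process context rather than on the wakeup path. A stripped-down sketch of the same deferral pattern with hypothetical names:

#include <linux/mutex.h>
#include <linux/workqueue.h>

static DEFINE_MUTEX(example_lock);
static unsigned int example_count;

static void example_gc_fn(struct work_struct *work)
{
	mutex_lock(&example_lock);	/* safe: worker runs in process context */
	/* ... reclaim stale entries here ... */
	example_count = 0;
	mutex_unlock(&example_lock);
}
static DECLARE_WORK(example_gc_work, example_gc_fn);

/* Hot path: never sleeps, only kicks the worker occasionally. */
static void example_maybe_gc(void)
{
	if (++example_count > 100)
		schedule_work(&example_gc_work);
}
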
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index cf8c24203368..8f0324ef72ab 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c | |||
@@ -835,7 +835,7 @@ const struct file_operations kmsg_fops = { | |||
835 | .release = devkmsg_release, | 835 | .release = devkmsg_release, |
836 | }; | 836 | }; |
837 | 837 | ||
838 | #ifdef CONFIG_KEXEC | 838 | #ifdef CONFIG_KEXEC_CORE |
839 | /* | 839 | /* |
840 | * This appends the listed symbols to /proc/vmcore | 840 | * This appends the listed symbols to /proc/vmcore |
841 | * | 841 | * |
diff --git a/kernel/profile.c b/kernel/profile.c index a7bcd28d6e9f..99513e1160e5 100644 --- a/kernel/profile.c +++ b/kernel/profile.c | |||
@@ -339,7 +339,7 @@ static int profile_cpu_callback(struct notifier_block *info, | |||
339 | node = cpu_to_mem(cpu); | 339 | node = cpu_to_mem(cpu); |
340 | per_cpu(cpu_profile_flip, cpu) = 0; | 340 | per_cpu(cpu_profile_flip, cpu) = 0; |
341 | if (!per_cpu(cpu_profile_hits, cpu)[1]) { | 341 | if (!per_cpu(cpu_profile_hits, cpu)[1]) { |
342 | page = alloc_pages_exact_node(node, | 342 | page = __alloc_pages_node(node, |
343 | GFP_KERNEL | __GFP_ZERO, | 343 | GFP_KERNEL | __GFP_ZERO, |
344 | 0); | 344 | 0); |
345 | if (!page) | 345 | if (!page) |
@@ -347,7 +347,7 @@ static int profile_cpu_callback(struct notifier_block *info, | |||
347 | per_cpu(cpu_profile_hits, cpu)[1] = page_address(page); | 347 | per_cpu(cpu_profile_hits, cpu)[1] = page_address(page); |
348 | } | 348 | } |
349 | if (!per_cpu(cpu_profile_hits, cpu)[0]) { | 349 | if (!per_cpu(cpu_profile_hits, cpu)[0]) { |
350 | page = alloc_pages_exact_node(node, | 350 | page = __alloc_pages_node(node, |
351 | GFP_KERNEL | __GFP_ZERO, | 351 | GFP_KERNEL | __GFP_ZERO, |
352 | 0); | 352 | 0); |
353 | if (!page) | 353 | if (!page) |
@@ -543,14 +543,14 @@ static int create_hash_tables(void) | |||
543 | int node = cpu_to_mem(cpu); | 543 | int node = cpu_to_mem(cpu); |
544 | struct page *page; | 544 | struct page *page; |
545 | 545 | ||
546 | page = alloc_pages_exact_node(node, | 546 | page = __alloc_pages_node(node, |
547 | GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, | 547 | GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, |
548 | 0); | 548 | 0); |
549 | if (!page) | 549 | if (!page) |
550 | goto out_cleanup; | 550 | goto out_cleanup; |
551 | per_cpu(cpu_profile_hits, cpu)[1] | 551 | per_cpu(cpu_profile_hits, cpu)[1] |
552 | = (struct profile_hit *)page_address(page); | 552 | = (struct profile_hit *)page_address(page); |
553 | page = alloc_pages_exact_node(node, | 553 | page = __alloc_pages_node(node, |
554 | GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, | 554 | GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, |
555 | 0); | 555 | 0); |
556 | if (!page) | 556 | if (!page) |
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index c8e0e050a36a..787320de68e0 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
@@ -556,6 +556,19 @@ static int ptrace_setoptions(struct task_struct *child, unsigned long data) | |||
556 | if (data & ~(unsigned long)PTRACE_O_MASK) | 556 | if (data & ~(unsigned long)PTRACE_O_MASK) |
557 | return -EINVAL; | 557 | return -EINVAL; |
558 | 558 | ||
559 | if (unlikely(data & PTRACE_O_SUSPEND_SECCOMP)) { | ||
560 | if (!config_enabled(CONFIG_CHECKPOINT_RESTORE) || | ||
561 | !config_enabled(CONFIG_SECCOMP)) | ||
562 | return -EINVAL; | ||
563 | |||
564 | if (!capable(CAP_SYS_ADMIN)) | ||
565 | return -EPERM; | ||
566 | |||
567 | if (seccomp_mode(¤t->seccomp) != SECCOMP_MODE_DISABLED || | ||
568 | current->ptrace & PT_SUSPEND_SECCOMP) | ||
569 | return -EPERM; | ||
570 | } | ||
571 | |||
559 | /* Avoid intermediate state when all opts are cleared */ | 572 | /* Avoid intermediate state when all opts are cleared */ |
560 | flags = child->ptrace; | 573 | flags = child->ptrace; |
561 | flags &= ~(PTRACE_O_MASK << PT_OPT_FLAG_SHIFT); | 574 | flags &= ~(PTRACE_O_MASK << PT_OPT_FLAG_SHIFT); |
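
The new option targets checkpoint/restore style tracers. A hedged user-space sketch of setting it; it assumes a tracer with CAP_SYS_ADMIN that is not itself confined by seccomp (matching the checks above), and headers new enough to define PTRACE_O_SUSPEND_SECCOMP:

#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <stdio.h>

#ifndef PTRACE_O_SUSPEND_SECCOMP
#define PTRACE_O_SUSPEND_SECCOMP (1 << 21)	/* from <linux/ptrace.h>, 4.3+ */
#endif

static int suspend_seccomp_of(pid_t pid)
{
	if (ptrace(PTRACE_ATTACH, pid, NULL, NULL))
		return -1;
	waitpid(pid, NULL, 0);			/* wait for the ptrace stop */
	if (ptrace(PTRACE_SETOPTIONS, pid, NULL,
		   (void *)(long)PTRACE_O_SUSPEND_SECCOMP)) {
		perror("PTRACE_SETOPTIONS");	/* EINVAL/EPERM per the checks above */
		ptrace(PTRACE_DETACH, pid, NULL, NULL);
		return -1;
	}
	/*
	 * Seccomp filtering of the tracee is bypassed from here on, until
	 * the option is cleared or the tracer detaches.
	 */
	return 0;
}
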
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 59e32684c23b..77192953dee5 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c | |||
@@ -635,6 +635,8 @@ static struct rcu_torture_ops sched_ops = { | |||
635 | .deferred_free = rcu_sched_torture_deferred_free, | 635 | .deferred_free = rcu_sched_torture_deferred_free, |
636 | .sync = synchronize_sched, | 636 | .sync = synchronize_sched, |
637 | .exp_sync = synchronize_sched_expedited, | 637 | .exp_sync = synchronize_sched_expedited, |
638 | .get_state = get_state_synchronize_sched, | ||
639 | .cond_sync = cond_synchronize_sched, | ||
638 | .call = call_rcu_sched, | 640 | .call = call_rcu_sched, |
639 | .cb_barrier = rcu_barrier_sched, | 641 | .cb_barrier = rcu_barrier_sched, |
640 | .fqs = rcu_sched_force_quiescent_state, | 642 | .fqs = rcu_sched_force_quiescent_state, |
@@ -684,10 +686,20 @@ static struct rcu_torture_ops tasks_ops = { | |||
684 | 686 | ||
685 | #define RCUTORTURE_TASKS_OPS &tasks_ops, | 687 | #define RCUTORTURE_TASKS_OPS &tasks_ops, |
686 | 688 | ||
689 | static bool __maybe_unused torturing_tasks(void) | ||
690 | { | ||
691 | return cur_ops == &tasks_ops; | ||
692 | } | ||
693 | |||
687 | #else /* #ifdef CONFIG_TASKS_RCU */ | 694 | #else /* #ifdef CONFIG_TASKS_RCU */ |
688 | 695 | ||
689 | #define RCUTORTURE_TASKS_OPS | 696 | #define RCUTORTURE_TASKS_OPS |
690 | 697 | ||
698 | static bool torturing_tasks(void) | ||
699 | { | ||
700 | return false; | ||
701 | } | ||
702 | |||
691 | #endif /* #else #ifdef CONFIG_TASKS_RCU */ | 703 | #endif /* #else #ifdef CONFIG_TASKS_RCU */ |
692 | 704 | ||
693 | /* | 705 | /* |
@@ -823,9 +835,7 @@ rcu_torture_cbflood(void *arg) | |||
823 | } | 835 | } |
824 | if (err) { | 836 | if (err) { |
825 | VERBOSE_TOROUT_STRING("rcu_torture_cbflood disabled: Bad args or OOM"); | 837 | VERBOSE_TOROUT_STRING("rcu_torture_cbflood disabled: Bad args or OOM"); |
826 | while (!torture_must_stop()) | 838 | goto wait_for_stop; |
827 | schedule_timeout_interruptible(HZ); | ||
828 | return 0; | ||
829 | } | 839 | } |
830 | VERBOSE_TOROUT_STRING("rcu_torture_cbflood task started"); | 840 | VERBOSE_TOROUT_STRING("rcu_torture_cbflood task started"); |
831 | do { | 841 | do { |
@@ -844,6 +854,7 @@ rcu_torture_cbflood(void *arg) | |||
844 | stutter_wait("rcu_torture_cbflood"); | 854 | stutter_wait("rcu_torture_cbflood"); |
845 | } while (!torture_must_stop()); | 855 | } while (!torture_must_stop()); |
846 | vfree(rhp); | 856 | vfree(rhp); |
857 | wait_for_stop: | ||
847 | torture_kthread_stopping("rcu_torture_cbflood"); | 858 | torture_kthread_stopping("rcu_torture_cbflood"); |
848 | return 0; | 859 | return 0; |
849 | } | 860 | } |
@@ -1088,7 +1099,8 @@ static void rcu_torture_timer(unsigned long unused) | |||
1088 | p = rcu_dereference_check(rcu_torture_current, | 1099 | p = rcu_dereference_check(rcu_torture_current, |
1089 | rcu_read_lock_bh_held() || | 1100 | rcu_read_lock_bh_held() || |
1090 | rcu_read_lock_sched_held() || | 1101 | rcu_read_lock_sched_held() || |
1091 | srcu_read_lock_held(srcu_ctlp)); | 1102 | srcu_read_lock_held(srcu_ctlp) || |
1103 | torturing_tasks()); | ||
1092 | if (p == NULL) { | 1104 | if (p == NULL) { |
1093 | /* Leave because rcu_torture_writer is not yet underway */ | 1105 | /* Leave because rcu_torture_writer is not yet underway */ |
1094 | cur_ops->readunlock(idx); | 1106 | cur_ops->readunlock(idx); |
@@ -1162,7 +1174,8 @@ rcu_torture_reader(void *arg) | |||
1162 | p = rcu_dereference_check(rcu_torture_current, | 1174 | p = rcu_dereference_check(rcu_torture_current, |
1163 | rcu_read_lock_bh_held() || | 1175 | rcu_read_lock_bh_held() || |
1164 | rcu_read_lock_sched_held() || | 1176 | rcu_read_lock_sched_held() || |
1165 | srcu_read_lock_held(srcu_ctlp)); | 1177 | srcu_read_lock_held(srcu_ctlp) || |
1178 | torturing_tasks()); | ||
1166 | if (p == NULL) { | 1179 | if (p == NULL) { |
1167 | /* Wait for rcu_torture_writer to get underway */ | 1180 | /* Wait for rcu_torture_writer to get underway */ |
1168 | cur_ops->readunlock(idx); | 1181 | cur_ops->readunlock(idx); |
@@ -1507,7 +1520,7 @@ static int rcu_torture_barrier_init(void) | |||
1507 | int i; | 1520 | int i; |
1508 | int ret; | 1521 | int ret; |
1509 | 1522 | ||
1510 | if (n_barrier_cbs == 0) | 1523 | if (n_barrier_cbs <= 0) |
1511 | return 0; | 1524 | return 0; |
1512 | if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) { | 1525 | if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) { |
1513 | pr_alert("%s" TORTURE_FLAG | 1526 | pr_alert("%s" TORTURE_FLAG |
@@ -1786,12 +1799,15 @@ rcu_torture_init(void) | |||
1786 | writer_task); | 1799 | writer_task); |
1787 | if (firsterr) | 1800 | if (firsterr) |
1788 | goto unwind; | 1801 | goto unwind; |
1789 | fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), | 1802 | if (nfakewriters > 0) { |
1790 | GFP_KERNEL); | 1803 | fakewriter_tasks = kzalloc(nfakewriters * |
1791 | if (fakewriter_tasks == NULL) { | 1804 | sizeof(fakewriter_tasks[0]), |
1792 | VERBOSE_TOROUT_ERRSTRING("out of memory"); | 1805 | GFP_KERNEL); |
1793 | firsterr = -ENOMEM; | 1806 | if (fakewriter_tasks == NULL) { |
1794 | goto unwind; | 1807 | VERBOSE_TOROUT_ERRSTRING("out of memory"); |
1808 | firsterr = -ENOMEM; | ||
1809 | goto unwind; | ||
1810 | } | ||
1795 | } | 1811 | } |
1796 | for (i = 0; i < nfakewriters; i++) { | 1812 | for (i = 0; i < nfakewriters; i++) { |
1797 | firsterr = torture_create_kthread(rcu_torture_fakewriter, | 1813 | firsterr = torture_create_kthread(rcu_torture_fakewriter, |
@@ -1818,7 +1834,7 @@ rcu_torture_init(void) | |||
1818 | if (firsterr) | 1834 | if (firsterr) |
1819 | goto unwind; | 1835 | goto unwind; |
1820 | } | 1836 | } |
1821 | if (test_no_idle_hz) { | 1837 | if (test_no_idle_hz && shuffle_interval > 0) { |
1822 | firsterr = torture_shuffle_init(shuffle_interval * HZ); | 1838 | firsterr = torture_shuffle_init(shuffle_interval * HZ); |
1823 | if (firsterr) | 1839 | if (firsterr) |
1824 | goto unwind; | 1840 | goto unwind; |
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index fb33d35ee0b7..d3fcb2ec8536 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c | |||
@@ -252,14 +252,15 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx) | |||
252 | } | 252 | } |
253 | 253 | ||
254 | /** | 254 | /** |
255 | * srcu_readers_active - returns true if there are readers, and false | 255 | * srcu_readers_active - returns approximate number of readers.
256 | * otherwise | ||
256 | * @sp: which srcu_struct to count active readers (holding srcu_read_lock). | 257 | * @sp: which srcu_struct to count active readers (holding srcu_read_lock). |
257 | * | 258 | * |
258 | * Note that this is not an atomic primitive, and can therefore suffer | 259 | * Note that this is not an atomic primitive, and can therefore suffer |
259 | * severe errors when invoked on an active srcu_struct. That said, it | 260 | * severe errors when invoked on an active srcu_struct. That said, it |
260 | * can be useful as an error check at cleanup time. | 261 | * can be useful as an error check at cleanup time. |
261 | */ | 262 | */ |
262 | static int srcu_readers_active(struct srcu_struct *sp) | 263 | static bool srcu_readers_active(struct srcu_struct *sp) |
263 | { | 264 | { |
264 | int cpu; | 265 | int cpu; |
265 | unsigned long sum = 0; | 266 | unsigned long sum = 0; |
@@ -414,11 +415,11 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) | |||
414 | struct rcu_head *head = &rcu.head; | 415 | struct rcu_head *head = &rcu.head; |
415 | bool done = false; | 416 | bool done = false; |
416 | 417 | ||
417 | rcu_lockdep_assert(!lock_is_held(&sp->dep_map) && | 418 | RCU_LOCKDEP_WARN(lock_is_held(&sp->dep_map) || |
418 | !lock_is_held(&rcu_bh_lock_map) && | 419 | lock_is_held(&rcu_bh_lock_map) || |
419 | !lock_is_held(&rcu_lock_map) && | 420 | lock_is_held(&rcu_lock_map) || |
420 | !lock_is_held(&rcu_sched_lock_map), | 421 | lock_is_held(&rcu_sched_lock_map), |
421 | "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section"); | 422 | "Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section"); |
422 | 423 | ||
423 | might_sleep(); | 424 | might_sleep(); |
424 | init_completion(&rcu.completion); | 425 | init_completion(&rcu.completion); |
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index c291bd65d2cb..d0471056d0af 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c | |||
@@ -191,10 +191,10 @@ static void rcu_process_callbacks(struct softirq_action *unused) | |||
191 | */ | 191 | */ |
192 | void synchronize_sched(void) | 192 | void synchronize_sched(void) |
193 | { | 193 | { |
194 | rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && | 194 | RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || |
195 | !lock_is_held(&rcu_lock_map) && | 195 | lock_is_held(&rcu_lock_map) || |
196 | !lock_is_held(&rcu_sched_lock_map), | 196 | lock_is_held(&rcu_sched_lock_map), |
197 | "Illegal synchronize_sched() in RCU read-side critical section"); | 197 | "Illegal synchronize_sched() in RCU read-side critical section"); |
198 | cond_resched(); | 198 | cond_resched(); |
199 | } | 199 | } |
200 | EXPORT_SYMBOL_GPL(synchronize_sched); | 200 | EXPORT_SYMBOL_GPL(synchronize_sched); |
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 65137bc28b2b..9f75f25cc5d9 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c | |||
@@ -70,6 +70,8 @@ MODULE_ALIAS("rcutree"); | |||
70 | 70 | ||
71 | static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; | 71 | static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; |
72 | static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; | 72 | static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; |
73 | static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS]; | ||
74 | static struct lock_class_key rcu_exp_sched_class[RCU_NUM_LVLS]; | ||
73 | 75 | ||
74 | /* | 76 | /* |
75 | * In order to export the rcu_state name to the tracing tools, it | 77 | * In order to export the rcu_state name to the tracing tools, it |
@@ -124,13 +126,8 @@ module_param(rcu_fanout_exact, bool, 0444); | |||
124 | static int rcu_fanout_leaf = RCU_FANOUT_LEAF; | 126 | static int rcu_fanout_leaf = RCU_FANOUT_LEAF; |
125 | module_param(rcu_fanout_leaf, int, 0444); | 127 | module_param(rcu_fanout_leaf, int, 0444); |
126 | int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; | 128 | int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; |
127 | static int num_rcu_lvl[] = { /* Number of rcu_nodes at specified level. */ | 129 | /* Number of rcu_nodes at specified level. */ |
128 | NUM_RCU_LVL_0, | 130 | static int num_rcu_lvl[] = NUM_RCU_LVL_INIT; |
129 | NUM_RCU_LVL_1, | ||
130 | NUM_RCU_LVL_2, | ||
131 | NUM_RCU_LVL_3, | ||
132 | NUM_RCU_LVL_4, | ||
133 | }; | ||
134 | int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ | 131 | int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ |
135 | 132 | ||
136 | /* | 133 | /* |
@@ -649,12 +646,12 @@ static void rcu_eqs_enter_common(long long oldval, bool user) | |||
649 | * It is illegal to enter an extended quiescent state while | 646 | * It is illegal to enter an extended quiescent state while |
650 | * in an RCU read-side critical section. | 647 | * in an RCU read-side critical section. |
651 | */ | 648 | */ |
652 | rcu_lockdep_assert(!lock_is_held(&rcu_lock_map), | 649 | RCU_LOCKDEP_WARN(lock_is_held(&rcu_lock_map), |
653 | "Illegal idle entry in RCU read-side critical section."); | 650 | "Illegal idle entry in RCU read-side critical section."); |
654 | rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map), | 651 | RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map), |
655 | "Illegal idle entry in RCU-bh read-side critical section."); | 652 | "Illegal idle entry in RCU-bh read-side critical section."); |
656 | rcu_lockdep_assert(!lock_is_held(&rcu_sched_lock_map), | 653 | RCU_LOCKDEP_WARN(lock_is_held(&rcu_sched_lock_map), |
657 | "Illegal idle entry in RCU-sched read-side critical section."); | 654 | "Illegal idle entry in RCU-sched read-side critical section."); |
658 | } | 655 | } |
659 | 656 | ||
660 | /* | 657 | /* |
@@ -701,7 +698,7 @@ void rcu_idle_enter(void) | |||
701 | } | 698 | } |
702 | EXPORT_SYMBOL_GPL(rcu_idle_enter); | 699 | EXPORT_SYMBOL_GPL(rcu_idle_enter); |
703 | 700 | ||
704 | #ifdef CONFIG_RCU_USER_QS | 701 | #ifdef CONFIG_NO_HZ_FULL |
705 | /** | 702 | /** |
706 | * rcu_user_enter - inform RCU that we are resuming userspace. | 703 | * rcu_user_enter - inform RCU that we are resuming userspace. |
707 | * | 704 | * |
@@ -714,7 +711,7 @@ void rcu_user_enter(void) | |||
714 | { | 711 | { |
715 | rcu_eqs_enter(1); | 712 | rcu_eqs_enter(1); |
716 | } | 713 | } |
717 | #endif /* CONFIG_RCU_USER_QS */ | 714 | #endif /* CONFIG_NO_HZ_FULL */ |
718 | 715 | ||
719 | /** | 716 | /** |
720 | * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle | 717 | * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle |
@@ -828,7 +825,7 @@ void rcu_idle_exit(void) | |||
828 | } | 825 | } |
829 | EXPORT_SYMBOL_GPL(rcu_idle_exit); | 826 | EXPORT_SYMBOL_GPL(rcu_idle_exit); |
830 | 827 | ||
831 | #ifdef CONFIG_RCU_USER_QS | 828 | #ifdef CONFIG_NO_HZ_FULL |
832 | /** | 829 | /** |
833 | * rcu_user_exit - inform RCU that we are exiting userspace. | 830 | * rcu_user_exit - inform RCU that we are exiting userspace. |
834 | * | 831 | * |
@@ -839,7 +836,7 @@ void rcu_user_exit(void) | |||
839 | { | 836 | { |
840 | rcu_eqs_exit(1); | 837 | rcu_eqs_exit(1); |
841 | } | 838 | } |
842 | #endif /* CONFIG_RCU_USER_QS */ | 839 | #endif /* CONFIG_NO_HZ_FULL */ |
843 | 840 | ||
844 | /** | 841 | /** |
845 | * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle | 842 | * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle |
@@ -978,9 +975,9 @@ bool notrace rcu_is_watching(void) | |||
978 | { | 975 | { |
979 | bool ret; | 976 | bool ret; |
980 | 977 | ||
981 | preempt_disable(); | 978 | preempt_disable_notrace(); |
982 | ret = __rcu_is_watching(); | 979 | ret = __rcu_is_watching(); |
983 | preempt_enable(); | 980 | preempt_enable_notrace(); |
984 | return ret; | 981 | return ret; |
985 | } | 982 | } |
986 | EXPORT_SYMBOL_GPL(rcu_is_watching); | 983 | EXPORT_SYMBOL_GPL(rcu_is_watching); |
@@ -1178,9 +1175,11 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) | |||
1178 | j = jiffies; | 1175 | j = jiffies; |
1179 | gpa = READ_ONCE(rsp->gp_activity); | 1176 | gpa = READ_ONCE(rsp->gp_activity); |
1180 | if (j - gpa > 2 * HZ) | 1177 | if (j - gpa > 2 * HZ) |
1181 | pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x\n", | 1178 | pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x s%d ->state=%#lx\n", |
1182 | rsp->name, j - gpa, | 1179 | rsp->name, j - gpa, |
1183 | rsp->gpnum, rsp->completed, rsp->gp_flags); | 1180 | rsp->gpnum, rsp->completed, |
1181 | rsp->gp_flags, rsp->gp_state, | ||
1182 | rsp->gp_kthread ? rsp->gp_kthread->state : 0); | ||
1184 | } | 1183 | } |
1185 | 1184 | ||
1186 | /* | 1185 | /* |
@@ -1906,6 +1905,26 @@ static int rcu_gp_init(struct rcu_state *rsp) | |||
1906 | } | 1905 | } |
1907 | 1906 | ||
1908 | /* | 1907 | /* |
1908 | * Helper function for wait_event_interruptible_timeout() wakeup | ||
1909 | * at force-quiescent-state time. | ||
1910 | */ | ||
1911 | static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp) | ||
1912 | { | ||
1913 | struct rcu_node *rnp = rcu_get_root(rsp); | ||
1914 | |||
1915 | /* Someone like call_rcu() requested a force-quiescent-state scan. */ | ||
1916 | *gfp = READ_ONCE(rsp->gp_flags); | ||
1917 | if (*gfp & RCU_GP_FLAG_FQS) | ||
1918 | return true; | ||
1919 | |||
1920 | /* The current grace period has completed. */ | ||
1921 | if (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp)) | ||
1922 | return true; | ||
1923 | |||
1924 | return false; | ||
1925 | } | ||
1926 | |||
1927 | /* | ||
1909 | * Do one round of quiescent-state forcing. | 1928 | * Do one round of quiescent-state forcing. |
1910 | */ | 1929 | */ |
1911 | static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) | 1930 | static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) |
@@ -2041,6 +2060,7 @@ static int __noreturn rcu_gp_kthread(void *arg) | |||
2041 | wait_event_interruptible(rsp->gp_wq, | 2060 | wait_event_interruptible(rsp->gp_wq, |
2042 | READ_ONCE(rsp->gp_flags) & | 2061 | READ_ONCE(rsp->gp_flags) & |
2043 | RCU_GP_FLAG_INIT); | 2062 | RCU_GP_FLAG_INIT); |
2063 | rsp->gp_state = RCU_GP_DONE_GPS; | ||
2044 | /* Locking provides needed memory barrier. */ | 2064 | /* Locking provides needed memory barrier. */ |
2045 | if (rcu_gp_init(rsp)) | 2065 | if (rcu_gp_init(rsp)) |
2046 | break; | 2066 | break; |
@@ -2068,11 +2088,8 @@ static int __noreturn rcu_gp_kthread(void *arg) | |||
2068 | TPS("fqswait")); | 2088 | TPS("fqswait")); |
2069 | rsp->gp_state = RCU_GP_WAIT_FQS; | 2089 | rsp->gp_state = RCU_GP_WAIT_FQS; |
2070 | ret = wait_event_interruptible_timeout(rsp->gp_wq, | 2090 | ret = wait_event_interruptible_timeout(rsp->gp_wq, |
2071 | ((gf = READ_ONCE(rsp->gp_flags)) & | 2091 | rcu_gp_fqs_check_wake(rsp, &gf), j); |
2072 | RCU_GP_FLAG_FQS) || | 2092 | rsp->gp_state = RCU_GP_DOING_FQS; |
2073 | (!READ_ONCE(rnp->qsmask) && | ||
2074 | !rcu_preempt_blocked_readers_cgp(rnp)), | ||
2075 | j); | ||
2076 | /* Locking provides needed memory barriers. */ | 2093 | /* Locking provides needed memory barriers. */ |
2077 | /* If grace period done, leave loop. */ | 2094 | /* If grace period done, leave loop. */ |
2078 | if (!READ_ONCE(rnp->qsmask) && | 2095 | if (!READ_ONCE(rnp->qsmask) && |
@@ -2110,7 +2127,9 @@ static int __noreturn rcu_gp_kthread(void *arg) | |||
2110 | } | 2127 | } |
2111 | 2128 | ||
2112 | /* Handle grace-period end. */ | 2129 | /* Handle grace-period end. */ |
2130 | rsp->gp_state = RCU_GP_CLEANUP; | ||
2113 | rcu_gp_cleanup(rsp); | 2131 | rcu_gp_cleanup(rsp); |
2132 | rsp->gp_state = RCU_GP_CLEANED; | ||
2114 | } | 2133 | } |
2115 | } | 2134 | } |
2116 | 2135 | ||
@@ -3161,10 +3180,10 @@ static inline int rcu_blocking_is_gp(void) | |||
3161 | */ | 3180 | */ |
3162 | void synchronize_sched(void) | 3181 | void synchronize_sched(void) |
3163 | { | 3182 | { |
3164 | rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && | 3183 | RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || |
3165 | !lock_is_held(&rcu_lock_map) && | 3184 | lock_is_held(&rcu_lock_map) || |
3166 | !lock_is_held(&rcu_sched_lock_map), | 3185 | lock_is_held(&rcu_sched_lock_map), |
3167 | "Illegal synchronize_sched() in RCU-sched read-side critical section"); | 3186 | "Illegal synchronize_sched() in RCU-sched read-side critical section"); |
3168 | if (rcu_blocking_is_gp()) | 3187 | if (rcu_blocking_is_gp()) |
3169 | return; | 3188 | return; |
3170 | if (rcu_gp_is_expedited()) | 3189 | if (rcu_gp_is_expedited()) |
@@ -3188,10 +3207,10 @@ EXPORT_SYMBOL_GPL(synchronize_sched); | |||
3188 | */ | 3207 | */ |
3189 | void synchronize_rcu_bh(void) | 3208 | void synchronize_rcu_bh(void) |
3190 | { | 3209 | { |
3191 | rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && | 3210 | RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || |
3192 | !lock_is_held(&rcu_lock_map) && | 3211 | lock_is_held(&rcu_lock_map) || |
3193 | !lock_is_held(&rcu_sched_lock_map), | 3212 | lock_is_held(&rcu_sched_lock_map), |
3194 | "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section"); | 3213 | "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section"); |
3195 | if (rcu_blocking_is_gp()) | 3214 | if (rcu_blocking_is_gp()) |
3196 | return; | 3215 | return; |
3197 | if (rcu_gp_is_expedited()) | 3216 | if (rcu_gp_is_expedited()) |
@@ -3253,23 +3272,247 @@ void cond_synchronize_rcu(unsigned long oldstate) | |||
3253 | } | 3272 | } |
3254 | EXPORT_SYMBOL_GPL(cond_synchronize_rcu); | 3273 | EXPORT_SYMBOL_GPL(cond_synchronize_rcu); |
3255 | 3274 | ||
3256 | static int synchronize_sched_expedited_cpu_stop(void *data) | 3275 | /** |
3276 | * get_state_synchronize_sched - Snapshot current RCU-sched state | ||
3277 | * | ||
3278 | * Returns a cookie that is used by a later call to cond_synchronize_sched() | ||
3279 | * to determine whether or not a full grace period has elapsed in the | ||
3280 | * meantime. | ||
3281 | */ | ||
3282 | unsigned long get_state_synchronize_sched(void) | ||
3257 | { | 3283 | { |
3258 | /* | 3284 | /* |
3259 | * There must be a full memory barrier on each affected CPU | 3285 | * Any prior manipulation of RCU-protected data must happen |
3260 | * between the time that try_stop_cpus() is called and the | 3286 | * before the load from ->gpnum. |
3261 | * time that it returns. | 3287 | */ |
3262 | * | 3288 | smp_mb(); /* ^^^ */ |
3263 | * In the current initial implementation of cpu_stop, the | 3289 | |
3264 | * above condition is already met when the control reaches | 3290 | /* |
3265 | * this point and the following smp_mb() is not strictly | 3291 | * Make sure this load happens before the purportedly |
3266 | * necessary. Do smp_mb() anyway for documentation and | 3292 | * time-consuming work between get_state_synchronize_sched() |
3267 | * robustness against future implementation changes. | 3293 | * and cond_synchronize_sched(). |
3294 | */ | ||
3295 | return smp_load_acquire(&rcu_sched_state.gpnum); | ||
3296 | } | ||
3297 | EXPORT_SYMBOL_GPL(get_state_synchronize_sched); | ||
3298 | |||
3299 | /** | ||
3300 | * cond_synchronize_sched - Conditionally wait for an RCU-sched grace period | ||
3301 | * | ||
3302 | * @oldstate: return value from earlier call to get_state_synchronize_sched() | ||
3303 | * | ||
3304 | * If a full RCU-sched grace period has elapsed since the earlier call to | ||
3305 | * get_state_synchronize_sched(), just return. Otherwise, invoke | ||
3306 | * synchronize_sched() to wait for a full grace period. | ||
3307 | * | ||
3308 | * Yes, this function does not take counter wrap into account. But | ||
3309 | * counter wrap is harmless. If the counter wraps, we have waited for | ||
3310 | * more than 2 billion grace periods (and way more on a 64-bit system!), | ||
3311 | * so waiting for one additional grace period should be just fine. | ||
3312 | */ | ||
3313 | void cond_synchronize_sched(unsigned long oldstate) | ||
3314 | { | ||
3315 | unsigned long newstate; | ||
3316 | |||
3317 | /* | ||
3318 | * Ensure that this load happens before any RCU-destructive | ||
3319 | * actions the caller might carry out after we return. | ||
3268 | */ | 3320 | */ |
3269 | smp_mb(); /* See above comment block. */ | 3321 | newstate = smp_load_acquire(&rcu_sched_state.completed); |
3322 | if (ULONG_CMP_GE(oldstate, newstate)) | ||
3323 | synchronize_sched(); | ||
3324 | } | ||
3325 | EXPORT_SYMBOL_GPL(cond_synchronize_sched); | ||
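
These two mirror get_state_synchronize_rcu()/cond_synchronize_rcu() for the RCU-sched flavor. A minimal sketch of the intended snapshot-then-maybe-wait pattern; the object type and the work done between the two calls are placeholders:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct example_obj {
	int payload;
};

static void example_deferred_free(struct example_obj *obj)
{
	unsigned long gp_snap;

	/* Snapshot RCU-sched state before the time-consuming part. */
	gp_snap = get_state_synchronize_sched();

	/* ... lengthy teardown preparation runs here ... */

	/* Blocks only if a full grace period has not elapsed meanwhile. */
	cond_synchronize_sched(gp_snap);
	kfree(obj);
}
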
3326 | |||
3327 | /* Adjust sequence number for start of update-side operation. */ | ||
3328 | static void rcu_seq_start(unsigned long *sp) | ||
3329 | { | ||
3330 | WRITE_ONCE(*sp, *sp + 1); | ||
3331 | smp_mb(); /* Ensure update-side operation after counter increment. */ | ||
3332 | WARN_ON_ONCE(!(*sp & 0x1)); | ||
3333 | } | ||
3334 | |||
3335 | /* Adjust sequence number for end of update-side operation. */ | ||
3336 | static void rcu_seq_end(unsigned long *sp) | ||
3337 | { | ||
3338 | smp_mb(); /* Ensure update-side operation before counter increment. */ | ||
3339 | WRITE_ONCE(*sp, *sp + 1); | ||
3340 | WARN_ON_ONCE(*sp & 0x1); | ||
3341 | } | ||
3342 | |||
3343 | /* Take a snapshot of the update side's sequence number. */ | ||
3344 | static unsigned long rcu_seq_snap(unsigned long *sp) | ||
3345 | { | ||
3346 | unsigned long s; | ||
3347 | |||
3348 | smp_mb(); /* Caller's modifications seen first by other CPUs. */ | ||
3349 | s = (READ_ONCE(*sp) + 3) & ~0x1; | ||
3350 | smp_mb(); /* Above access must not bleed into critical section. */ | ||
3351 | return s; | ||
3352 | } | ||
3353 | |||
3354 | /* | ||
3355 | * Given a snapshot from rcu_seq_snap(), determine whether or not a | ||
3356 | * full update-side operation has occurred. | ||
3357 | */ | ||
3358 | static bool rcu_seq_done(unsigned long *sp, unsigned long s) | ||
3359 | { | ||
3360 | return ULONG_CMP_GE(READ_ONCE(*sp), s); | ||
3361 | } | ||
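
These four helpers implement the sequence-count idiom the expedited path is moving to (see the reworked synchronize_sched_expedited() comment further down): an odd value means an update is in flight, and a snapshot is "done" once the counter has passed it. A sketch with a local counter, as it might look inside this file since the helpers are static; names are hypothetical:

static unsigned long example_seq;

/* Update side: bracket the operation so others can detect completion. */
static void example_do_update(void)
{
	rcu_seq_start(&example_seq);	/* counter goes odd: update running */
	/* ... the update-side operation itself ... */
	rcu_seq_end(&example_seq);	/* counter goes even: update complete */
}

/* Would-be updater: piggy-back on a concurrent update when possible. */
static void example_ensure_updated(void)
{
	unsigned long s = rcu_seq_snap(&example_seq);

	/* e.g. after (possibly) sleeping on a funnel lock ... */
	if (rcu_seq_done(&example_seq, s))
		return;			/* someone else did a full update */
	example_do_update();
}
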
3362 | |||
3363 | /* Wrapper functions for expedited grace periods. */ | ||
3364 | static void rcu_exp_gp_seq_start(struct rcu_state *rsp) | ||
3365 | { | ||
3366 | rcu_seq_start(&rsp->expedited_sequence); | ||
3367 | } | ||
3368 | static void rcu_exp_gp_seq_end(struct rcu_state *rsp) | ||
3369 | { | ||
3370 | rcu_seq_end(&rsp->expedited_sequence); | ||
3371 | smp_mb(); /* Ensure that consecutive grace periods serialize. */ | ||
3372 | } | ||
3373 | static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) | ||
3374 | { | ||
3375 | return rcu_seq_snap(&rsp->expedited_sequence); | ||
3376 | } | ||
3377 | static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) | ||
3378 | { | ||
3379 | return rcu_seq_done(&rsp->expedited_sequence, s); | ||
3380 | } | ||
3381 | |||
3382 | /* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */ | ||
3383 | static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp, | ||
3384 | struct rcu_data *rdp, | ||
3385 | atomic_long_t *stat, unsigned long s) | ||
3386 | { | ||
3387 | if (rcu_exp_gp_seq_done(rsp, s)) { | ||
3388 | if (rnp) | ||
3389 | mutex_unlock(&rnp->exp_funnel_mutex); | ||
3390 | else if (rdp) | ||
3391 | mutex_unlock(&rdp->exp_funnel_mutex); | ||
3392 | /* Ensure test happens before caller kfree(). */ | ||
3393 | smp_mb__before_atomic(); /* ^^^ */ | ||
3394 | atomic_long_inc(stat); | ||
3395 | return true; | ||
3396 | } | ||
3397 | return false; | ||
3398 | } | ||
3399 | |||
3400 | /* | ||
3401 | * Funnel-lock acquisition for expedited grace periods. Returns a | ||
3402 | * pointer to the root rcu_node structure, or NULL if some other | ||
3403 | * task did the expedited grace period for us. | ||
3404 | */ | ||
3405 | static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) | ||
3406 | { | ||
3407 | struct rcu_data *rdp; | ||
3408 | struct rcu_node *rnp0; | ||
3409 | struct rcu_node *rnp1 = NULL; | ||
3410 | |||
3411 | /* | ||
3412 | * First try directly acquiring the root lock in order to reduce | ||
3413 | * latency in the common case where expedited grace periods are | ||
3414 | * rare. We check mutex_is_locked() to avoid pathological levels of | ||
3415 | * memory contention on ->exp_funnel_mutex in the heavy-load case. | ||
3416 | */ | ||
3417 | rnp0 = rcu_get_root(rsp); | ||
3418 | if (!mutex_is_locked(&rnp0->exp_funnel_mutex)) { | ||
3419 | if (mutex_trylock(&rnp0->exp_funnel_mutex)) { | ||
3420 | if (sync_exp_work_done(rsp, rnp0, NULL, | ||
3421 | &rsp->expedited_workdone0, s)) | ||
3422 | return NULL; | ||
3423 | return rnp0; | ||
3424 | } | ||
3425 | } | ||
3426 | |||
3427 | /* | ||
3428 | * Each pass through the following loop works its way | ||
3429 | * up the rcu_node tree, returning if others have done the | ||
3430 | * work, or otherwise falling through while holding the root rnp's | ||
3431 | * ->exp_funnel_mutex. The mapping from CPU to rcu_node structure | ||
3432 | * can be inexact, as it is just promoting locality and is not | ||
3433 | * strictly needed for correctness. | ||
3434 | */ | ||
3435 | rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); | ||
3436 | if (sync_exp_work_done(rsp, NULL, NULL, &rsp->expedited_workdone1, s)) | ||
3437 | return NULL; | ||
3438 | mutex_lock(&rdp->exp_funnel_mutex); | ||
3439 | rnp0 = rdp->mynode; | ||
3440 | for (; rnp0 != NULL; rnp0 = rnp0->parent) { | ||
3441 | if (sync_exp_work_done(rsp, rnp1, rdp, | ||
3442 | &rsp->expedited_workdone2, s)) | ||
3443 | return NULL; | ||
3444 | mutex_lock(&rnp0->exp_funnel_mutex); | ||
3445 | if (rnp1) | ||
3446 | mutex_unlock(&rnp1->exp_funnel_mutex); | ||
3447 | else | ||
3448 | mutex_unlock(&rdp->exp_funnel_mutex); | ||
3449 | rnp1 = rnp0; | ||
3450 | } | ||
3451 | if (sync_exp_work_done(rsp, rnp1, rdp, | ||
3452 | &rsp->expedited_workdone3, s)) | ||
3453 | return NULL; | ||
3454 | return rnp1; | ||
3455 | } | ||
3456 | |||
3457 | /* Invoked on each online non-idle CPU for expedited quiescent state. */ | ||
3458 | static int synchronize_sched_expedited_cpu_stop(void *data) | ||
3459 | { | ||
3460 | struct rcu_data *rdp = data; | ||
3461 | struct rcu_state *rsp = rdp->rsp; | ||
3462 | |||
3463 | /* We are here: If we are last, do the wakeup. */ | ||
3464 | rdp->exp_done = true; | ||
3465 | if (atomic_dec_and_test(&rsp->expedited_need_qs)) | ||
3466 | wake_up(&rsp->expedited_wq); | ||
3270 | return 0; | 3467 | return 0; |
3271 | } | 3468 | } |
3272 | 3469 | ||
3470 | static void synchronize_sched_expedited_wait(struct rcu_state *rsp) | ||
3471 | { | ||
3472 | int cpu; | ||
3473 | unsigned long jiffies_stall; | ||
3474 | unsigned long jiffies_start; | ||
3475 | struct rcu_data *rdp; | ||
3476 | int ret; | ||
3477 | |||
3478 | jiffies_stall = rcu_jiffies_till_stall_check(); | ||
3479 | jiffies_start = jiffies; | ||
3480 | |||
3481 | for (;;) { | ||
3482 | ret = wait_event_interruptible_timeout( | ||
3483 | rsp->expedited_wq, | ||
3484 | !atomic_read(&rsp->expedited_need_qs), | ||
3485 | jiffies_stall); | ||
3486 | if (ret > 0) | ||
3487 | return; | ||
3488 | if (ret < 0) { | ||
3489 | /* Hit a signal, disable CPU stall warnings. */ | ||
3490 | wait_event(rsp->expedited_wq, | ||
3491 | !atomic_read(&rsp->expedited_need_qs)); | ||
3492 | return; | ||
3493 | } | ||
3494 | pr_err("INFO: %s detected expedited stalls on CPUs: {", | ||
3495 | rsp->name); | ||
3496 | for_each_online_cpu(cpu) { | ||
3497 | rdp = per_cpu_ptr(rsp->rda, cpu); | ||
3498 | |||
3499 | if (rdp->exp_done) | ||
3500 | continue; | ||
3501 | pr_cont(" %d", cpu); | ||
3502 | } | ||
3503 | pr_cont(" } %lu jiffies s: %lu\n", | ||
3504 | jiffies - jiffies_start, rsp->expedited_sequence); | ||
3505 | for_each_online_cpu(cpu) { | ||
3506 | rdp = per_cpu_ptr(rsp->rda, cpu); | ||
3507 | |||
3508 | if (rdp->exp_done) | ||
3509 | continue; | ||
3510 | dump_cpu_task(cpu); | ||
3511 | } | ||
3512 | jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3; | ||
3513 | } | ||
3514 | } | ||
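synchronize_sched_expedited_wait() has the usual shape of a stall-warning wait: block with a timeout, complain about stragglers when the timeout fires, then re-arm (the kernel re-arms to roughly three times the base stall interval). A condensed pthread rendering of that shape, with the per-CPU detail reduced to a counter (all names are illustrative):

#include <pthread.h>
#include <stdio.h>
#include <time.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int need_qs;		/* workers still to check in */

/* Worker side: check in and wake the waiter when the last one arrives. */
static void check_in(void)
{
	pthread_mutex_lock(&lock);
	if (--need_qs == 0)
		pthread_cond_broadcast(&cond);
	pthread_mutex_unlock(&lock);
}

/* Waiter side: wait for need_qs to hit zero, nagging about stragglers. */
static void wait_for_checkins(int timeout_sec)
{
	struct timespec deadline;

	pthread_mutex_lock(&lock);
	while (need_qs > 0) {
		clock_gettime(CLOCK_REALTIME, &deadline);
		deadline.tv_sec += timeout_sec;
		if (pthread_cond_timedwait(&cond, &lock, &deadline) != 0 &&
		    need_qs > 0) {
			fprintf(stderr, "still waiting on %d workers\n",
				need_qs);
			timeout_sec *= 3;	/* back off before nagging again */
		}
	}
	pthread_mutex_unlock(&lock);
}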
3515 | |||
3273 | /** | 3516 | /** |
3274 | * synchronize_sched_expedited - Brute-force RCU-sched grace period | 3517 | * synchronize_sched_expedited - Brute-force RCU-sched grace period |
3275 | * | 3518 | * |
@@ -3281,58 +3524,21 @@ static int synchronize_sched_expedited_cpu_stop(void *data) | |||
3281 | * restructure your code to batch your updates, and then use a single | 3524 | * restructure your code to batch your updates, and then use a single |
3282 | * synchronize_sched() instead. | 3525 | * synchronize_sched() instead. |
3283 | * | 3526 | * |
3284 | * This implementation can be thought of as an application of ticket | 3527 | * This implementation can be thought of as an application of sequence |
3285 | * locking to RCU, with sync_sched_expedited_started and | 3528 | * locking to expedited grace periods, but using the sequence counter to |
3286 | * sync_sched_expedited_done taking on the roles of the halves | 3529 | * determine when someone else has already done the work instead of for |
3287 | * of the ticket-lock word. Each task atomically increments | 3530 | * retrying readers. |
3288 | * sync_sched_expedited_started upon entry, snapshotting the old value, | ||
3289 | * then attempts to stop all the CPUs. If this succeeds, then each | ||
3290 | * CPU will have executed a context switch, resulting in an RCU-sched | ||
3291 | * grace period. We are then done, so we use atomic_cmpxchg() to | ||
3292 | * update sync_sched_expedited_done to match our snapshot -- but | ||
3293 | * only if someone else has not already advanced past our snapshot. | ||
3294 | * | ||
3295 | * On the other hand, if try_stop_cpus() fails, we check the value | ||
3296 | * of sync_sched_expedited_done. If it has advanced past our | ||
3297 | * initial snapshot, then someone else must have forced a grace period | ||
3298 | * some time after we took our snapshot. In this case, our work is | ||
3299 | * done for us, and we can simply return. Otherwise, we try again, | ||
3300 | * but keep our initial snapshot for purposes of checking for someone | ||
3301 | * doing our work for us. | ||
3302 | * | ||
3303 | * If we fail too many times in a row, we fall back to synchronize_sched(). | ||
3304 | */ | 3531 | */ |
3305 | void synchronize_sched_expedited(void) | 3532 | void synchronize_sched_expedited(void) |
3306 | { | 3533 | { |
3307 | cpumask_var_t cm; | ||
3308 | bool cma = false; | ||
3309 | int cpu; | 3534 | int cpu; |
3310 | long firstsnap, s, snap; | 3535 | unsigned long s; |
3311 | int trycount = 0; | 3536 | struct rcu_node *rnp; |
3312 | struct rcu_state *rsp = &rcu_sched_state; | 3537 | struct rcu_state *rsp = &rcu_sched_state; |
3313 | 3538 | ||
3314 | /* | 3539 | /* Take a snapshot of the sequence number. */ |
3315 | * If we are in danger of counter wrap, just do synchronize_sched(). | 3540 | s = rcu_exp_gp_seq_snap(rsp); |
3316 | * By allowing sync_sched_expedited_started to advance no more than | ||
3317 | * ULONG_MAX/8 ahead of sync_sched_expedited_done, we are ensuring | ||
3318 | * that more than 3.5 billion CPUs would be required to force a | ||
3319 | * counter wrap on a 32-bit system. Quite a few more CPUs would of | ||
3320 | * course be required on a 64-bit system. | ||
3321 | */ | ||
3322 | if (ULONG_CMP_GE((ulong)atomic_long_read(&rsp->expedited_start), | ||
3323 | (ulong)atomic_long_read(&rsp->expedited_done) + | ||
3324 | ULONG_MAX / 8)) { | ||
3325 | wait_rcu_gp(call_rcu_sched); | ||
3326 | atomic_long_inc(&rsp->expedited_wrap); | ||
3327 | return; | ||
3328 | } | ||
3329 | 3541 | ||
3330 | /* | ||
3331 | * Take a ticket. Note that atomic_inc_return() implies a | ||
3332 | * full memory barrier. | ||
3333 | */ | ||
3334 | snap = atomic_long_inc_return(&rsp->expedited_start); | ||
3335 | firstsnap = snap; | ||
3336 | if (!try_get_online_cpus()) { | 3542 | if (!try_get_online_cpus()) { |
3337 | /* CPU hotplug operation in flight, fall back to normal GP. */ | 3543 | /* CPU hotplug operation in flight, fall back to normal GP. */ |
3338 | wait_rcu_gp(call_rcu_sched); | 3544 | wait_rcu_gp(call_rcu_sched); |
@@ -3341,100 +3547,38 @@ void synchronize_sched_expedited(void) | |||
3341 | } | 3547 | } |
3342 | WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); | 3548 | WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); |
3343 | 3549 | ||
3344 | /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */ | 3550 | rnp = exp_funnel_lock(rsp, s); |
3345 | cma = zalloc_cpumask_var(&cm, GFP_KERNEL); | 3551 | if (rnp == NULL) { |
3346 | if (cma) { | 3552 | put_online_cpus(); |
3347 | cpumask_copy(cm, cpu_online_mask); | 3553 | return; /* Someone else did our work for us. */ |
3348 | cpumask_clear_cpu(raw_smp_processor_id(), cm); | ||
3349 | for_each_cpu(cpu, cm) { | ||
3350 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); | ||
3351 | |||
3352 | if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1)) | ||
3353 | cpumask_clear_cpu(cpu, cm); | ||
3354 | } | ||
3355 | if (cpumask_weight(cm) == 0) | ||
3356 | goto all_cpus_idle; | ||
3357 | } | 3554 | } |
3358 | 3555 | ||
3359 | /* | 3556 | rcu_exp_gp_seq_start(rsp); |
3360 | * Each pass through the following loop attempts to force a | ||
3361 | * context switch on each CPU. | ||
3362 | */ | ||
3363 | while (try_stop_cpus(cma ? cm : cpu_online_mask, | ||
3364 | synchronize_sched_expedited_cpu_stop, | ||
3365 | NULL) == -EAGAIN) { | ||
3366 | put_online_cpus(); | ||
3367 | atomic_long_inc(&rsp->expedited_tryfail); | ||
3368 | |||
3369 | /* Check to see if someone else did our work for us. */ | ||
3370 | s = atomic_long_read(&rsp->expedited_done); | ||
3371 | if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { | ||
3372 | /* ensure test happens before caller kfree */ | ||
3373 | smp_mb__before_atomic(); /* ^^^ */ | ||
3374 | atomic_long_inc(&rsp->expedited_workdone1); | ||
3375 | free_cpumask_var(cm); | ||
3376 | return; | ||
3377 | } | ||
3378 | 3557 | ||
3379 | /* No joy, try again later. Or just synchronize_sched(). */ | 3558 | /* Stop each CPU that is online, non-idle, and not us. */ |
3380 | if (trycount++ < 10) { | 3559 | init_waitqueue_head(&rsp->expedited_wq); |
3381 | udelay(trycount * num_online_cpus()); | 3560 | atomic_set(&rsp->expedited_need_qs, 1); /* Extra count avoids race. */ |
3382 | } else { | 3561 | for_each_online_cpu(cpu) { |
3383 | wait_rcu_gp(call_rcu_sched); | 3562 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); |
3384 | atomic_long_inc(&rsp->expedited_normal); | 3563 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); |
3385 | free_cpumask_var(cm); | ||
3386 | return; | ||
3387 | } | ||
3388 | 3564 | ||
3389 | /* Recheck to see if someone else did our work for us. */ | 3565 | rdp->exp_done = false; |
3390 | s = atomic_long_read(&rsp->expedited_done); | ||
3391 | if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { | ||
3392 | /* ensure test happens before caller kfree */ | ||
3393 | smp_mb__before_atomic(); /* ^^^ */ | ||
3394 | atomic_long_inc(&rsp->expedited_workdone2); | ||
3395 | free_cpumask_var(cm); | ||
3396 | return; | ||
3397 | } | ||
3398 | 3566 | ||
3399 | /* | 3567 | /* Skip our CPU and any idle CPUs. */ |
3400 | * Refetching sync_sched_expedited_started allows later | 3568 | if (raw_smp_processor_id() == cpu || |
3401 | * callers to piggyback on our grace period. We retry | 3569 | !(atomic_add_return(0, &rdtp->dynticks) & 0x1)) |
3402 | * after they started, so our grace period works for them, | 3570 | continue; |
3403 | * and they started after our first try, so their grace | 3571 | atomic_inc(&rsp->expedited_need_qs); |
3404 | * period works for us. | 3572 | stop_one_cpu_nowait(cpu, synchronize_sched_expedited_cpu_stop, |
3405 | */ | 3573 | rdp, &rdp->exp_stop_work); |
3406 | if (!try_get_online_cpus()) { | ||
3407 | /* CPU hotplug operation in flight, use normal GP. */ | ||
3408 | wait_rcu_gp(call_rcu_sched); | ||
3409 | atomic_long_inc(&rsp->expedited_normal); | ||
3410 | free_cpumask_var(cm); | ||
3411 | return; | ||
3412 | } | ||
3413 | snap = atomic_long_read(&rsp->expedited_start); | ||
3414 | smp_mb(); /* ensure read is before try_stop_cpus(). */ | ||
3415 | } | 3574 | } |
3416 | atomic_long_inc(&rsp->expedited_stoppedcpus); | ||
3417 | 3575 | ||
3418 | all_cpus_idle: | 3576 | /* Remove extra count and, if necessary, wait for CPUs to stop. */ |
3419 | free_cpumask_var(cm); | 3577 | if (!atomic_dec_and_test(&rsp->expedited_need_qs)) |
3578 | synchronize_sched_expedited_wait(rsp); | ||
3420 | 3579 | ||
3421 | /* | 3580 | rcu_exp_gp_seq_end(rsp); |
3422 | * Everyone up to our most recent fetch is covered by our grace | 3581 | mutex_unlock(&rnp->exp_funnel_mutex); |
3423 | * period. Update the counter, but only if our work is still | ||
3424 | * relevant -- which it won't be if someone who started later | ||
3425 | * than we did already did their update. | ||
3426 | */ | ||
3427 | do { | ||
3428 | atomic_long_inc(&rsp->expedited_done_tries); | ||
3429 | s = atomic_long_read(&rsp->expedited_done); | ||
3430 | if (ULONG_CMP_GE((ulong)s, (ulong)snap)) { | ||
3431 | /* ensure test happens before caller kfree */ | ||
3432 | smp_mb__before_atomic(); /* ^^^ */ | ||
3433 | atomic_long_inc(&rsp->expedited_done_lost); | ||
3434 | break; | ||
3435 | } | ||
3436 | } while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s); | ||
3437 | atomic_long_inc(&rsp->expedited_done_exit); | ||
3438 | 3582 | ||
3439 | put_online_cpus(); | 3583 | put_online_cpus(); |
3440 | } | 3584 | } |
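Both the expedited path above and _rcu_barrier() further down initialize their completion counter to one rather than zero: the extra reference keeps the count from reaching zero while work items are still being posted, and is dropped only after the posting loop. A compact C11-atomics rendering of that idiom (names are illustrative):

#include <stdatomic.h>
#include <stdbool.h>

static atomic_int pending;

/* Called once before any work items are posted. */
static void begin_batch(void)
{
	atomic_store(&pending, 1);	/* extra count avoids early completion */
}

/* Called for each work item actually posted. */
static void post_one(void)
{
	atomic_fetch_add(&pending, 1);
}

/* Called by each worker when it finishes; returns true for the last one. */
static bool complete_one(void)
{
	return atomic_fetch_sub(&pending, 1) == 1;
}

/*
 * Called after the posting loop: drop the extra count.  Returns true if
 * workers are still outstanding and the poster must wait for them.
 */
static bool posting_done_must_wait(void)
{
	return atomic_fetch_sub(&pending, 1) != 1;
}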
@@ -3571,10 +3715,10 @@ static void rcu_barrier_callback(struct rcu_head *rhp) | |||
3571 | struct rcu_state *rsp = rdp->rsp; | 3715 | struct rcu_state *rsp = rdp->rsp; |
3572 | 3716 | ||
3573 | if (atomic_dec_and_test(&rsp->barrier_cpu_count)) { | 3717 | if (atomic_dec_and_test(&rsp->barrier_cpu_count)) { |
3574 | _rcu_barrier_trace(rsp, "LastCB", -1, rsp->n_barrier_done); | 3718 | _rcu_barrier_trace(rsp, "LastCB", -1, rsp->barrier_sequence); |
3575 | complete(&rsp->barrier_completion); | 3719 | complete(&rsp->barrier_completion); |
3576 | } else { | 3720 | } else { |
3577 | _rcu_barrier_trace(rsp, "CB", -1, rsp->n_barrier_done); | 3721 | _rcu_barrier_trace(rsp, "CB", -1, rsp->barrier_sequence); |
3578 | } | 3722 | } |
3579 | } | 3723 | } |
3580 | 3724 | ||
@@ -3586,7 +3730,7 @@ static void rcu_barrier_func(void *type) | |||
3586 | struct rcu_state *rsp = type; | 3730 | struct rcu_state *rsp = type; |
3587 | struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); | 3731 | struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); |
3588 | 3732 | ||
3589 | _rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done); | 3733 | _rcu_barrier_trace(rsp, "IRQ", -1, rsp->barrier_sequence); |
3590 | atomic_inc(&rsp->barrier_cpu_count); | 3734 | atomic_inc(&rsp->barrier_cpu_count); |
3591 | rsp->call(&rdp->barrier_head, rcu_barrier_callback); | 3735 | rsp->call(&rdp->barrier_head, rcu_barrier_callback); |
3592 | } | 3736 | } |
@@ -3599,55 +3743,24 @@ static void _rcu_barrier(struct rcu_state *rsp) | |||
3599 | { | 3743 | { |
3600 | int cpu; | 3744 | int cpu; |
3601 | struct rcu_data *rdp; | 3745 | struct rcu_data *rdp; |
3602 | unsigned long snap = READ_ONCE(rsp->n_barrier_done); | 3746 | unsigned long s = rcu_seq_snap(&rsp->barrier_sequence); |
3603 | unsigned long snap_done; | ||
3604 | 3747 | ||
3605 | _rcu_barrier_trace(rsp, "Begin", -1, snap); | 3748 | _rcu_barrier_trace(rsp, "Begin", -1, s); |
3606 | 3749 | ||
3607 | /* Take mutex to serialize concurrent rcu_barrier() requests. */ | 3750 | /* Take mutex to serialize concurrent rcu_barrier() requests. */ |
3608 | mutex_lock(&rsp->barrier_mutex); | 3751 | mutex_lock(&rsp->barrier_mutex); |
3609 | 3752 | ||
3610 | /* | 3753 | /* Did someone else do our work for us? */ |
3611 | * Ensure that all prior references, including to ->n_barrier_done, | 3754 | if (rcu_seq_done(&rsp->barrier_sequence, s)) { |
3612 | * are ordered before the _rcu_barrier() machinery. | 3755 | _rcu_barrier_trace(rsp, "EarlyExit", -1, rsp->barrier_sequence); |
3613 | */ | ||
3614 | smp_mb(); /* See above block comment. */ | ||
3615 | |||
3616 | /* | ||
3617 | * Recheck ->n_barrier_done to see if others did our work for us. | ||
3618 | * This means checking ->n_barrier_done for an even-to-odd-to-even | ||
3619 | * transition. The "if" expression below therefore rounds the old | ||
3620 | * value up to the next even number and adds two before comparing. | ||
3621 | */ | ||
3622 | snap_done = rsp->n_barrier_done; | ||
3623 | _rcu_barrier_trace(rsp, "Check", -1, snap_done); | ||
3624 | |||
3625 | /* | ||
3626 | * If the value in snap is odd, we needed to wait for the current | ||
3627 | * rcu_barrier() to complete, then wait for the next one, in other | ||
3628 | * words, we need the value of snap_done to be three larger than | ||
3629 | * the value of snap. On the other hand, if the value in snap is | ||
3630 | * even, we only had to wait for the next rcu_barrier() to complete, | ||
3631 | * in other words, we need the value of snap_done to be only two | ||
3632 | * greater than the value of snap. The "(snap + 3) & ~0x1" computes | ||
3633 | * this for us (thank you, Linus!). | ||
3634 | */ | ||
3635 | if (ULONG_CMP_GE(snap_done, (snap + 3) & ~0x1)) { | ||
3636 | _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done); | ||
3637 | smp_mb(); /* caller's subsequent code after above check. */ | 3756 | smp_mb(); /* caller's subsequent code after above check. */ |
3638 | mutex_unlock(&rsp->barrier_mutex); | 3757 | mutex_unlock(&rsp->barrier_mutex); |
3639 | return; | 3758 | return; |
3640 | } | 3759 | } |
3641 | 3760 | ||
3642 | /* | 3761 | /* Mark the start of the barrier operation. */ |
3643 | * Increment ->n_barrier_done to avoid duplicate work. Use | 3762 | rcu_seq_start(&rsp->barrier_sequence); |
3644 | * WRITE_ONCE() to prevent the compiler from speculating | 3763 | _rcu_barrier_trace(rsp, "Inc1", -1, rsp->barrier_sequence); |
3645 | * the increment to precede the early-exit check. | ||
3646 | */ | ||
3647 | WRITE_ONCE(rsp->n_barrier_done, rsp->n_barrier_done + 1); | ||
3648 | WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1); | ||
3649 | _rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done); | ||
3650 | smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */ | ||
3651 | 3764 | ||
3652 | /* | 3765 | /* |
3653 | * Initialize the count to one rather than to zero in order to | 3766 | * Initialize the count to one rather than to zero in order to |
@@ -3671,10 +3784,10 @@ static void _rcu_barrier(struct rcu_state *rsp) | |||
3671 | if (rcu_is_nocb_cpu(cpu)) { | 3784 | if (rcu_is_nocb_cpu(cpu)) { |
3672 | if (!rcu_nocb_cpu_needs_barrier(rsp, cpu)) { | 3785 | if (!rcu_nocb_cpu_needs_barrier(rsp, cpu)) { |
3673 | _rcu_barrier_trace(rsp, "OfflineNoCB", cpu, | 3786 | _rcu_barrier_trace(rsp, "OfflineNoCB", cpu, |
3674 | rsp->n_barrier_done); | 3787 | rsp->barrier_sequence); |
3675 | } else { | 3788 | } else { |
3676 | _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, | 3789 | _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, |
3677 | rsp->n_barrier_done); | 3790 | rsp->barrier_sequence); |
3678 | smp_mb__before_atomic(); | 3791 | smp_mb__before_atomic(); |
3679 | atomic_inc(&rsp->barrier_cpu_count); | 3792 | atomic_inc(&rsp->barrier_cpu_count); |
3680 | __call_rcu(&rdp->barrier_head, | 3793 | __call_rcu(&rdp->barrier_head, |
@@ -3682,11 +3795,11 @@ static void _rcu_barrier(struct rcu_state *rsp) | |||
3682 | } | 3795 | } |
3683 | } else if (READ_ONCE(rdp->qlen)) { | 3796 | } else if (READ_ONCE(rdp->qlen)) { |
3684 | _rcu_barrier_trace(rsp, "OnlineQ", cpu, | 3797 | _rcu_barrier_trace(rsp, "OnlineQ", cpu, |
3685 | rsp->n_barrier_done); | 3798 | rsp->barrier_sequence); |
3686 | smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); | 3799 | smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); |
3687 | } else { | 3800 | } else { |
3688 | _rcu_barrier_trace(rsp, "OnlineNQ", cpu, | 3801 | _rcu_barrier_trace(rsp, "OnlineNQ", cpu, |
3689 | rsp->n_barrier_done); | 3802 | rsp->barrier_sequence); |
3690 | } | 3803 | } |
3691 | } | 3804 | } |
3692 | put_online_cpus(); | 3805 | put_online_cpus(); |
@@ -3698,16 +3811,13 @@ static void _rcu_barrier(struct rcu_state *rsp) | |||
3698 | if (atomic_dec_and_test(&rsp->barrier_cpu_count)) | 3811 | if (atomic_dec_and_test(&rsp->barrier_cpu_count)) |
3699 | complete(&rsp->barrier_completion); | 3812 | complete(&rsp->barrier_completion); |
3700 | 3813 | ||
3701 | /* Increment ->n_barrier_done to prevent duplicate work. */ | ||
3702 | smp_mb(); /* Keep increment after above mechanism. */ | ||
3703 | WRITE_ONCE(rsp->n_barrier_done, rsp->n_barrier_done + 1); | ||
3704 | WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0); | ||
3705 | _rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done); | ||
3706 | smp_mb(); /* Keep increment before caller's subsequent code. */ | ||
3707 | |||
3708 | /* Wait for all rcu_barrier_callback() callbacks to be invoked. */ | 3814 | /* Wait for all rcu_barrier_callback() callbacks to be invoked. */ |
3709 | wait_for_completion(&rsp->barrier_completion); | 3815 | wait_for_completion(&rsp->barrier_completion); |
3710 | 3816 | ||
3817 | /* Mark the end of the barrier operation. */ | ||
3818 | _rcu_barrier_trace(rsp, "Inc2", -1, rsp->barrier_sequence); | ||
3819 | rcu_seq_end(&rsp->barrier_sequence); | ||
3820 | |||
3711 | /* Other rcu_barrier() invocations can now safely proceed. */ | 3821 | /* Other rcu_barrier() invocations can now safely proceed. */ |
3712 | mutex_unlock(&rsp->barrier_mutex); | 3822 | mutex_unlock(&rsp->barrier_mutex); |
3713 | } | 3823 | } |
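The reworked _rcu_barrier() shows the caller side of a sequence counter: snapshot before serializing, re-check under the mutex in case a concurrent barrier already covered us, then bracket the real work with the start/end increments. A self-contained sketch of that flow (toy counter; data races and wraparound are ignored, and do_barrier_work() is a placeholder, not the kernel's callback machinery):

#include <pthread.h>

static pthread_mutex_t barrier_mutex = PTHREAD_MUTEX_INITIALIZER;
static unsigned long barrier_seq;	/* even = idle, odd = in progress */

static void do_barrier_work(void)	/* stands in for posting callbacks */
{
}

static void my_barrier(void)
{
	/* Any barrier that starts and ends after this point also covers us. */
	unsigned long s = (barrier_seq + 3) & ~0x1UL;

	pthread_mutex_lock(&barrier_mutex);
	if (barrier_seq >= s) {			/* done while we waited */
		pthread_mutex_unlock(&barrier_mutex);
		return;
	}
	barrier_seq++;				/* mark start (counter now odd) */
	do_barrier_work();
	barrier_seq++;				/* mark end (counter now even) */
	pthread_mutex_unlock(&barrier_mutex);
}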
@@ -3770,6 +3880,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) | |||
3770 | WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); | 3880 | WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); |
3771 | rdp->cpu = cpu; | 3881 | rdp->cpu = cpu; |
3772 | rdp->rsp = rsp; | 3882 | rdp->rsp = rsp; |
3883 | mutex_init(&rdp->exp_funnel_mutex); | ||
3773 | rcu_boot_init_nocb_percpu_data(rdp); | 3884 | rcu_boot_init_nocb_percpu_data(rdp); |
3774 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 3885 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
3775 | } | 3886 | } |
@@ -3961,22 +4072,22 @@ void rcu_scheduler_starting(void) | |||
3961 | * Compute the per-level fanout, either using the exact fanout specified | 4072 | * Compute the per-level fanout, either using the exact fanout specified |
3962 | * or balancing the tree, depending on the rcu_fanout_exact boot parameter. | 4073 | * or balancing the tree, depending on the rcu_fanout_exact boot parameter. |
3963 | */ | 4074 | */ |
3964 | static void __init rcu_init_levelspread(struct rcu_state *rsp) | 4075 | static void __init rcu_init_levelspread(int *levelspread, const int *levelcnt) |
3965 | { | 4076 | { |
3966 | int i; | 4077 | int i; |
3967 | 4078 | ||
3968 | if (rcu_fanout_exact) { | 4079 | if (rcu_fanout_exact) { |
3969 | rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; | 4080 | levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; |
3970 | for (i = rcu_num_lvls - 2; i >= 0; i--) | 4081 | for (i = rcu_num_lvls - 2; i >= 0; i--) |
3971 | rsp->levelspread[i] = RCU_FANOUT; | 4082 | levelspread[i] = RCU_FANOUT; |
3972 | } else { | 4083 | } else { |
3973 | int ccur; | 4084 | int ccur; |
3974 | int cprv; | 4085 | int cprv; |
3975 | 4086 | ||
3976 | cprv = nr_cpu_ids; | 4087 | cprv = nr_cpu_ids; |
3977 | for (i = rcu_num_lvls - 1; i >= 0; i--) { | 4088 | for (i = rcu_num_lvls - 1; i >= 0; i--) { |
3978 | ccur = rsp->levelcnt[i]; | 4089 | ccur = levelcnt[i]; |
3979 | rsp->levelspread[i] = (cprv + ccur - 1) / ccur; | 4090 | levelspread[i] = (cprv + ccur - 1) / ccur; |
3980 | cprv = ccur; | 4091 | cprv = ccur; |
3981 | } | 4092 | } |
3982 | } | 4093 | } |
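The balanced branch of rcu_init_levelspread() is repeated ceiling division, working from the CPU count up toward the root. A worked example for a hypothetical 96-CPU machine with a two-level tree (the numbers are illustrative, not tied to any particular config):

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	/* Hypothetical box: 96 CPUs, two rcu_node levels of {1, 6} nodes. */
	int nr_cpu_ids = 96;
	int levelcnt[] = { 1, 6 };
	int levelspread[2];
	int cprv = nr_cpu_ids;

	for (int i = 1; i >= 0; i--) {
		int ccur = levelcnt[i];

		levelspread[i] = DIV_ROUND_UP(cprv, ccur);
		cprv = ccur;
	}
	/* Prints "spread: 6 16": six leaves under the root, 16 CPUs per leaf. */
	printf("spread: %d %d\n", levelspread[0], levelspread[1]);
	return 0;
}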
@@ -3988,23 +4099,20 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) | |||
3988 | static void __init rcu_init_one(struct rcu_state *rsp, | 4099 | static void __init rcu_init_one(struct rcu_state *rsp, |
3989 | struct rcu_data __percpu *rda) | 4100 | struct rcu_data __percpu *rda) |
3990 | { | 4101 | { |
3991 | static const char * const buf[] = { | 4102 | static const char * const buf[] = RCU_NODE_NAME_INIT; |
3992 | "rcu_node_0", | 4103 | static const char * const fqs[] = RCU_FQS_NAME_INIT; |
3993 | "rcu_node_1", | 4104 | static const char * const exp[] = RCU_EXP_NAME_INIT; |
3994 | "rcu_node_2", | 4105 | static const char * const exp_sched[] = RCU_EXP_SCHED_NAME_INIT; |
3995 | "rcu_node_3" }; /* Match MAX_RCU_LVLS */ | ||
3996 | static const char * const fqs[] = { | ||
3997 | "rcu_node_fqs_0", | ||
3998 | "rcu_node_fqs_1", | ||
3999 | "rcu_node_fqs_2", | ||
4000 | "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */ | ||
4001 | static u8 fl_mask = 0x1; | 4106 | static u8 fl_mask = 0x1; |
4107 | |||
4108 | int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */ | ||
4109 | int levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ | ||
4002 | int cpustride = 1; | 4110 | int cpustride = 1; |
4003 | int i; | 4111 | int i; |
4004 | int j; | 4112 | int j; |
4005 | struct rcu_node *rnp; | 4113 | struct rcu_node *rnp; |
4006 | 4114 | ||
4007 | BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */ | 4115 | BUILD_BUG_ON(RCU_NUM_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */ |
4008 | 4116 | ||
4009 | /* Silence gcc 4.8 false positive about array index out of range. */ | 4117 | /* Silence gcc 4.8 false positive about array index out of range. */ |
4010 | if (rcu_num_lvls <= 0 || rcu_num_lvls > RCU_NUM_LVLS) | 4118 | if (rcu_num_lvls <= 0 || rcu_num_lvls > RCU_NUM_LVLS) |
@@ -4013,19 +4121,19 @@ static void __init rcu_init_one(struct rcu_state *rsp, | |||
4013 | /* Initialize the level-tracking arrays. */ | 4121 | /* Initialize the level-tracking arrays. */ |
4014 | 4122 | ||
4015 | for (i = 0; i < rcu_num_lvls; i++) | 4123 | for (i = 0; i < rcu_num_lvls; i++) |
4016 | rsp->levelcnt[i] = num_rcu_lvl[i]; | 4124 | levelcnt[i] = num_rcu_lvl[i]; |
4017 | for (i = 1; i < rcu_num_lvls; i++) | 4125 | for (i = 1; i < rcu_num_lvls; i++) |
4018 | rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1]; | 4126 | rsp->level[i] = rsp->level[i - 1] + levelcnt[i - 1]; |
4019 | rcu_init_levelspread(rsp); | 4127 | rcu_init_levelspread(levelspread, levelcnt); |
4020 | rsp->flavor_mask = fl_mask; | 4128 | rsp->flavor_mask = fl_mask; |
4021 | fl_mask <<= 1; | 4129 | fl_mask <<= 1; |
4022 | 4130 | ||
4023 | /* Initialize the elements themselves, starting from the leaves. */ | 4131 | /* Initialize the elements themselves, starting from the leaves. */ |
4024 | 4132 | ||
4025 | for (i = rcu_num_lvls - 1; i >= 0; i--) { | 4133 | for (i = rcu_num_lvls - 1; i >= 0; i--) { |
4026 | cpustride *= rsp->levelspread[i]; | 4134 | cpustride *= levelspread[i]; |
4027 | rnp = rsp->level[i]; | 4135 | rnp = rsp->level[i]; |
4028 | for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { | 4136 | for (j = 0; j < levelcnt[i]; j++, rnp++) { |
4029 | raw_spin_lock_init(&rnp->lock); | 4137 | raw_spin_lock_init(&rnp->lock); |
4030 | lockdep_set_class_and_name(&rnp->lock, | 4138 | lockdep_set_class_and_name(&rnp->lock, |
4031 | &rcu_node_class[i], buf[i]); | 4139 | &rcu_node_class[i], buf[i]); |
@@ -4045,14 +4153,23 @@ static void __init rcu_init_one(struct rcu_state *rsp, | |||
4045 | rnp->grpmask = 0; | 4153 | rnp->grpmask = 0; |
4046 | rnp->parent = NULL; | 4154 | rnp->parent = NULL; |
4047 | } else { | 4155 | } else { |
4048 | rnp->grpnum = j % rsp->levelspread[i - 1]; | 4156 | rnp->grpnum = j % levelspread[i - 1]; |
4049 | rnp->grpmask = 1UL << rnp->grpnum; | 4157 | rnp->grpmask = 1UL << rnp->grpnum; |
4050 | rnp->parent = rsp->level[i - 1] + | 4158 | rnp->parent = rsp->level[i - 1] + |
4051 | j / rsp->levelspread[i - 1]; | 4159 | j / levelspread[i - 1]; |
4052 | } | 4160 | } |
4053 | rnp->level = i; | 4161 | rnp->level = i; |
4054 | INIT_LIST_HEAD(&rnp->blkd_tasks); | 4162 | INIT_LIST_HEAD(&rnp->blkd_tasks); |
4055 | rcu_init_one_nocb(rnp); | 4163 | rcu_init_one_nocb(rnp); |
4164 | mutex_init(&rnp->exp_funnel_mutex); | ||
4165 | if (rsp == &rcu_sched_state) | ||
4166 | lockdep_set_class_and_name( | ||
4167 | &rnp->exp_funnel_mutex, | ||
4168 | &rcu_exp_sched_class[i], exp_sched[i]); | ||
4169 | else | ||
4170 | lockdep_set_class_and_name( | ||
4171 | &rnp->exp_funnel_mutex, | ||
4172 | &rcu_exp_class[i], exp[i]); | ||
4056 | } | 4173 | } |
4057 | } | 4174 | } |
4058 | 4175 | ||
@@ -4076,9 +4193,7 @@ static void __init rcu_init_geometry(void) | |||
4076 | { | 4193 | { |
4077 | ulong d; | 4194 | ulong d; |
4078 | int i; | 4195 | int i; |
4079 | int j; | 4196 | int rcu_capacity[RCU_NUM_LVLS]; |
4080 | int n = nr_cpu_ids; | ||
4081 | int rcu_capacity[MAX_RCU_LVLS + 1]; | ||
4082 | 4197 | ||
4083 | /* | 4198 | /* |
4084 | * Initialize any unspecified boot parameters. | 4199 | * Initialize any unspecified boot parameters. |
@@ -4101,47 +4216,49 @@ static void __init rcu_init_geometry(void) | |||
4101 | rcu_fanout_leaf, nr_cpu_ids); | 4216 | rcu_fanout_leaf, nr_cpu_ids); |
4102 | 4217 | ||
4103 | /* | 4218 | /* |
4104 | * Compute number of nodes that can be handled an rcu_node tree | ||
4105 | * with the given number of levels. Setting rcu_capacity[0] makes | ||
4106 | * some of the arithmetic easier. | ||
4107 | */ | ||
4108 | rcu_capacity[0] = 1; | ||
4109 | rcu_capacity[1] = rcu_fanout_leaf; | ||
4110 | for (i = 2; i <= MAX_RCU_LVLS; i++) | ||
4111 | rcu_capacity[i] = rcu_capacity[i - 1] * RCU_FANOUT; | ||
4112 | |||
4113 | /* | ||
4114 | * The boot-time rcu_fanout_leaf parameter is only permitted | 4219 | * The boot-time rcu_fanout_leaf parameter is only permitted |
4115 | * to increase the leaf-level fanout, not decrease it. Of course, | 4220 | * to increase the leaf-level fanout, not decrease it. Of course, |
4116 | * the leaf-level fanout cannot exceed the number of bits in | 4221 | * the leaf-level fanout cannot exceed the number of bits in |
4117 | * the rcu_node masks. Finally, the tree must be able to accommodate | 4222 | * the rcu_node masks. Complain and fall back to the compile- |
4118 | * the configured number of CPUs. Complain and fall back to the | 4223 | * time values if these limits are exceeded. |
4119 | * compile-time values if these limits are exceeded. | ||
4120 | */ | 4224 | */ |
4121 | if (rcu_fanout_leaf < RCU_FANOUT_LEAF || | 4225 | if (rcu_fanout_leaf < RCU_FANOUT_LEAF || |
4122 | rcu_fanout_leaf > sizeof(unsigned long) * 8 || | 4226 | rcu_fanout_leaf > sizeof(unsigned long) * 8) { |
4123 | n > rcu_capacity[MAX_RCU_LVLS]) { | 4227 | rcu_fanout_leaf = RCU_FANOUT_LEAF; |
4124 | WARN_ON(1); | 4228 | WARN_ON(1); |
4125 | return; | 4229 | return; |
4126 | } | 4230 | } |
4127 | 4231 | ||
4232 | /* | ||
4233 | * Compute number of nodes that can be handled by an rcu_node tree | ||
4234 | * with the given number of levels. | ||
4235 | */ | ||
4236 | rcu_capacity[0] = rcu_fanout_leaf; | ||
4237 | for (i = 1; i < RCU_NUM_LVLS; i++) | ||
4238 | rcu_capacity[i] = rcu_capacity[i - 1] * RCU_FANOUT; | ||
4239 | |||
4240 | /* | ||
4241 | * The tree must be able to accommodate the configured number of CPUs. | ||
4242 | * If this limit is exceeded then we have a serious problem elsewhere. | ||
4243 | */ | ||
4244 | if (nr_cpu_ids > rcu_capacity[RCU_NUM_LVLS - 1]) | ||
4245 | panic("rcu_init_geometry: rcu_capacity[] is too small"); | ||
4246 | |||
4247 | /* Calculate the number of levels in the tree. */ | ||
4248 | for (i = 0; nr_cpu_ids > rcu_capacity[i]; i++) { | ||
4249 | } | ||
4250 | rcu_num_lvls = i + 1; | ||
4251 | |||
4128 | /* Calculate the number of rcu_nodes at each level of the tree. */ | 4252 | /* Calculate the number of rcu_nodes at each level of the tree. */ |
4129 | for (i = 1; i <= MAX_RCU_LVLS; i++) | 4253 | for (i = 0; i < rcu_num_lvls; i++) { |
4130 | if (n <= rcu_capacity[i]) { | 4254 | int cap = rcu_capacity[(rcu_num_lvls - 1) - i]; |
4131 | for (j = 0; j <= i; j++) | 4255 | num_rcu_lvl[i] = DIV_ROUND_UP(nr_cpu_ids, cap); |
4132 | num_rcu_lvl[j] = | 4256 | } |
4133 | DIV_ROUND_UP(n, rcu_capacity[i - j]); | ||
4134 | rcu_num_lvls = i; | ||
4135 | for (j = i + 1; j <= MAX_RCU_LVLS; j++) | ||
4136 | num_rcu_lvl[j] = 0; | ||
4137 | break; | ||
4138 | } | ||
4139 | 4257 | ||
4140 | /* Calculate the total number of rcu_node structures. */ | 4258 | /* Calculate the total number of rcu_node structures. */ |
4141 | rcu_num_nodes = 0; | 4259 | rcu_num_nodes = 0; |
4142 | for (i = 0; i <= MAX_RCU_LVLS; i++) | 4260 | for (i = 0; i < rcu_num_lvls; i++) |
4143 | rcu_num_nodes += num_rcu_lvl[i]; | 4261 | rcu_num_nodes += num_rcu_lvl[i]; |
4144 | rcu_num_nodes -= n; | ||
4145 | } | 4262 | } |
4146 | 4263 | ||
4147 | /* | 4264 | /* |
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 4adb7ca0bf47..2e991f8361e4 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h | |||
@@ -27,6 +27,7 @@ | |||
27 | #include <linux/threads.h> | 27 | #include <linux/threads.h> |
28 | #include <linux/cpumask.h> | 28 | #include <linux/cpumask.h> |
29 | #include <linux/seqlock.h> | 29 | #include <linux/seqlock.h> |
30 | #include <linux/stop_machine.h> | ||
30 | 31 | ||
31 | /* | 32 | /* |
32 | * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and | 33 | * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and |
@@ -36,8 +37,6 @@ | |||
36 | * Of course, your mileage may vary. | 37 | * Of course, your mileage may vary. |
37 | */ | 38 | */ |
38 | 39 | ||
39 | #define MAX_RCU_LVLS 4 | ||
40 | |||
41 | #ifdef CONFIG_RCU_FANOUT | 40 | #ifdef CONFIG_RCU_FANOUT |
42 | #define RCU_FANOUT CONFIG_RCU_FANOUT | 41 | #define RCU_FANOUT CONFIG_RCU_FANOUT |
43 | #else /* #ifdef CONFIG_RCU_FANOUT */ | 42 | #else /* #ifdef CONFIG_RCU_FANOUT */ |
@@ -66,38 +65,53 @@ | |||
66 | #if NR_CPUS <= RCU_FANOUT_1 | 65 | #if NR_CPUS <= RCU_FANOUT_1 |
67 | # define RCU_NUM_LVLS 1 | 66 | # define RCU_NUM_LVLS 1 |
68 | # define NUM_RCU_LVL_0 1 | 67 | # define NUM_RCU_LVL_0 1 |
69 | # define NUM_RCU_LVL_1 (NR_CPUS) | 68 | # define NUM_RCU_NODES NUM_RCU_LVL_0 |
70 | # define NUM_RCU_LVL_2 0 | 69 | # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 } |
71 | # define NUM_RCU_LVL_3 0 | 70 | # define RCU_NODE_NAME_INIT { "rcu_node_0" } |
72 | # define NUM_RCU_LVL_4 0 | 71 | # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" } |
72 | # define RCU_EXP_NAME_INIT { "rcu_node_exp_0" } | ||
73 | # define RCU_EXP_SCHED_NAME_INIT \ | ||
74 | { "rcu_node_exp_sched_0" } | ||
73 | #elif NR_CPUS <= RCU_FANOUT_2 | 75 | #elif NR_CPUS <= RCU_FANOUT_2 |
74 | # define RCU_NUM_LVLS 2 | 76 | # define RCU_NUM_LVLS 2 |
75 | # define NUM_RCU_LVL_0 1 | 77 | # define NUM_RCU_LVL_0 1 |
76 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) | 78 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) |
77 | # define NUM_RCU_LVL_2 (NR_CPUS) | 79 | # define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1) |
78 | # define NUM_RCU_LVL_3 0 | 80 | # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 } |
79 | # define NUM_RCU_LVL_4 0 | 81 | # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" } |
82 | # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" } | ||
83 | # define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1" } | ||
84 | # define RCU_EXP_SCHED_NAME_INIT \ | ||
85 | { "rcu_node_exp_sched_0", "rcu_node_exp_sched_1" } | ||
80 | #elif NR_CPUS <= RCU_FANOUT_3 | 86 | #elif NR_CPUS <= RCU_FANOUT_3 |
81 | # define RCU_NUM_LVLS 3 | 87 | # define RCU_NUM_LVLS 3 |
82 | # define NUM_RCU_LVL_0 1 | 88 | # define NUM_RCU_LVL_0 1 |
83 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) | 89 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) |
84 | # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) | 90 | # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) |
85 | # define NUM_RCU_LVL_3 (NR_CPUS) | 91 | # define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2) |
86 | # define NUM_RCU_LVL_4 0 | 92 | # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 } |
93 | # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" } | ||
94 | # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" } | ||
95 | # define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2" } | ||
96 | # define RCU_EXP_SCHED_NAME_INIT \ | ||
97 | { "rcu_node_exp_sched_0", "rcu_node_exp_sched_1", "rcu_node_exp_sched_2" } | ||
87 | #elif NR_CPUS <= RCU_FANOUT_4 | 98 | #elif NR_CPUS <= RCU_FANOUT_4 |
88 | # define RCU_NUM_LVLS 4 | 99 | # define RCU_NUM_LVLS 4 |
89 | # define NUM_RCU_LVL_0 1 | 100 | # define NUM_RCU_LVL_0 1 |
90 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3) | 101 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3) |
91 | # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) | 102 | # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) |
92 | # define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) | 103 | # define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) |
93 | # define NUM_RCU_LVL_4 (NR_CPUS) | 104 | # define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3) |
105 | # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 } | ||
106 | # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" } | ||
107 | # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" } | ||
108 | # define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2", "rcu_node_exp_3" } | ||
109 | # define RCU_EXP_SCHED_NAME_INIT \ | ||
110 | { "rcu_node_exp_sched_0", "rcu_node_exp_sched_1", "rcu_node_exp_sched_2", "rcu_node_exp_sched_3" } | ||
94 | #else | 111 | #else |
95 | # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" | 112 | # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" |
96 | #endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ | 113 | #endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ |
97 | 114 | ||
98 | #define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4) | ||
99 | #define NUM_RCU_NODES (RCU_SUM - NR_CPUS) | ||
100 | |||
101 | extern int rcu_num_lvls; | 115 | extern int rcu_num_lvls; |
102 | extern int rcu_num_nodes; | 116 | extern int rcu_num_nodes; |
103 | 117 | ||
@@ -236,6 +250,8 @@ struct rcu_node { | |||
236 | int need_future_gp[2]; | 250 | int need_future_gp[2]; |
237 | /* Counts of upcoming no-CB GP requests. */ | 251 | /* Counts of upcoming no-CB GP requests. */ |
238 | raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp; | 252 | raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp; |
253 | |||
254 | struct mutex exp_funnel_mutex ____cacheline_internodealigned_in_smp; | ||
239 | } ____cacheline_internodealigned_in_smp; | 255 | } ____cacheline_internodealigned_in_smp; |
240 | 256 | ||
241 | /* | 257 | /* |
@@ -287,12 +303,13 @@ struct rcu_data { | |||
287 | bool gpwrap; /* Possible gpnum/completed wrap. */ | 303 | bool gpwrap; /* Possible gpnum/completed wrap. */ |
288 | struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ | 304 | struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ |
289 | unsigned long grpmask; /* Mask to apply to leaf qsmask. */ | 305 | unsigned long grpmask; /* Mask to apply to leaf qsmask. */ |
290 | #ifdef CONFIG_RCU_CPU_STALL_INFO | ||
291 | unsigned long ticks_this_gp; /* The number of scheduling-clock */ | 306 | unsigned long ticks_this_gp; /* The number of scheduling-clock */ |
292 | /* ticks this CPU has handled */ | 307 | /* ticks this CPU has handled */ |
293 | /* during and after the last grace */ | 308 | /* during and after the last grace */ |
294 | /* period it is aware of. */ | 309 | /* period it is aware of. */ |
295 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ | 310 | struct cpu_stop_work exp_stop_work; |
311 | /* Expedited grace-period control */ | ||
312 | /* for CPU stopping. */ | ||
296 | 313 | ||
297 | /* 2) batch handling */ | 314 | /* 2) batch handling */ |
298 | /* | 315 | /* |
@@ -355,11 +372,13 @@ struct rcu_data { | |||
355 | unsigned long n_rp_nocb_defer_wakeup; | 372 | unsigned long n_rp_nocb_defer_wakeup; |
356 | unsigned long n_rp_need_nothing; | 373 | unsigned long n_rp_need_nothing; |
357 | 374 | ||
358 | /* 6) _rcu_barrier() and OOM callbacks. */ | 375 | /* 6) _rcu_barrier(), OOM callbacks, and expediting. */ |
359 | struct rcu_head barrier_head; | 376 | struct rcu_head barrier_head; |
360 | #ifdef CONFIG_RCU_FAST_NO_HZ | 377 | #ifdef CONFIG_RCU_FAST_NO_HZ |
361 | struct rcu_head oom_head; | 378 | struct rcu_head oom_head; |
362 | #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ | 379 | #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ |
380 | struct mutex exp_funnel_mutex; | ||
381 | bool exp_done; /* Expedited QS for this CPU? */ | ||
363 | 382 | ||
364 | /* 7) Callback offloading. */ | 383 | /* 7) Callback offloading. */ |
365 | #ifdef CONFIG_RCU_NOCB_CPU | 384 | #ifdef CONFIG_RCU_NOCB_CPU |
@@ -387,9 +406,7 @@ struct rcu_data { | |||
387 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ | 406 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ |
388 | 407 | ||
389 | /* 8) RCU CPU stall data. */ | 408 | /* 8) RCU CPU stall data. */ |
390 | #ifdef CONFIG_RCU_CPU_STALL_INFO | ||
391 | unsigned int softirq_snap; /* Snapshot of softirq activity. */ | 409 | unsigned int softirq_snap; /* Snapshot of softirq activity. */ |
392 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ | ||
393 | 410 | ||
394 | int cpu; | 411 | int cpu; |
395 | struct rcu_state *rsp; | 412 | struct rcu_state *rsp; |
@@ -442,9 +459,9 @@ do { \ | |||
442 | */ | 459 | */ |
443 | struct rcu_state { | 460 | struct rcu_state { |
444 | struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */ | 461 | struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */ |
445 | struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */ | 462 | struct rcu_node *level[RCU_NUM_LVLS + 1]; |
446 | u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ | 463 | /* Hierarchy levels (+1 to */ |
447 | u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ | 464 | /* shut bogus gcc warning) */ |
448 | u8 flavor_mask; /* bit in flavor mask. */ | 465 | u8 flavor_mask; /* bit in flavor mask. */ |
449 | struct rcu_data __percpu *rda; /* pointer to per-CPU rcu_data. */ | 466 | struct rcu_data __percpu *rda; /* pointer to per-CPU rcu_data. */ |
450 | void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ | 467 | void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ |
@@ -479,21 +496,18 @@ struct rcu_state { | |||
479 | struct mutex barrier_mutex; /* Guards barrier fields. */ | 496 | struct mutex barrier_mutex; /* Guards barrier fields. */ |
480 | atomic_t barrier_cpu_count; /* # CPUs waiting on. */ | 497 | atomic_t barrier_cpu_count; /* # CPUs waiting on. */ |
481 | struct completion barrier_completion; /* Wake at barrier end. */ | 498 | struct completion barrier_completion; /* Wake at barrier end. */ |
482 | unsigned long n_barrier_done; /* ++ at start and end of */ | 499 | unsigned long barrier_sequence; /* ++ at start and end of */ |
483 | /* _rcu_barrier(). */ | 500 | /* _rcu_barrier(). */ |
484 | /* End of fields guarded by barrier_mutex. */ | 501 | /* End of fields guarded by barrier_mutex. */ |
485 | 502 | ||
486 | atomic_long_t expedited_start; /* Starting ticket. */ | 503 | unsigned long expedited_sequence; /* Take a ticket. */ |
487 | atomic_long_t expedited_done; /* Done ticket. */ | 504 | atomic_long_t expedited_workdone0; /* # done by others #0. */ |
488 | atomic_long_t expedited_wrap; /* # near-wrap incidents. */ | ||
489 | atomic_long_t expedited_tryfail; /* # acquisition failures. */ | ||
490 | atomic_long_t expedited_workdone1; /* # done by others #1. */ | 505 | atomic_long_t expedited_workdone1; /* # done by others #1. */ |
491 | atomic_long_t expedited_workdone2; /* # done by others #2. */ | 506 | atomic_long_t expedited_workdone2; /* # done by others #2. */ |
507 | atomic_long_t expedited_workdone3; /* # done by others #3. */ | ||
492 | atomic_long_t expedited_normal; /* # fallbacks to normal. */ | 508 | atomic_long_t expedited_normal; /* # fallbacks to normal. */ |
493 | atomic_long_t expedited_stoppedcpus; /* # successful stop_cpus. */ | 509 | atomic_t expedited_need_qs; /* # CPUs left to check in. */ |
494 | atomic_long_t expedited_done_tries; /* # tries to update _done. */ | 510 | wait_queue_head_t expedited_wq; /* Wait for check-ins. */ |
495 | atomic_long_t expedited_done_lost; /* # times beaten to _done. */ | ||
496 | atomic_long_t expedited_done_exit; /* # times exited _done loop. */ | ||
497 | 511 | ||
498 | unsigned long jiffies_force_qs; /* Time at which to invoke */ | 512 | unsigned long jiffies_force_qs; /* Time at which to invoke */ |
499 | /* force_quiescent_state(). */ | 513 | /* force_quiescent_state(). */ |
@@ -527,7 +541,11 @@ struct rcu_state { | |||
527 | /* Values for rcu_state structure's gp_flags field. */ | 541 | /* Values for rcu_state structure's gp_flags field. */ |
528 | #define RCU_GP_WAIT_INIT 0 /* Initial state. */ | 542 | #define RCU_GP_WAIT_INIT 0 /* Initial state. */ |
529 | #define RCU_GP_WAIT_GPS 1 /* Wait for grace-period start. */ | 543 | #define RCU_GP_WAIT_GPS 1 /* Wait for grace-period start. */ |
530 | #define RCU_GP_WAIT_FQS 2 /* Wait for force-quiescent-state time. */ | 544 | #define RCU_GP_DONE_GPS 2 /* Wait done for grace-period start. */ |
545 | #define RCU_GP_WAIT_FQS 3 /* Wait for force-quiescent-state time. */ | ||
546 | #define RCU_GP_DOING_FQS 4 /* Wait done for force-quiescent-state time. */ | ||
547 | #define RCU_GP_CLEANUP 5 /* Grace-period cleanup started. */ | ||
548 | #define RCU_GP_CLEANED 6 /* Grace-period cleanup complete. */ | ||
531 | 549 | ||
532 | extern struct list_head rcu_struct_flavors; | 550 | extern struct list_head rcu_struct_flavors; |
533 | 551 | ||
@@ -635,3 +653,15 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) | |||
635 | #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ | 653 | #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ |
636 | } | 654 | } |
637 | #endif /* #ifdef CONFIG_RCU_TRACE */ | 655 | #endif /* #ifdef CONFIG_RCU_TRACE */ |
656 | |||
657 | /* | ||
658 | * Place this after a lock-acquisition primitive to guarantee that | ||
660 | * an UNLOCK+LOCK pair acts as a full barrier. This guarantee applies | ||
660 | * if the UNLOCK and LOCK are executed by the same CPU or if the | ||
661 | * UNLOCK and LOCK operate on the same lock variable. | ||
662 | */ | ||
663 | #ifdef CONFIG_PPC | ||
664 | #define smp_mb__after_unlock_lock() smp_mb() /* Full ordering for lock. */ | ||
665 | #else /* #ifdef CONFIG_PPC */ | ||
666 | #define smp_mb__after_unlock_lock() do { } while (0) | ||
667 | #endif /* #else #ifdef CONFIG_PPC */ | ||
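The usage pattern for smp_mb__after_unlock_lock() is to place it immediately after the lock acquisition that, combined with an earlier unlock, must behave as a full barrier. A hedged kernel-style sketch (the locks and variables are placeholders, and the macro is assumed visible even though it lives in this private header):

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(lock_a);
static DEFINE_SPINLOCK(lock_b);
static int x, y;

/*
 * Release lock_a and then take lock_b.  Without the barrier, the
 * UNLOCK+LOCK pair is not guaranteed to order the store to x against
 * the later load of y on all architectures; with it, the pair acts as
 * a full memory barrier.
 */
static int handoff(void)
{
	int r;

	spin_lock(&lock_a);
	WRITE_ONCE(x, 1);
	spin_unlock(&lock_a);

	spin_lock(&lock_b);
	smp_mb__after_unlock_lock();	/* no-op except on PPC (smp_mb()) */
	r = READ_ONCE(y);
	spin_unlock(&lock_b);
	return r;
}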
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 013485fb2b06..b2bf3963a0ae 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h | |||
@@ -82,10 +82,8 @@ static void __init rcu_bootup_announce_oddness(void) | |||
82 | pr_info("\tRCU lockdep checking is enabled.\n"); | 82 | pr_info("\tRCU lockdep checking is enabled.\n"); |
83 | if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_RUNNABLE)) | 83 | if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_RUNNABLE)) |
84 | pr_info("\tRCU torture testing starts during boot.\n"); | 84 | pr_info("\tRCU torture testing starts during boot.\n"); |
85 | if (IS_ENABLED(CONFIG_RCU_CPU_STALL_INFO)) | 85 | if (RCU_NUM_LVLS >= 4) |
86 | pr_info("\tAdditional per-CPU info printed with stalls.\n"); | 86 | pr_info("\tFour(or more)-level hierarchy is enabled.\n"); |
87 | if (NUM_RCU_LVL_4 != 0) | ||
88 | pr_info("\tFour-level hierarchy is enabled.\n"); | ||
89 | if (RCU_FANOUT_LEAF != 16) | 87 | if (RCU_FANOUT_LEAF != 16) |
90 | pr_info("\tBuild-time adjustment of leaf fanout to %d.\n", | 88 | pr_info("\tBuild-time adjustment of leaf fanout to %d.\n", |
91 | RCU_FANOUT_LEAF); | 89 | RCU_FANOUT_LEAF); |
@@ -418,8 +416,6 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp) | |||
418 | rcu_print_detail_task_stall_rnp(rnp); | 416 | rcu_print_detail_task_stall_rnp(rnp); |
419 | } | 417 | } |
420 | 418 | ||
421 | #ifdef CONFIG_RCU_CPU_STALL_INFO | ||
422 | |||
423 | static void rcu_print_task_stall_begin(struct rcu_node *rnp) | 419 | static void rcu_print_task_stall_begin(struct rcu_node *rnp) |
424 | { | 420 | { |
425 | pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", | 421 | pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", |
@@ -431,18 +427,6 @@ static void rcu_print_task_stall_end(void) | |||
431 | pr_cont("\n"); | 427 | pr_cont("\n"); |
432 | } | 428 | } |
433 | 429 | ||
434 | #else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ | ||
435 | |||
436 | static void rcu_print_task_stall_begin(struct rcu_node *rnp) | ||
437 | { | ||
438 | } | ||
439 | |||
440 | static void rcu_print_task_stall_end(void) | ||
441 | { | ||
442 | } | ||
443 | |||
444 | #endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */ | ||
445 | |||
446 | /* | 430 | /* |
447 | * Scan the current list of tasks blocked within RCU read-side critical | 431 | * Scan the current list of tasks blocked within RCU read-side critical |
448 | * sections, printing out the tid of each. | 432 | * sections, printing out the tid of each. |
@@ -538,10 +522,10 @@ EXPORT_SYMBOL_GPL(call_rcu); | |||
538 | */ | 522 | */ |
539 | void synchronize_rcu(void) | 523 | void synchronize_rcu(void) |
540 | { | 524 | { |
541 | rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && | 525 | RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || |
542 | !lock_is_held(&rcu_lock_map) && | 526 | lock_is_held(&rcu_lock_map) || |
543 | !lock_is_held(&rcu_sched_lock_map), | 527 | lock_is_held(&rcu_sched_lock_map), |
544 | "Illegal synchronize_rcu() in RCU read-side critical section"); | 528 | "Illegal synchronize_rcu() in RCU read-side critical section"); |
545 | if (!rcu_scheduler_active) | 529 | if (!rcu_scheduler_active) |
546 | return; | 530 | return; |
547 | if (rcu_gp_is_expedited()) | 531 | if (rcu_gp_is_expedited()) |
@@ -552,8 +536,6 @@ void synchronize_rcu(void) | |||
552 | EXPORT_SYMBOL_GPL(synchronize_rcu); | 536 | EXPORT_SYMBOL_GPL(synchronize_rcu); |
553 | 537 | ||
554 | static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq); | 538 | static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq); |
555 | static unsigned long sync_rcu_preempt_exp_count; | ||
556 | static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex); | ||
557 | 539 | ||
558 | /* | 540 | /* |
559 | * Return non-zero if there are any tasks in RCU read-side critical | 541 | * Return non-zero if there are any tasks in RCU read-side critical |
@@ -573,7 +555,7 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp) | |||
573 | * for the current expedited grace period. Works only for preemptible | 555 | * for the current expedited grace period. Works only for preemptible |
574 | * RCU -- other RCU implementation use other means. | 556 | * RCU -- other RCU implementation use other means. |
575 | * | 557 | * |
576 | * Caller must hold sync_rcu_preempt_exp_mutex. | 558 | * Caller must hold the root rcu_node's exp_funnel_mutex. |
577 | */ | 559 | */ |
578 | static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) | 560 | static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) |
579 | { | 561 | { |
@@ -589,7 +571,7 @@ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) | |||
589 | * recursively up the tree. (Calm down, calm down, we do the recursion | 571 | * recursively up the tree. (Calm down, calm down, we do the recursion |
590 | * iteratively!) | 572 | * iteratively!) |
591 | * | 573 | * |
592 | * Caller must hold sync_rcu_preempt_exp_mutex. | 574 | * Caller must hold the root rcu_node's exp_funnel_mutex. |
593 | */ | 575 | */ |
594 | static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, | 576 | static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, |
595 | bool wake) | 577 | bool wake) |
@@ -628,7 +610,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, | |||
628 | * set the ->expmask bits on the leaf rcu_node structures to tell phase 2 | 610 | * set the ->expmask bits on the leaf rcu_node structures to tell phase 2 |
629 | * that work is needed here. | 611 | * that work is needed here. |
630 | * | 612 | * |
631 | * Caller must hold sync_rcu_preempt_exp_mutex. | 613 | * Caller must hold the root rcu_node's exp_funnel_mutex. |
632 | */ | 614 | */ |
633 | static void | 615 | static void |
634 | sync_rcu_preempt_exp_init1(struct rcu_state *rsp, struct rcu_node *rnp) | 616 | sync_rcu_preempt_exp_init1(struct rcu_state *rsp, struct rcu_node *rnp) |
@@ -671,7 +653,7 @@ sync_rcu_preempt_exp_init1(struct rcu_state *rsp, struct rcu_node *rnp) | |||
671 | * invoke rcu_report_exp_rnp() to clear out the upper-level ->expmask bits, | 653 | * invoke rcu_report_exp_rnp() to clear out the upper-level ->expmask bits, |
672 | * enabling rcu_read_unlock_special() to do the bit-clearing. | 654 | * enabling rcu_read_unlock_special() to do the bit-clearing. |
673 | * | 655 | * |
674 | * Caller must hold sync_rcu_preempt_exp_mutex. | 656 | * Caller must hold the root rcu_node's exp_funnel_mutex. |
675 | */ | 657 | */ |
676 | static void | 658 | static void |
677 | sync_rcu_preempt_exp_init2(struct rcu_state *rsp, struct rcu_node *rnp) | 659 | sync_rcu_preempt_exp_init2(struct rcu_state *rsp, struct rcu_node *rnp) |
@@ -719,51 +701,17 @@ sync_rcu_preempt_exp_init2(struct rcu_state *rsp, struct rcu_node *rnp) | |||
719 | void synchronize_rcu_expedited(void) | 701 | void synchronize_rcu_expedited(void) |
720 | { | 702 | { |
721 | struct rcu_node *rnp; | 703 | struct rcu_node *rnp; |
704 | struct rcu_node *rnp_unlock; | ||
722 | struct rcu_state *rsp = rcu_state_p; | 705 | struct rcu_state *rsp = rcu_state_p; |
723 | unsigned long snap; | 706 | unsigned long s; |
724 | int trycount = 0; | ||
725 | 707 | ||
726 | smp_mb(); /* Caller's modifications seen first by other CPUs. */ | 708 | s = rcu_exp_gp_seq_snap(rsp); |
727 | snap = READ_ONCE(sync_rcu_preempt_exp_count) + 1; | ||
728 | smp_mb(); /* Above access cannot bleed into critical section. */ | ||
729 | 709 | ||
730 | /* | 710 | rnp_unlock = exp_funnel_lock(rsp, s); |
731 | * Block CPU-hotplug operations. This means that any CPU-hotplug | 711 | if (rnp_unlock == NULL) |
732 | * operation that finds an rcu_node structure with tasks in the | 712 | return; /* Someone else did our work for us. */ |
733 | * process of being boosted will know that all tasks blocking | ||
734 | * this expedited grace period will already be in the process of | ||
735 | * being boosted. This simplifies the process of moving tasks | ||
736 | * from leaf to root rcu_node structures. | ||
737 | */ | ||
738 | if (!try_get_online_cpus()) { | ||
739 | /* CPU-hotplug operation in flight, fall back to normal GP. */ | ||
740 | wait_rcu_gp(call_rcu); | ||
741 | return; | ||
742 | } | ||
743 | 713 | ||
744 | /* | 714 | rcu_exp_gp_seq_start(rsp); |
745 | * Acquire lock, falling back to synchronize_rcu() if too many | ||
746 | * lock-acquisition failures. Of course, if someone does the | ||
747 | * expedited grace period for us, just leave. | ||
748 | */ | ||
749 | while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) { | ||
750 | if (ULONG_CMP_LT(snap, | ||
751 | READ_ONCE(sync_rcu_preempt_exp_count))) { | ||
752 | put_online_cpus(); | ||
753 | goto mb_ret; /* Others did our work for us. */ | ||
754 | } | ||
755 | if (trycount++ < 10) { | ||
756 | udelay(trycount * num_online_cpus()); | ||
757 | } else { | ||
758 | put_online_cpus(); | ||
759 | wait_rcu_gp(call_rcu); | ||
760 | return; | ||
761 | } | ||
762 | } | ||
763 | if (ULONG_CMP_LT(snap, READ_ONCE(sync_rcu_preempt_exp_count))) { | ||
764 | put_online_cpus(); | ||
765 | goto unlock_mb_ret; /* Others did our work for us. */ | ||
766 | } | ||
767 | 715 | ||
768 | /* force all RCU readers onto ->blkd_tasks lists. */ | 716 | /* force all RCU readers onto ->blkd_tasks lists. */ |
769 | synchronize_sched_expedited(); | 717 | synchronize_sched_expedited(); |
@@ -779,20 +727,14 @@ void synchronize_rcu_expedited(void) | |||
779 | rcu_for_each_leaf_node(rsp, rnp) | 727 | rcu_for_each_leaf_node(rsp, rnp) |
780 | sync_rcu_preempt_exp_init2(rsp, rnp); | 728 | sync_rcu_preempt_exp_init2(rsp, rnp); |
781 | 729 | ||
782 | put_online_cpus(); | ||
783 | |||
784 | /* Wait for snapshotted ->blkd_tasks lists to drain. */ | 730 | /* Wait for snapshotted ->blkd_tasks lists to drain. */ |
785 | rnp = rcu_get_root(rsp); | 731 | rnp = rcu_get_root(rsp); |
786 | wait_event(sync_rcu_preempt_exp_wq, | 732 | wait_event(sync_rcu_preempt_exp_wq, |
787 | sync_rcu_preempt_exp_done(rnp)); | 733 | sync_rcu_preempt_exp_done(rnp)); |
788 | 734 | ||
789 | /* Clean up and exit. */ | 735 | /* Clean up and exit. */ |
790 | smp_mb(); /* ensure expedited GP seen before counter increment. */ | 736 | rcu_exp_gp_seq_end(rsp); |
791 | WRITE_ONCE(sync_rcu_preempt_exp_count, sync_rcu_preempt_exp_count + 1); | 737 | mutex_unlock(&rnp_unlock->exp_funnel_mutex); |
792 | unlock_mb_ret: | ||
793 | mutex_unlock(&sync_rcu_preempt_exp_mutex); | ||
794 | mb_ret: | ||
795 | smp_mb(); /* ensure subsequent action seen after grace period. */ | ||
796 | } | 738 | } |
797 | EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); | 739 | EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); |
798 | 740 | ||
@@ -1061,8 +1003,7 @@ static int rcu_boost(struct rcu_node *rnp) | |||
1061 | } | 1003 | } |
1062 | 1004 | ||
1063 | /* | 1005 | /* |
1064 | * Priority-boosting kthread. One per leaf rcu_node and one for the | 1006 | * Priority-boosting kthread, one per leaf rcu_node. |
1065 | * root rcu_node. | ||
1066 | */ | 1007 | */ |
1067 | static int rcu_boost_kthread(void *arg) | 1008 | static int rcu_boost_kthread(void *arg) |
1068 | { | 1009 | { |
@@ -1680,12 +1621,10 @@ static int rcu_oom_notify(struct notifier_block *self, | |||
1680 | */ | 1621 | */ |
1681 | atomic_set(&oom_callback_count, 1); | 1622 | atomic_set(&oom_callback_count, 1); |
1682 | 1623 | ||
1683 | get_online_cpus(); | ||
1684 | for_each_online_cpu(cpu) { | 1624 | for_each_online_cpu(cpu) { |
1685 | smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1); | 1625 | smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1); |
1686 | cond_resched_rcu_qs(); | 1626 | cond_resched_rcu_qs(); |
1687 | } | 1627 | } |
1688 | put_online_cpus(); | ||
1689 | 1628 | ||
1690 | /* Unconditionally decrement: no need to wake ourselves up. */ | 1629 | /* Unconditionally decrement: no need to wake ourselves up. */ |
1691 | atomic_dec(&oom_callback_count); | 1630 | atomic_dec(&oom_callback_count); |
@@ -1706,8 +1645,6 @@ early_initcall(rcu_register_oom_notifier); | |||
1706 | 1645 | ||
1707 | #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ | 1646 | #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ |
1708 | 1647 | ||
1709 | #ifdef CONFIG_RCU_CPU_STALL_INFO | ||
1710 | |||
1711 | #ifdef CONFIG_RCU_FAST_NO_HZ | 1648 | #ifdef CONFIG_RCU_FAST_NO_HZ |
1712 | 1649 | ||
1713 | static void print_cpu_stall_fast_no_hz(char *cp, int cpu) | 1650 | static void print_cpu_stall_fast_no_hz(char *cp, int cpu) |
@@ -1796,33 +1733,6 @@ static void increment_cpu_stall_ticks(void) | |||
1796 | raw_cpu_inc(rsp->rda->ticks_this_gp); | 1733 | raw_cpu_inc(rsp->rda->ticks_this_gp); |
1797 | } | 1734 | } |
1798 | 1735 | ||
1799 | #else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ | ||
1800 | |||
1801 | static void print_cpu_stall_info_begin(void) | ||
1802 | { | ||
1803 | pr_cont(" {"); | ||
1804 | } | ||
1805 | |||
1806 | static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) | ||
1807 | { | ||
1808 | pr_cont(" %d", cpu); | ||
1809 | } | ||
1810 | |||
1811 | static void print_cpu_stall_info_end(void) | ||
1812 | { | ||
1813 | pr_cont("} "); | ||
1814 | } | ||
1815 | |||
1816 | static void zero_cpu_stall_ticks(struct rcu_data *rdp) | ||
1817 | { | ||
1818 | } | ||
1819 | |||
1820 | static void increment_cpu_stall_ticks(void) | ||
1821 | { | ||
1822 | } | ||
1823 | |||
1824 | #endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */ | ||
1825 | |||
1826 | #ifdef CONFIG_RCU_NOCB_CPU | 1736 | #ifdef CONFIG_RCU_NOCB_CPU |
1827 | 1737 | ||
1828 | /* | 1738 | /* |
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 3ea7ffc7d5c4..6fc4c5ff3bb5 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c | |||
@@ -81,9 +81,9 @@ static void r_stop(struct seq_file *m, void *v) | |||
81 | static int show_rcubarrier(struct seq_file *m, void *v) | 81 | static int show_rcubarrier(struct seq_file *m, void *v) |
82 | { | 82 | { |
83 | struct rcu_state *rsp = (struct rcu_state *)m->private; | 83 | struct rcu_state *rsp = (struct rcu_state *)m->private; |
84 | seq_printf(m, "bcc: %d nbd: %lu\n", | 84 | seq_printf(m, "bcc: %d bseq: %lu\n", |
85 | atomic_read(&rsp->barrier_cpu_count), | 85 | atomic_read(&rsp->barrier_cpu_count), |
86 | rsp->n_barrier_done); | 86 | rsp->barrier_sequence); |
87 | return 0; | 87 | return 0; |
88 | } | 88 | } |
89 | 89 | ||
@@ -185,18 +185,15 @@ static int show_rcuexp(struct seq_file *m, void *v) | |||
185 | { | 185 | { |
186 | struct rcu_state *rsp = (struct rcu_state *)m->private; | 186 | struct rcu_state *rsp = (struct rcu_state *)m->private; |
187 | 187 | ||
188 | seq_printf(m, "s=%lu d=%lu w=%lu tf=%lu wd1=%lu wd2=%lu n=%lu sc=%lu dt=%lu dl=%lu dx=%lu\n", | 188 | seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", |
189 | atomic_long_read(&rsp->expedited_start), | 189 | rsp->expedited_sequence, |
190 | atomic_long_read(&rsp->expedited_done), | 190 | atomic_long_read(&rsp->expedited_workdone0), |
191 | atomic_long_read(&rsp->expedited_wrap), | ||
192 | atomic_long_read(&rsp->expedited_tryfail), | ||
193 | atomic_long_read(&rsp->expedited_workdone1), | 191 | atomic_long_read(&rsp->expedited_workdone1), |
194 | atomic_long_read(&rsp->expedited_workdone2), | 192 | atomic_long_read(&rsp->expedited_workdone2), |
193 | atomic_long_read(&rsp->expedited_workdone3), | ||
195 | atomic_long_read(&rsp->expedited_normal), | 194 | atomic_long_read(&rsp->expedited_normal), |
196 | atomic_long_read(&rsp->expedited_stoppedcpus), | 195 | atomic_read(&rsp->expedited_need_qs), |
197 | atomic_long_read(&rsp->expedited_done_tries), | 196 | rsp->expedited_sequence / 2); |
198 | atomic_long_read(&rsp->expedited_done_lost), | ||
199 | atomic_long_read(&rsp->expedited_done_exit)); | ||
200 | return 0; | 197 | return 0; |
201 | } | 198 | } |
202 | 199 | ||
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index afaecb7a799a..7a0b3bc7c5ed 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c | |||
@@ -62,6 +62,55 @@ MODULE_ALIAS("rcupdate"); | |||
62 | 62 | ||
63 | module_param(rcu_expedited, int, 0); | 63 | module_param(rcu_expedited, int, 0); |
64 | 64 | ||
65 | #if defined(CONFIG_DEBUG_LOCK_ALLOC) && defined(CONFIG_PREEMPT_COUNT) | ||
66 | /** | ||
67 | * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section? | ||
68 | * | ||
69 | * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an | ||
70 | * RCU-sched read-side critical section. In absence of | ||
71 | * CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side | ||
72 | * critical section unless it can prove otherwise. Note that disabling | ||
73 | * of preemption (including disabling irqs) counts as an RCU-sched | ||
74 | * read-side critical section. This is useful for debug checks in functions | ||
75 | * that require that they be called within an RCU-sched read-side | ||
76 | * critical section. | ||
77 | * | ||
78 | * Check debug_lockdep_rcu_enabled() to prevent false positives during boot | ||
79 | * and while lockdep is disabled. | ||
80 | * | ||
81 | * Note that if the CPU is in the idle loop from an RCU point of | ||
82 | * view (ie: that we are in the section between rcu_idle_enter() and | ||
83 | * rcu_idle_exit()) then rcu_read_lock_held() returns false even if the CPU | ||
84 | * did an rcu_read_lock(). The reason for this is that RCU ignores CPUs | ||
85 | * that are in such a section, considering these as in extended quiescent | ||
86 | * state, so such a CPU is effectively never in an RCU read-side critical | ||
87 | * section regardless of what RCU primitives it invokes. This state of | ||
88 | * affairs is required --- we need to keep an RCU-free window in idle | ||
89 | * where the CPU may possibly enter into low power mode. This way we can | ||
90 | * notice an extended quiescent state to other CPUs that started a grace | ||
91 | * period. Otherwise we would delay any grace period as long as we run in | ||
92 | * the idle task. | ||
93 | * | ||
94 | * Similarly, we avoid claiming an SRCU read lock held if the current | ||
95 | * CPU is offline. | ||
96 | */ | ||
97 | int rcu_read_lock_sched_held(void) | ||
98 | { | ||
99 | int lockdep_opinion = 0; | ||
100 | |||
101 | if (!debug_lockdep_rcu_enabled()) | ||
102 | return 1; | ||
103 | if (!rcu_is_watching()) | ||
104 | return 0; | ||
105 | if (!rcu_lockdep_current_cpu_online()) | ||
106 | return 0; | ||
107 | if (debug_locks) | ||
108 | lockdep_opinion = lock_is_held(&rcu_sched_lock_map); | ||
109 | return lockdep_opinion || preempt_count() != 0 || irqs_disabled(); | ||
110 | } | ||
111 | EXPORT_SYMBOL(rcu_read_lock_sched_held); | ||
112 | #endif | ||
113 | |||
65 | #ifndef CONFIG_TINY_RCU | 114 | #ifndef CONFIG_TINY_RCU |
66 | 115 | ||
67 | static atomic_t rcu_expedited_nesting = | 116 | static atomic_t rcu_expedited_nesting = |
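With rcu_read_lock_sched_held() now defined out of line here, a caller that requires RCU-sched protection can assert it with RCU_LOCKDEP_WARN(), which this same series introduces. The helper below is hypothetical (struct foo and foo_table are made up); it only illustrates the intended usage.

struct foo { int data; };
static struct foo __rcu *foo_table[16];   /* hypothetical RCU-protected table */

/* Must run with preemption/IRQs disabled or under rcu_read_lock_sched(). */
static struct foo *foo_lookup_protected(int id)
{
        RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
                         "foo_lookup_protected() needs rcu_read_lock_sched()");
        return rcu_dereference_sched(foo_table[id]);
}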
@@ -269,20 +318,37 @@ void wakeme_after_rcu(struct rcu_head *head) | |||
269 | rcu = container_of(head, struct rcu_synchronize, head); | 318 | rcu = container_of(head, struct rcu_synchronize, head); |
270 | complete(&rcu->completion); | 319 | complete(&rcu->completion); |
271 | } | 320 | } |
321 | EXPORT_SYMBOL_GPL(wakeme_after_rcu); | ||
272 | 322 | ||
273 | void wait_rcu_gp(call_rcu_func_t crf) | 323 | void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, |
324 | struct rcu_synchronize *rs_array) | ||
274 | { | 325 | { |
275 | struct rcu_synchronize rcu; | 326 | int i; |
276 | 327 | ||
277 | init_rcu_head_on_stack(&rcu.head); | 328 | /* Initialize and register callbacks for each flavor specified. */ |
278 | init_completion(&rcu.completion); | 329 | for (i = 0; i < n; i++) { |
279 | /* Will wake me after RCU finished. */ | 330 | if (checktiny && |
280 | crf(&rcu.head, wakeme_after_rcu); | 331 | (crcu_array[i] == call_rcu || |
281 | /* Wait for it. */ | 332 | crcu_array[i] == call_rcu_bh)) { |
282 | wait_for_completion(&rcu.completion); | 333 | might_sleep(); |
283 | destroy_rcu_head_on_stack(&rcu.head); | 334 | continue; |
335 | } | ||
336 | init_rcu_head_on_stack(&rs_array[i].head); | ||
337 | init_completion(&rs_array[i].completion); | ||
338 | (crcu_array[i])(&rs_array[i].head, wakeme_after_rcu); | ||
339 | } | ||
340 | |||
341 | /* Wait for all callbacks to be invoked. */ | ||
342 | for (i = 0; i < n; i++) { | ||
343 | if (checktiny && | ||
344 | (crcu_array[i] == call_rcu || | ||
345 | crcu_array[i] == call_rcu_bh)) | ||
346 | continue; | ||
347 | wait_for_completion(&rs_array[i].completion); | ||
348 | destroy_rcu_head_on_stack(&rs_array[i].head); | ||
349 | } | ||
284 | } | 350 | } |
285 | EXPORT_SYMBOL_GPL(wait_rcu_gp); | 351 | EXPORT_SYMBOL_GPL(__wait_rcu_gp); |
286 | 352 | ||
287 | #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD | 353 | #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD |
288 | void init_rcu_head(struct rcu_head *head) | 354 | void init_rcu_head(struct rcu_head *head) |
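The rework above generalises the single-flavor wait: the caller hands __wait_rcu_gp() parallel arrays of call_rcu()-style functions and rcu_synchronize slots, all callbacks are queued first, and only then does the caller block on each completion, so the grace periods run concurrently rather than back to back. The kernel builds thin wrappers on top of the new export; the function below is just a sketch of how a one-flavor wait could be expressed, not the kernel's actual wrapper.

/* Sketch: wait for a single flavor's grace period via the new helper. */
static void wait_one_gp(call_rcu_func_t crf)
{
        struct rcu_synchronize rs;

        __wait_rcu_gp(false, 1, &crf, &rs);   /* false: no Tiny-RCU shortcut */
}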
@@ -523,8 +589,8 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks); | |||
523 | void synchronize_rcu_tasks(void) | 589 | void synchronize_rcu_tasks(void) |
524 | { | 590 | { |
525 | /* Complain if the scheduler has not started. */ | 591 | /* Complain if the scheduler has not started. */ |
526 | rcu_lockdep_assert(!rcu_scheduler_active, | 592 | RCU_LOCKDEP_WARN(!rcu_scheduler_active, |
527 | "synchronize_rcu_tasks called too soon"); | 593 | "synchronize_rcu_tasks called too soon"); |
528 | 594 | ||
529 | /* Wait for the grace period. */ | 595 | /* Wait for the grace period. */ |
530 | wait_rcu_gp(call_rcu_tasks); | 596 | wait_rcu_gp(call_rcu_tasks); |
diff --git a/kernel/reboot.c b/kernel/reboot.c index d20c85d9f8c0..bd30a973fe94 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c | |||
@@ -346,7 +346,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, | |||
346 | kernel_restart(buffer); | 346 | kernel_restart(buffer); |
347 | break; | 347 | break; |
348 | 348 | ||
349 | #ifdef CONFIG_KEXEC | 349 | #ifdef CONFIG_KEXEC_CORE |
350 | case LINUX_REBOOT_CMD_KEXEC: | 350 | case LINUX_REBOOT_CMD_KEXEC: |
351 | ret = kernel_kexec(); | 351 | ret = kernel_kexec(); |
352 | break; | 352 | break; |
diff --git a/kernel/resource.c b/kernel/resource.c index 90552aab5f2d..f150dbbe6f62 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
@@ -492,40 +492,51 @@ int __weak page_is_ram(unsigned long pfn) | |||
492 | } | 492 | } |
493 | EXPORT_SYMBOL_GPL(page_is_ram); | 493 | EXPORT_SYMBOL_GPL(page_is_ram); |
494 | 494 | ||
495 | /* | 495 | /** |
496 | * Search for a resouce entry that fully contains the specified region. | 496 | * region_intersects() - determine intersection of region with known resources |
497 | * If found, return 1 if it is RAM, 0 if not. | 497 | * @start: region start address |
498 | * If not found, or region is not fully contained, return -1 | 498 | * @size: size of region |
499 | * @name: name of resource (in iomem_resource) | ||
499 | * | 500 | * |
500 | * Used by the ioremap functions to ensure the user is not remapping RAM and is | 501 | * Check if the specified region partially overlaps or fully eclipses a |
501 | * a vast speed up over walking through the resource table page by page. | 502 | * resource identified by @name. Return REGION_DISJOINT if the region |
503 | * does not overlap @name, return REGION_MIXED if the region overlaps | ||
504 | * @type and another resource, and return REGION_INTERSECTS if the | ||
505 | * region overlaps @type and no other defined resource. Note, that | ||
506 | * REGION_INTERSECTS is also returned in the case when the specified | ||
507 | * region overlaps RAM and undefined memory holes. | ||
508 | * | ||
509 | * region_intersects() is used by memory remapping functions to ensure | ||
510 | * the user is not remapping RAM and is a vast speed up over walking | ||
511 | * through the resource table page by page. | ||
502 | */ | 512 | */ |
503 | int region_is_ram(resource_size_t start, unsigned long size) | 513 | int region_intersects(resource_size_t start, size_t size, const char *name) |
504 | { | 514 | { |
505 | struct resource *p; | 515 | unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY; |
506 | resource_size_t end = start + size - 1; | 516 | resource_size_t end = start + size - 1; |
507 | int flags = IORESOURCE_MEM | IORESOURCE_BUSY; | 517 | int type = 0; int other = 0; |
508 | const char *name = "System RAM"; | 518 | struct resource *p; |
509 | int ret = -1; | ||
510 | 519 | ||
511 | read_lock(&resource_lock); | 520 | read_lock(&resource_lock); |
512 | for (p = iomem_resource.child; p ; p = p->sibling) { | 521 | for (p = iomem_resource.child; p ; p = p->sibling) { |
513 | if (end < p->start) | 522 | bool is_type = strcmp(p->name, name) == 0 && p->flags == flags; |
514 | continue; | 523 | |
515 | 524 | if (start >= p->start && start <= p->end) | |
516 | if (p->start <= start && end <= p->end) { | 525 | is_type ? type++ : other++; |
517 | /* resource fully contains region */ | 526 | if (end >= p->start && end <= p->end) |
518 | if ((p->flags != flags) || strcmp(p->name, name)) | 527 | is_type ? type++ : other++; |
519 | ret = 0; | 528 | if (p->start >= start && p->end <= end) |
520 | else | 529 | is_type ? type++ : other++; |
521 | ret = 1; | ||
522 | break; | ||
523 | } | ||
524 | if (p->end < start) | ||
525 | break; /* not found */ | ||
526 | } | 530 | } |
527 | read_unlock(&resource_lock); | 531 | read_unlock(&resource_lock); |
528 | return ret; | 532 | |
533 | if (other == 0) | ||
534 | return type ? REGION_INTERSECTS : REGION_DISJOINT; | ||
535 | |||
536 | if (type) | ||
537 | return REGION_MIXED; | ||
538 | |||
539 | return REGION_DISJOINT; | ||
529 | } | 540 | } |
530 | 541 | ||
531 | void __weak arch_remove_reservations(struct resource *avail) | 542 | void __weak arch_remove_reservations(struct resource *avail) |
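region_is_ram()'s 1/0/-1 result is replaced by the three-way classification documented above, which also covers ranges that only partially overlap a resource. A hypothetical caller that refuses to remap anything touching RAM might look like this sketch; the helper name is invented, while region_intersects() and the REGION_* values come from this patch.

/* Hypothetical policy check: accept only ranges entirely outside "System RAM". */
static bool range_safe_to_remap(resource_size_t start, size_t size)
{
        int rc = region_intersects(start, size, "System RAM");

        /* REGION_INTERSECTS: fully RAM; REGION_MIXED: partially RAM. */
        return rc == REGION_DISJOINT;
}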
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 78b4bad10081..3595403921bd 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -164,14 +164,12 @@ struct static_key sched_feat_keys[__SCHED_FEAT_NR] = { | |||
164 | 164 | ||
165 | static void sched_feat_disable(int i) | 165 | static void sched_feat_disable(int i) |
166 | { | 166 | { |
167 | if (static_key_enabled(&sched_feat_keys[i])) | 167 | static_key_disable(&sched_feat_keys[i]); |
168 | static_key_slow_dec(&sched_feat_keys[i]); | ||
169 | } | 168 | } |
170 | 169 | ||
171 | static void sched_feat_enable(int i) | 170 | static void sched_feat_enable(int i) |
172 | { | 171 | { |
173 | if (!static_key_enabled(&sched_feat_keys[i])) | 172 | static_key_enable(&sched_feat_keys[i]); |
174 | static_key_slow_inc(&sched_feat_keys[i]); | ||
175 | } | 173 | } |
176 | #else | 174 | #else |
177 | static void sched_feat_disable(int i) { }; | 175 | static void sched_feat_disable(int i) { }; |
@@ -1151,15 +1149,45 @@ static int migration_cpu_stop(void *data) | |||
1151 | return 0; | 1149 | return 0; |
1152 | } | 1150 | } |
1153 | 1151 | ||
1154 | void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) | 1152 | /* |
1153 | * sched_class::set_cpus_allowed must do the below, but is not required to | ||
1154 | * actually call this function. | ||
1155 | */ | ||
1156 | void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) | ||
1155 | { | 1157 | { |
1156 | if (p->sched_class->set_cpus_allowed) | ||
1157 | p->sched_class->set_cpus_allowed(p, new_mask); | ||
1158 | |||
1159 | cpumask_copy(&p->cpus_allowed, new_mask); | 1158 | cpumask_copy(&p->cpus_allowed, new_mask); |
1160 | p->nr_cpus_allowed = cpumask_weight(new_mask); | 1159 | p->nr_cpus_allowed = cpumask_weight(new_mask); |
1161 | } | 1160 | } |
1162 | 1161 | ||
1162 | void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) | ||
1163 | { | ||
1164 | struct rq *rq = task_rq(p); | ||
1165 | bool queued, running; | ||
1166 | |||
1167 | lockdep_assert_held(&p->pi_lock); | ||
1168 | |||
1169 | queued = task_on_rq_queued(p); | ||
1170 | running = task_current(rq, p); | ||
1171 | |||
1172 | if (queued) { | ||
1173 | /* | ||
1174 | * Because __kthread_bind() calls this on blocked tasks without | ||
1175 | * holding rq->lock. | ||
1176 | */ | ||
1177 | lockdep_assert_held(&rq->lock); | ||
1178 | dequeue_task(rq, p, 0); | ||
1179 | } | ||
1180 | if (running) | ||
1181 | put_prev_task(rq, p); | ||
1182 | |||
1183 | p->sched_class->set_cpus_allowed(p, new_mask); | ||
1184 | |||
1185 | if (running) | ||
1186 | p->sched_class->set_curr_task(rq); | ||
1187 | if (queued) | ||
1188 | enqueue_task(rq, p, 0); | ||
1189 | } | ||
1190 | |||
1163 | /* | 1191 | /* |
1164 | * Change a given task's CPU affinity. Migrate the thread to a | 1192 | * Change a given task's CPU affinity. Migrate the thread to a |
1165 | * proper CPU and schedule it away if the CPU it's executing on | 1193 | * proper CPU and schedule it away if the CPU it's executing on |
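do_set_cpus_allowed() now owns the dequeue / put_prev_task, class hook, set_curr_task / enqueue sequence, while the plain mask bookkeeping moves into set_cpus_allowed_common(). A scheduling class with no affinity state of its own can therefore keep its ->set_cpus_allowed hook minimal, roughly as in this sketch (the class-specific part is hypothetical; the deadline class further down ends up doing essentially this after its root-domain accounting).

/* Sketch of a minimal sched_class ->set_cpus_allowed implementation. */
static void set_cpus_allowed_example(struct task_struct *p,
                                     const struct cpumask *new_mask)
{
        /* ...any class-specific accounting would go here... */
        set_cpus_allowed_common(p, new_mask);  /* copy mask, update nr_cpus_allowed */
}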
@@ -1169,7 +1197,8 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) | |||
1169 | * task must not exit() & deallocate itself prematurely. The | 1197 | * task must not exit() & deallocate itself prematurely. The |
1170 | * call is not atomic; no spinlocks may be held. | 1198 | * call is not atomic; no spinlocks may be held. |
1171 | */ | 1199 | */ |
1172 | int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) | 1200 | static int __set_cpus_allowed_ptr(struct task_struct *p, |
1201 | const struct cpumask *new_mask, bool check) | ||
1173 | { | 1202 | { |
1174 | unsigned long flags; | 1203 | unsigned long flags; |
1175 | struct rq *rq; | 1204 | struct rq *rq; |
@@ -1178,6 +1207,15 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) | |||
1178 | 1207 | ||
1179 | rq = task_rq_lock(p, &flags); | 1208 | rq = task_rq_lock(p, &flags); |
1180 | 1209 | ||
1210 | /* | ||
1211 | * Must re-check here, to close a race against __kthread_bind(), | ||
1212 | * sched_setaffinity() is not guaranteed to observe the flag. | ||
1213 | */ | ||
1214 | if (check && (p->flags & PF_NO_SETAFFINITY)) { | ||
1215 | ret = -EINVAL; | ||
1216 | goto out; | ||
1217 | } | ||
1218 | |||
1181 | if (cpumask_equal(&p->cpus_allowed, new_mask)) | 1219 | if (cpumask_equal(&p->cpus_allowed, new_mask)) |
1182 | goto out; | 1220 | goto out; |
1183 | 1221 | ||
@@ -1214,6 +1252,11 @@ out: | |||
1214 | 1252 | ||
1215 | return ret; | 1253 | return ret; |
1216 | } | 1254 | } |
1255 | |||
1256 | int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) | ||
1257 | { | ||
1258 | return __set_cpus_allowed_ptr(p, new_mask, false); | ||
1259 | } | ||
1217 | EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); | 1260 | EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); |
1218 | 1261 | ||
1219 | void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | 1262 | void set_task_cpu(struct task_struct *p, unsigned int new_cpu) |
@@ -1595,6 +1638,15 @@ static void update_avg(u64 *avg, u64 sample) | |||
1595 | s64 diff = sample - *avg; | 1638 | s64 diff = sample - *avg; |
1596 | *avg += diff >> 3; | 1639 | *avg += diff >> 3; |
1597 | } | 1640 | } |
1641 | |||
1642 | #else | ||
1643 | |||
1644 | static inline int __set_cpus_allowed_ptr(struct task_struct *p, | ||
1645 | const struct cpumask *new_mask, bool check) | ||
1646 | { | ||
1647 | return set_cpus_allowed_ptr(p, new_mask); | ||
1648 | } | ||
1649 | |||
1598 | #endif /* CONFIG_SMP */ | 1650 | #endif /* CONFIG_SMP */ |
1599 | 1651 | ||
1600 | static void | 1652 | static void |
@@ -1654,9 +1706,9 @@ static void | |||
1654 | ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) | 1706 | ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) |
1655 | { | 1707 | { |
1656 | check_preempt_curr(rq, p, wake_flags); | 1708 | check_preempt_curr(rq, p, wake_flags); |
1657 | trace_sched_wakeup(p, true); | ||
1658 | |||
1659 | p->state = TASK_RUNNING; | 1709 | p->state = TASK_RUNNING; |
1710 | trace_sched_wakeup(p); | ||
1711 | |||
1660 | #ifdef CONFIG_SMP | 1712 | #ifdef CONFIG_SMP |
1661 | if (p->sched_class->task_woken) { | 1713 | if (p->sched_class->task_woken) { |
1662 | /* | 1714 | /* |
@@ -1874,6 +1926,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) | |||
1874 | if (!(p->state & state)) | 1926 | if (!(p->state & state)) |
1875 | goto out; | 1927 | goto out; |
1876 | 1928 | ||
1929 | trace_sched_waking(p); | ||
1930 | |||
1877 | success = 1; /* we're going to change ->state */ | 1931 | success = 1; /* we're going to change ->state */ |
1878 | cpu = task_cpu(p); | 1932 | cpu = task_cpu(p); |
1879 | 1933 | ||
@@ -1949,6 +2003,8 @@ static void try_to_wake_up_local(struct task_struct *p) | |||
1949 | if (!(p->state & TASK_NORMAL)) | 2003 | if (!(p->state & TASK_NORMAL)) |
1950 | goto out; | 2004 | goto out; |
1951 | 2005 | ||
2006 | trace_sched_waking(p); | ||
2007 | |||
1952 | if (!task_on_rq_queued(p)) | 2008 | if (!task_on_rq_queued(p)) |
1953 | ttwu_activate(rq, p, ENQUEUE_WAKEUP); | 2009 | ttwu_activate(rq, p, ENQUEUE_WAKEUP); |
1954 | 2010 | ||
@@ -2016,9 +2072,6 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
2016 | p->se.prev_sum_exec_runtime = 0; | 2072 | p->se.prev_sum_exec_runtime = 0; |
2017 | p->se.nr_migrations = 0; | 2073 | p->se.nr_migrations = 0; |
2018 | p->se.vruntime = 0; | 2074 | p->se.vruntime = 0; |
2019 | #ifdef CONFIG_SMP | ||
2020 | p->se.avg.decay_count = 0; | ||
2021 | #endif | ||
2022 | INIT_LIST_HEAD(&p->se.group_node); | 2075 | INIT_LIST_HEAD(&p->se.group_node); |
2023 | 2076 | ||
2024 | #ifdef CONFIG_SCHEDSTATS | 2077 | #ifdef CONFIG_SCHEDSTATS |
@@ -2200,8 +2253,8 @@ unsigned long to_ratio(u64 period, u64 runtime) | |||
2200 | #ifdef CONFIG_SMP | 2253 | #ifdef CONFIG_SMP |
2201 | inline struct dl_bw *dl_bw_of(int i) | 2254 | inline struct dl_bw *dl_bw_of(int i) |
2202 | { | 2255 | { |
2203 | rcu_lockdep_assert(rcu_read_lock_sched_held(), | 2256 | RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), |
2204 | "sched RCU must be held"); | 2257 | "sched RCU must be held"); |
2205 | return &cpu_rq(i)->rd->dl_bw; | 2258 | return &cpu_rq(i)->rd->dl_bw; |
2206 | } | 2259 | } |
2207 | 2260 | ||
@@ -2210,8 +2263,8 @@ static inline int dl_bw_cpus(int i) | |||
2210 | struct root_domain *rd = cpu_rq(i)->rd; | 2263 | struct root_domain *rd = cpu_rq(i)->rd; |
2211 | int cpus = 0; | 2264 | int cpus = 0; |
2212 | 2265 | ||
2213 | rcu_lockdep_assert(rcu_read_lock_sched_held(), | 2266 | RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), |
2214 | "sched RCU must be held"); | 2267 | "sched RCU must be held"); |
2215 | for_each_cpu_and(i, rd->span, cpu_active_mask) | 2268 | for_each_cpu_and(i, rd->span, cpu_active_mask) |
2216 | cpus++; | 2269 | cpus++; |
2217 | 2270 | ||
@@ -2303,11 +2356,11 @@ void wake_up_new_task(struct task_struct *p) | |||
2303 | #endif | 2356 | #endif |
2304 | 2357 | ||
2305 | /* Initialize new task's runnable average */ | 2358 | /* Initialize new task's runnable average */ |
2306 | init_task_runnable_average(p); | 2359 | init_entity_runnable_average(&p->se); |
2307 | rq = __task_rq_lock(p); | 2360 | rq = __task_rq_lock(p); |
2308 | activate_task(rq, p, 0); | 2361 | activate_task(rq, p, 0); |
2309 | p->on_rq = TASK_ON_RQ_QUEUED; | 2362 | p->on_rq = TASK_ON_RQ_QUEUED; |
2310 | trace_sched_wakeup_new(p, true); | 2363 | trace_sched_wakeup_new(p); |
2311 | check_preempt_curr(rq, p, WF_FORK); | 2364 | check_preempt_curr(rq, p, WF_FORK); |
2312 | #ifdef CONFIG_SMP | 2365 | #ifdef CONFIG_SMP |
2313 | if (p->sched_class->task_woken) | 2366 | if (p->sched_class->task_woken) |
@@ -2469,7 +2522,6 @@ static struct rq *finish_task_switch(struct task_struct *prev) | |||
2469 | */ | 2522 | */ |
2470 | prev_state = prev->state; | 2523 | prev_state = prev->state; |
2471 | vtime_task_switch(prev); | 2524 | vtime_task_switch(prev); |
2472 | finish_arch_switch(prev); | ||
2473 | perf_event_task_sched_in(prev, current); | 2525 | perf_event_task_sched_in(prev, current); |
2474 | finish_lock_switch(rq, prev); | 2526 | finish_lock_switch(rq, prev); |
2475 | finish_arch_post_lock_switch(); | 2527 | finish_arch_post_lock_switch(); |
@@ -2489,7 +2541,7 @@ static struct rq *finish_task_switch(struct task_struct *prev) | |||
2489 | put_task_struct(prev); | 2541 | put_task_struct(prev); |
2490 | } | 2542 | } |
2491 | 2543 | ||
2492 | tick_nohz_task_switch(current); | 2544 | tick_nohz_task_switch(); |
2493 | return rq; | 2545 | return rq; |
2494 | } | 2546 | } |
2495 | 2547 | ||
@@ -4340,7 +4392,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) | |||
4340 | } | 4392 | } |
4341 | #endif | 4393 | #endif |
4342 | again: | 4394 | again: |
4343 | retval = set_cpus_allowed_ptr(p, new_mask); | 4395 | retval = __set_cpus_allowed_ptr(p, new_mask, true); |
4344 | 4396 | ||
4345 | if (!retval) { | 4397 | if (!retval) { |
4346 | cpuset_cpus_allowed(p, cpus_allowed); | 4398 | cpuset_cpus_allowed(p, cpus_allowed); |
@@ -4492,7 +4544,7 @@ SYSCALL_DEFINE0(sched_yield) | |||
4492 | 4544 | ||
4493 | int __sched _cond_resched(void) | 4545 | int __sched _cond_resched(void) |
4494 | { | 4546 | { |
4495 | if (should_resched()) { | 4547 | if (should_resched(0)) { |
4496 | preempt_schedule_common(); | 4548 | preempt_schedule_common(); |
4497 | return 1; | 4549 | return 1; |
4498 | } | 4550 | } |
@@ -4510,7 +4562,7 @@ EXPORT_SYMBOL(_cond_resched); | |||
4510 | */ | 4562 | */ |
4511 | int __cond_resched_lock(spinlock_t *lock) | 4563 | int __cond_resched_lock(spinlock_t *lock) |
4512 | { | 4564 | { |
4513 | int resched = should_resched(); | 4565 | int resched = should_resched(PREEMPT_LOCK_OFFSET); |
4514 | int ret = 0; | 4566 | int ret = 0; |
4515 | 4567 | ||
4516 | lockdep_assert_held(lock); | 4568 | lockdep_assert_held(lock); |
@@ -4532,7 +4584,7 @@ int __sched __cond_resched_softirq(void) | |||
4532 | { | 4584 | { |
4533 | BUG_ON(!in_softirq()); | 4585 | BUG_ON(!in_softirq()); |
4534 | 4586 | ||
4535 | if (should_resched()) { | 4587 | if (should_resched(SOFTIRQ_DISABLE_OFFSET)) { |
4536 | local_bh_enable(); | 4588 | local_bh_enable(); |
4537 | preempt_schedule_common(); | 4589 | preempt_schedule_common(); |
4538 | local_bh_disable(); | 4590 | local_bh_disable(); |
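should_resched() now takes the preempt-count offset expected at the call site: 0 for plain _cond_resched(), PREEMPT_LOCK_OFFSET while the lock in __cond_resched_lock() is still held, and SOFTIRQ_DISABLE_OFFSET inside __cond_resched_softirq(). The check it performs, sketched below rather than quoted from the preempt headers, reschedules only when dropping exactly that context would leave preempt_count() at zero and a reschedule is pending.

/* Sketch of the test the cond_resched() family now relies on. */
static inline bool should_resched_sketch(int preempt_offset)
{
        return unlikely(preempt_count() == preempt_offset &&
                        tif_need_resched());
}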
@@ -4865,7 +4917,8 @@ void init_idle(struct task_struct *idle, int cpu) | |||
4865 | struct rq *rq = cpu_rq(cpu); | 4917 | struct rq *rq = cpu_rq(cpu); |
4866 | unsigned long flags; | 4918 | unsigned long flags; |
4867 | 4919 | ||
4868 | raw_spin_lock_irqsave(&rq->lock, flags); | 4920 | raw_spin_lock_irqsave(&idle->pi_lock, flags); |
4921 | raw_spin_lock(&rq->lock); | ||
4869 | 4922 | ||
4870 | __sched_fork(0, idle); | 4923 | __sched_fork(0, idle); |
4871 | idle->state = TASK_RUNNING; | 4924 | idle->state = TASK_RUNNING; |
@@ -4891,7 +4944,8 @@ void init_idle(struct task_struct *idle, int cpu) | |||
4891 | #if defined(CONFIG_SMP) | 4944 | #if defined(CONFIG_SMP) |
4892 | idle->on_cpu = 1; | 4945 | idle->on_cpu = 1; |
4893 | #endif | 4946 | #endif |
4894 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 4947 | raw_spin_unlock(&rq->lock); |
4948 | raw_spin_unlock_irqrestore(&idle->pi_lock, flags); | ||
4895 | 4949 | ||
4896 | /* Set the preempt count _outside_ the spinlocks! */ | 4950 | /* Set the preempt count _outside_ the spinlocks! */ |
4897 | init_idle_preempt_count(idle, cpu); | 4951 | init_idle_preempt_count(idle, cpu); |
@@ -5311,8 +5365,7 @@ static void register_sched_domain_sysctl(void) | |||
5311 | /* may be called multiple times per register */ | 5365 | /* may be called multiple times per register */ |
5312 | static void unregister_sched_domain_sysctl(void) | 5366 | static void unregister_sched_domain_sysctl(void) |
5313 | { | 5367 | { |
5314 | if (sd_sysctl_header) | 5368 | unregister_sysctl_table(sd_sysctl_header); |
5315 | unregister_sysctl_table(sd_sysctl_header); | ||
5316 | sd_sysctl_header = NULL; | 5369 | sd_sysctl_header = NULL; |
5317 | if (sd_ctl_dir[0].child) | 5370 | if (sd_ctl_dir[0].child) |
5318 | sd_free_ctl_entry(&sd_ctl_dir[0].child); | 5371 | sd_free_ctl_entry(&sd_ctl_dir[0].child); |
@@ -5433,6 +5486,14 @@ static int sched_cpu_active(struct notifier_block *nfb, | |||
5433 | case CPU_STARTING: | 5486 | case CPU_STARTING: |
5434 | set_cpu_rq_start_time(); | 5487 | set_cpu_rq_start_time(); |
5435 | return NOTIFY_OK; | 5488 | return NOTIFY_OK; |
5489 | case CPU_ONLINE: | ||
5490 | /* | ||
5491 | * At this point a starting CPU has marked itself as online via | ||
5492 | * set_cpu_online(). But it might not yet have marked itself | ||
5493 | * as active, which is essential from here on. | ||
5494 | * | ||
5495 | * Thus, fall-through and help the starting CPU along. | ||
5496 | */ | ||
5436 | case CPU_DOWN_FAILED: | 5497 | case CPU_DOWN_FAILED: |
5437 | set_cpu_active((long)hcpu, true); | 5498 | set_cpu_active((long)hcpu, true); |
5438 | return NOTIFY_OK; | 5499 | return NOTIFY_OK; |
@@ -6445,8 +6506,10 @@ static void init_numa_topology_type(void) | |||
6445 | 6506 | ||
6446 | n = sched_max_numa_distance; | 6507 | n = sched_max_numa_distance; |
6447 | 6508 | ||
6448 | if (n <= 1) | 6509 | if (sched_domains_numa_levels <= 1) { |
6449 | sched_numa_topology_type = NUMA_DIRECT; | 6510 | sched_numa_topology_type = NUMA_DIRECT; |
6511 | return; | ||
6512 | } | ||
6450 | 6513 | ||
6451 | for_each_online_node(a) { | 6514 | for_each_online_node(a) { |
6452 | for_each_online_node(b) { | 6515 | for_each_online_node(b) { |
@@ -8068,7 +8131,7 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) | |||
8068 | sched_offline_group(tg); | 8131 | sched_offline_group(tg); |
8069 | } | 8132 | } |
8070 | 8133 | ||
8071 | static void cpu_cgroup_fork(struct task_struct *task) | 8134 | static void cpu_cgroup_fork(struct task_struct *task, void *private) |
8072 | { | 8135 | { |
8073 | sched_move_task(task); | 8136 | sched_move_task(task); |
8074 | } | 8137 | } |
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index f5a64ffad176..8cbc3db671df 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c | |||
@@ -555,48 +555,43 @@ drop_precision: | |||
555 | } | 555 | } |
556 | 556 | ||
557 | /* | 557 | /* |
558 | * Atomically advance counter to the new value. Interrupts, vcpu | 558 | * Adjust tick based cputime random precision against scheduler runtime |
559 | * scheduling, and scaling inaccuracies can cause cputime_advance | 559 | * accounting. |
560 | * to be occasionally called with a new value smaller than counter. | ||
561 | * Let's enforce atomicity. | ||
562 | * | 560 | * |
563 | * Normally a caller will only go through this loop once, or not | 561 | * Tick based cputime accounting depend on random scheduling timeslices of a |
564 | * at all in case a previous caller updated counter the same jiffy. | 562 | * task to be interrupted or not by the timer. Depending on these |
565 | */ | 563 | * circumstances, the number of these interrupts may be over or |
566 | static void cputime_advance(cputime_t *counter, cputime_t new) | 564 | * under-optimistic, matching the real user and system cputime with a variable |
567 | { | 565 | * precision. |
568 | cputime_t old; | 566 | * |
569 | 567 | * Fix this by scaling these tick based values against the total runtime | |
570 | while (new > (old = READ_ONCE(*counter))) | 568 | * accounted by the CFS scheduler. |
571 | cmpxchg_cputime(counter, old, new); | 569 | * |
572 | } | 570 | * This code provides the following guarantees: |
573 | 571 | * | |
574 | /* | 572 | * stime + utime == rtime |
575 | * Adjust tick based cputime random precision against scheduler | 573 | * stime_i+1 >= stime_i, utime_i+1 >= utime_i |
576 | * runtime accounting. | 574 | * |
575 | * Assuming that rtime_i+1 >= rtime_i. | ||
577 | */ | 576 | */ |
578 | static void cputime_adjust(struct task_cputime *curr, | 577 | static void cputime_adjust(struct task_cputime *curr, |
579 | struct cputime *prev, | 578 | struct prev_cputime *prev, |
580 | cputime_t *ut, cputime_t *st) | 579 | cputime_t *ut, cputime_t *st) |
581 | { | 580 | { |
582 | cputime_t rtime, stime, utime; | 581 | cputime_t rtime, stime, utime; |
582 | unsigned long flags; | ||
583 | 583 | ||
584 | /* | 584 | /* Serialize concurrent callers such that we can honour our guarantees */ |
585 | * Tick based cputime accounting depend on random scheduling | 585 | raw_spin_lock_irqsave(&prev->lock, flags); |
586 | * timeslices of a task to be interrupted or not by the timer. | ||
587 | * Depending on these circumstances, the number of these interrupts | ||
588 | * may be over or under-optimistic, matching the real user and system | ||
589 | * cputime with a variable precision. | ||
590 | * | ||
591 | * Fix this by scaling these tick based values against the total | ||
592 | * runtime accounted by the CFS scheduler. | ||
593 | */ | ||
594 | rtime = nsecs_to_cputime(curr->sum_exec_runtime); | 586 | rtime = nsecs_to_cputime(curr->sum_exec_runtime); |
595 | 587 | ||
596 | /* | 588 | /* |
597 | * Update userspace visible utime/stime values only if actual execution | 589 | * This is possible under two circumstances: |
598 | * time is bigger than already exported. Note that can happen, that we | 590 | * - rtime isn't monotonic after all (a bug); |
599 | * provided bigger values due to scaling inaccuracy on big numbers. | 591 | * - we got reordered by the lock. |
592 | * | ||
593 | * In both cases this acts as a filter such that the rest of the code | ||
594 | * can assume it is monotonic regardless of anything else. | ||
600 | */ | 595 | */ |
601 | if (prev->stime + prev->utime >= rtime) | 596 | if (prev->stime + prev->utime >= rtime) |
602 | goto out; | 597 | goto out; |
@@ -606,22 +601,46 @@ static void cputime_adjust(struct task_cputime *curr, | |||
606 | 601 | ||
607 | if (utime == 0) { | 602 | if (utime == 0) { |
608 | stime = rtime; | 603 | stime = rtime; |
609 | } else if (stime == 0) { | 604 | goto update; |
610 | utime = rtime; | 605 | } |
611 | } else { | ||
612 | cputime_t total = stime + utime; | ||
613 | 606 | ||
614 | stime = scale_stime((__force u64)stime, | 607 | if (stime == 0) { |
615 | (__force u64)rtime, (__force u64)total); | 608 | utime = rtime; |
616 | utime = rtime - stime; | 609 | goto update; |
617 | } | 610 | } |
618 | 611 | ||
619 | cputime_advance(&prev->stime, stime); | 612 | stime = scale_stime((__force u64)stime, (__force u64)rtime, |
620 | cputime_advance(&prev->utime, utime); | 613 | (__force u64)(stime + utime)); |
614 | |||
615 | /* | ||
616 | * Make sure stime doesn't go backwards; this preserves monotonicity | ||
617 | * for utime because rtime is monotonic. | ||
618 | * | ||
619 | * utime_i+1 = rtime_i+1 - stime_i | ||
620 | * = rtime_i+1 - (rtime_i - utime_i) | ||
621 | * = (rtime_i+1 - rtime_i) + utime_i | ||
622 | * >= utime_i | ||
623 | */ | ||
624 | if (stime < prev->stime) | ||
625 | stime = prev->stime; | ||
626 | utime = rtime - stime; | ||
627 | |||
628 | /* | ||
629 | * Make sure utime doesn't go backwards; this still preserves | ||
630 | * monotonicity for stime, analogous argument to above. | ||
631 | */ | ||
632 | if (utime < prev->utime) { | ||
633 | utime = prev->utime; | ||
634 | stime = rtime - utime; | ||
635 | } | ||
621 | 636 | ||
637 | update: | ||
638 | prev->stime = stime; | ||
639 | prev->utime = utime; | ||
622 | out: | 640 | out: |
623 | *ut = prev->utime; | 641 | *ut = prev->utime; |
624 | *st = prev->stime; | 642 | *st = prev->stime; |
643 | raw_spin_unlock_irqrestore(&prev->lock, flags); | ||
625 | } | 644 | } |
626 | 645 | ||
627 | void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) | 646 | void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) |
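The rewritten cputime_adjust() serialises callers with prev->lock, scales the tick-based stime against the CFS runtime, and then clamps so that neither stime nor utime ever decreases while stime + utime == rtime holds. The stand-alone program below replays that clamping with made-up numbers and a deliberately simplified scale_stime; it is a demonstration, not kernel code.

#include <stdio.h>

typedef unsigned long long cputime_t;

static cputime_t scale_stime_demo(cputime_t stime, cputime_t rtime,
                                  cputime_t total)
{
        return total ? (stime * rtime) / total : rtime;  /* overflow handling omitted */
}

int main(void)
{
        cputime_t prev_stime = 60, prev_utime = 40;      /* previous snapshot  */
        cputime_t stime = 58, utime = 46, rtime = 110;   /* new tick samples   */

        stime = scale_stime_demo(stime, rtime, stime + utime);
        if (stime < prev_stime)          /* keep stime monotonic */
                stime = prev_stime;
        utime = rtime - stime;
        if (utime < prev_utime) {        /* keep utime monotonic, too */
                utime = prev_utime;
                stime = rtime - utime;
        }
        printf("stime=%llu utime=%llu sum=%llu (== rtime)\n",
               stime, utime, stime + utime);
        return 0;
}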
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 0a17af35670a..fc8f01083527 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c | |||
@@ -953,7 +953,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) | |||
953 | 953 | ||
954 | /* | 954 | /* |
955 | * Use the scheduling parameters of the top pi-waiter | 955 | * Use the scheduling parameters of the top pi-waiter |
956 | * task if we have one and its (relative) deadline is | 956 | * task if we have one and its (absolute) deadline is |
957 | * smaller than our one... OTW we keep our runtime and | 957 | * smaller than our one... OTW we keep our runtime and |
958 | * deadline. | 958 | * deadline. |
959 | */ | 959 | */ |
@@ -1563,7 +1563,7 @@ out: | |||
1563 | 1563 | ||
1564 | static void push_dl_tasks(struct rq *rq) | 1564 | static void push_dl_tasks(struct rq *rq) |
1565 | { | 1565 | { |
1566 | /* Terminates as it moves a -deadline task */ | 1566 | /* push_dl_task() will return true if it moved a -deadline task */ |
1567 | while (push_dl_task(rq)) | 1567 | while (push_dl_task(rq)) |
1568 | ; | 1568 | ; |
1569 | } | 1569 | } |
@@ -1657,7 +1657,6 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p) | |||
1657 | { | 1657 | { |
1658 | if (!task_running(rq, p) && | 1658 | if (!task_running(rq, p) && |
1659 | !test_tsk_need_resched(rq->curr) && | 1659 | !test_tsk_need_resched(rq->curr) && |
1660 | has_pushable_dl_tasks(rq) && | ||
1661 | p->nr_cpus_allowed > 1 && | 1660 | p->nr_cpus_allowed > 1 && |
1662 | dl_task(rq->curr) && | 1661 | dl_task(rq->curr) && |
1663 | (rq->curr->nr_cpus_allowed < 2 || | 1662 | (rq->curr->nr_cpus_allowed < 2 || |
@@ -1669,9 +1668,8 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p) | |||
1669 | static void set_cpus_allowed_dl(struct task_struct *p, | 1668 | static void set_cpus_allowed_dl(struct task_struct *p, |
1670 | const struct cpumask *new_mask) | 1669 | const struct cpumask *new_mask) |
1671 | { | 1670 | { |
1672 | struct rq *rq; | ||
1673 | struct root_domain *src_rd; | 1671 | struct root_domain *src_rd; |
1674 | int weight; | 1672 | struct rq *rq; |
1675 | 1673 | ||
1676 | BUG_ON(!dl_task(p)); | 1674 | BUG_ON(!dl_task(p)); |
1677 | 1675 | ||
@@ -1697,37 +1695,7 @@ static void set_cpus_allowed_dl(struct task_struct *p, | |||
1697 | raw_spin_unlock(&src_dl_b->lock); | 1695 | raw_spin_unlock(&src_dl_b->lock); |
1698 | } | 1696 | } |
1699 | 1697 | ||
1700 | /* | 1698 | set_cpus_allowed_common(p, new_mask); |
1701 | * Update only if the task is actually running (i.e., | ||
1702 | * it is on the rq AND it is not throttled). | ||
1703 | */ | ||
1704 | if (!on_dl_rq(&p->dl)) | ||
1705 | return; | ||
1706 | |||
1707 | weight = cpumask_weight(new_mask); | ||
1708 | |||
1709 | /* | ||
1710 | * Only update if the process changes its state from whether it | ||
1711 | * can migrate or not. | ||
1712 | */ | ||
1713 | if ((p->nr_cpus_allowed > 1) == (weight > 1)) | ||
1714 | return; | ||
1715 | |||
1716 | /* | ||
1717 | * The process used to be able to migrate OR it can now migrate | ||
1718 | */ | ||
1719 | if (weight <= 1) { | ||
1720 | if (!task_current(rq, p)) | ||
1721 | dequeue_pushable_dl_task(rq, p); | ||
1722 | BUG_ON(!rq->dl.dl_nr_migratory); | ||
1723 | rq->dl.dl_nr_migratory--; | ||
1724 | } else { | ||
1725 | if (!task_current(rq, p)) | ||
1726 | enqueue_pushable_dl_task(rq, p); | ||
1727 | rq->dl.dl_nr_migratory++; | ||
1728 | } | ||
1729 | |||
1730 | update_dl_migration(&rq->dl); | ||
1731 | } | 1699 | } |
1732 | 1700 | ||
1733 | /* Assumes rq->lock is held */ | 1701 | /* Assumes rq->lock is held */ |
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 4222ec50ab88..641511771ae6 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c | |||
@@ -68,13 +68,8 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group | |||
68 | #define PN(F) \ | 68 | #define PN(F) \ |
69 | SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) | 69 | SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) |
70 | 70 | ||
71 | if (!se) { | 71 | if (!se) |
72 | struct sched_avg *avg = &cpu_rq(cpu)->avg; | ||
73 | P(avg->runnable_avg_sum); | ||
74 | P(avg->avg_period); | ||
75 | return; | 72 | return; |
76 | } | ||
77 | |||
78 | 73 | ||
79 | PN(se->exec_start); | 74 | PN(se->exec_start); |
80 | PN(se->vruntime); | 75 | PN(se->vruntime); |
@@ -93,12 +88,8 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group | |||
93 | #endif | 88 | #endif |
94 | P(se->load.weight); | 89 | P(se->load.weight); |
95 | #ifdef CONFIG_SMP | 90 | #ifdef CONFIG_SMP |
96 | P(se->avg.runnable_avg_sum); | 91 | P(se->avg.load_avg); |
97 | P(se->avg.running_avg_sum); | 92 | P(se->avg.util_avg); |
98 | P(se->avg.avg_period); | ||
99 | P(se->avg.load_avg_contrib); | ||
100 | P(se->avg.utilization_avg_contrib); | ||
101 | P(se->avg.decay_count); | ||
102 | #endif | 93 | #endif |
103 | #undef PN | 94 | #undef PN |
104 | #undef P | 95 | #undef P |
@@ -214,21 +205,21 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
214 | SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); | 205 | SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); |
215 | SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); | 206 | SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); |
216 | #ifdef CONFIG_SMP | 207 | #ifdef CONFIG_SMP |
217 | SEQ_printf(m, " .%-30s: %ld\n", "runnable_load_avg", | 208 | SEQ_printf(m, " .%-30s: %lu\n", "load_avg", |
209 | cfs_rq->avg.load_avg); | ||
210 | SEQ_printf(m, " .%-30s: %lu\n", "runnable_load_avg", | ||
218 | cfs_rq->runnable_load_avg); | 211 | cfs_rq->runnable_load_avg); |
219 | SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg", | 212 | SEQ_printf(m, " .%-30s: %lu\n", "util_avg", |
220 | cfs_rq->blocked_load_avg); | 213 | cfs_rq->avg.util_avg); |
221 | SEQ_printf(m, " .%-30s: %ld\n", "utilization_load_avg", | 214 | SEQ_printf(m, " .%-30s: %ld\n", "removed_load_avg", |
222 | cfs_rq->utilization_load_avg); | 215 | atomic_long_read(&cfs_rq->removed_load_avg)); |
216 | SEQ_printf(m, " .%-30s: %ld\n", "removed_util_avg", | ||
217 | atomic_long_read(&cfs_rq->removed_util_avg)); | ||
223 | #ifdef CONFIG_FAIR_GROUP_SCHED | 218 | #ifdef CONFIG_FAIR_GROUP_SCHED |
224 | SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib", | 219 | SEQ_printf(m, " .%-30s: %lu\n", "tg_load_avg_contrib", |
225 | cfs_rq->tg_load_contrib); | 220 | cfs_rq->tg_load_avg_contrib); |
226 | SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib", | ||
227 | cfs_rq->tg_runnable_contrib); | ||
228 | SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg", | 221 | SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg", |
229 | atomic_long_read(&cfs_rq->tg->load_avg)); | 222 | atomic_long_read(&cfs_rq->tg->load_avg)); |
230 | SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg", | ||
231 | atomic_read(&cfs_rq->tg->runnable_avg)); | ||
232 | #endif | 223 | #endif |
233 | #endif | 224 | #endif |
234 | #ifdef CONFIG_CFS_BANDWIDTH | 225 | #ifdef CONFIG_CFS_BANDWIDTH |
@@ -636,12 +627,11 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
636 | 627 | ||
637 | P(se.load.weight); | 628 | P(se.load.weight); |
638 | #ifdef CONFIG_SMP | 629 | #ifdef CONFIG_SMP |
639 | P(se.avg.runnable_avg_sum); | 630 | P(se.avg.load_sum); |
640 | P(se.avg.running_avg_sum); | 631 | P(se.avg.util_sum); |
641 | P(se.avg.avg_period); | 632 | P(se.avg.load_avg); |
642 | P(se.avg.load_avg_contrib); | 633 | P(se.avg.util_avg); |
643 | P(se.avg.utilization_avg_contrib); | 634 | P(se.avg.last_update_time); |
644 | P(se.avg.decay_count); | ||
645 | #endif | 635 | #endif |
646 | P(policy); | 636 | P(policy); |
647 | P(prio); | 637 | P(prio); |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 65c8f3ebdc3c..6e2e3483b1ec 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -283,9 +283,6 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) | |||
283 | return grp->my_q; | 283 | return grp->my_q; |
284 | } | 284 | } |
285 | 285 | ||
286 | static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, | ||
287 | int force_update); | ||
288 | |||
289 | static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) | 286 | static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) |
290 | { | 287 | { |
291 | if (!cfs_rq->on_list) { | 288 | if (!cfs_rq->on_list) { |
@@ -305,8 +302,6 @@ static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) | |||
305 | } | 302 | } |
306 | 303 | ||
307 | cfs_rq->on_list = 1; | 304 | cfs_rq->on_list = 1; |
308 | /* We should have no load, but we need to update last_decay. */ | ||
309 | update_cfs_rq_blocked_load(cfs_rq, 0); | ||
310 | } | 305 | } |
311 | } | 306 | } |
312 | 307 | ||
@@ -616,15 +611,10 @@ static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se) | |||
616 | */ | 611 | */ |
617 | static u64 __sched_period(unsigned long nr_running) | 612 | static u64 __sched_period(unsigned long nr_running) |
618 | { | 613 | { |
619 | u64 period = sysctl_sched_latency; | 614 | if (unlikely(nr_running > sched_nr_latency)) |
620 | unsigned long nr_latency = sched_nr_latency; | 615 | return nr_running * sysctl_sched_min_granularity; |
621 | 616 | else | |
622 | if (unlikely(nr_running > nr_latency)) { | 617 | return sysctl_sched_latency; |
623 | period = sysctl_sched_min_granularity; | ||
624 | period *= nr_running; | ||
625 | } | ||
626 | |||
627 | return period; | ||
628 | } | 618 | } |
629 | 619 | ||
630 | /* | 620 | /* |
@@ -669,22 +659,37 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
669 | static int select_idle_sibling(struct task_struct *p, int cpu); | 659 | static int select_idle_sibling(struct task_struct *p, int cpu); |
670 | static unsigned long task_h_load(struct task_struct *p); | 660 | static unsigned long task_h_load(struct task_struct *p); |
671 | 661 | ||
672 | static inline void __update_task_entity_contrib(struct sched_entity *se); | 662 | /* |
673 | static inline void __update_task_entity_utilization(struct sched_entity *se); | 663 | * We choose a half-life close to 1 scheduling period. |
664 | * Note: The tables below are dependent on this value. | ||
665 | */ | ||
666 | #define LOAD_AVG_PERIOD 32 | ||
667 | #define LOAD_AVG_MAX 47742 /* maximum possible load avg */ | ||
668 | #define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */ | ||
674 | 669 | ||
675 | /* Give new task start runnable values to heavy its load in infant time */ | 670 | /* Give new sched_entity start runnable values to heavy its load in infant time */ |
676 | void init_task_runnable_average(struct task_struct *p) | 671 | void init_entity_runnable_average(struct sched_entity *se) |
677 | { | 672 | { |
678 | u32 slice; | 673 | struct sched_avg *sa = &se->avg; |
679 | 674 | ||
680 | slice = sched_slice(task_cfs_rq(p), &p->se) >> 10; | 675 | sa->last_update_time = 0; |
681 | p->se.avg.runnable_avg_sum = p->se.avg.running_avg_sum = slice; | 676 | /* |
682 | p->se.avg.avg_period = slice; | 677 | * sched_avg's period_contrib should be strictly less than 1024, so |
683 | __update_task_entity_contrib(&p->se); | 678 | * we give it 1023 to make sure it is almost a period (1024us), and |
684 | __update_task_entity_utilization(&p->se); | 679 | * will definitely be updated (after enqueue). |
680 | */ | ||
681 | sa->period_contrib = 1023; | ||
682 | sa->load_avg = scale_load_down(se->load.weight); | ||
683 | sa->load_sum = sa->load_avg * LOAD_AVG_MAX; | ||
684 | sa->util_avg = scale_load_down(SCHED_LOAD_SCALE); | ||
685 | sa->util_sum = LOAD_AVG_MAX; | ||
686 | /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ | ||
685 | } | 687 | } |
688 | |||
689 | static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq); | ||
690 | static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq); | ||
686 | #else | 691 | #else |
687 | void init_task_runnable_average(struct task_struct *p) | 692 | void init_entity_runnable_average(struct sched_entity *se) |
688 | { | 693 | { |
689 | } | 694 | } |
690 | #endif | 695 | #endif |
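init_entity_runnable_average() seeds load_sum with load_avg * LOAD_AVG_MAX, where LOAD_AVG_MAX (47742) is the saturation value of a 1024-per-period contribution decayed by y each period, with y chosen so that y^32 = 1/2 (roughly a 32 ms half-life at 1024 us periods). The throwaway userspace program below checks that closed form; the floating-point sum comes out slightly above 47742 because the kernel's table-driven integer math truncates at every step. Build with: cc check.c -lm

#include <math.h>
#include <stdio.h>

int main(void)
{
        const double y = pow(0.5, 1.0 / 32.0);   /* y^32 == 1/2 */
        double sum = 0.0;
        int i;

        for (i = 0; i < 345; i++)                /* LOAD_AVG_MAX_N periods */
                sum += 1024.0 * pow(y, i);

        printf("decayed sum after 345 periods ~= %.0f\n", sum);
        printf("closed form 1024 / (1 - y)    ~= %.0f\n", 1024.0 / (1.0 - y));
        printf("kernel constant LOAD_AVG_MAX   = 47742\n");
        return 0;
}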
@@ -1415,8 +1420,9 @@ static bool numa_has_capacity(struct task_numa_env *env) | |||
1415 | * --------------------- vs --------------------- | 1420 | * --------------------- vs --------------------- |
1416 | * src->compute_capacity dst->compute_capacity | 1421 | * src->compute_capacity dst->compute_capacity |
1417 | */ | 1422 | */ |
1418 | if (src->load * dst->compute_capacity > | 1423 | if (src->load * dst->compute_capacity * env->imbalance_pct > |
1419 | dst->load * src->compute_capacity) | 1424 | |
1425 | dst->load * src->compute_capacity * 100) | ||
1420 | return true; | 1426 | return true; |
1421 | 1427 | ||
1422 | return false; | 1428 | return false; |
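The comparison above now scales the source side by the domain's imbalance_pct before comparing the two load/capacity ratios, so it also succeeds when the source node's relative load is somewhat below the destination's rather than strictly above it. As a worked example with an assumed imbalance_pct of 125 and equal compute_capacity on both nodes (so the capacities cancel): src->load = 1000 and dst->load = 1200 give 1000 * 125 = 125,000 versus 1200 * 100 = 120,000, so the new test passes where the old plain ratio comparison (1000 vs 1200) would not.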
@@ -1702,8 +1708,8 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) | |||
1702 | delta = runtime - p->last_sum_exec_runtime; | 1708 | delta = runtime - p->last_sum_exec_runtime; |
1703 | *period = now - p->last_task_numa_placement; | 1709 | *period = now - p->last_task_numa_placement; |
1704 | } else { | 1710 | } else { |
1705 | delta = p->se.avg.runnable_avg_sum; | 1711 | delta = p->se.avg.load_sum / p->se.load.weight; |
1706 | *period = p->se.avg.avg_period; | 1712 | *period = LOAD_AVG_MAX; |
1707 | } | 1713 | } |
1708 | 1714 | ||
1709 | p->last_sum_exec_runtime = runtime; | 1715 | p->last_sum_exec_runtime = runtime; |
@@ -2351,13 +2357,13 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) | |||
2351 | long tg_weight; | 2357 | long tg_weight; |
2352 | 2358 | ||
2353 | /* | 2359 | /* |
2354 | * Use this CPU's actual weight instead of the last load_contribution | 2360 | * Use this CPU's real-time load instead of the last load contribution |
2355 | * to gain a more accurate current total weight. See | 2361 | * as the updating of the contribution is delayed, and we will use |
2356 | * update_cfs_rq_load_contribution(). | 2362 | * the real-time load to calc the share. See update_tg_load_avg(). |
2357 | */ | 2363 | */ |
2358 | tg_weight = atomic_long_read(&tg->load_avg); | 2364 | tg_weight = atomic_long_read(&tg->load_avg); |
2359 | tg_weight -= cfs_rq->tg_load_contrib; | 2365 | tg_weight -= cfs_rq->tg_load_avg_contrib; |
2360 | tg_weight += cfs_rq->load.weight; | 2366 | tg_weight += cfs_rq_load_avg(cfs_rq); |
2361 | 2367 | ||
2362 | return tg_weight; | 2368 | return tg_weight; |
2363 | } | 2369 | } |
@@ -2367,7 +2373,7 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) | |||
2367 | long tg_weight, load, shares; | 2373 | long tg_weight, load, shares; |
2368 | 2374 | ||
2369 | tg_weight = calc_tg_weight(tg, cfs_rq); | 2375 | tg_weight = calc_tg_weight(tg, cfs_rq); |
2370 | load = cfs_rq->load.weight; | 2376 | load = cfs_rq_load_avg(cfs_rq); |
2371 | 2377 | ||
2372 | shares = (tg->shares * load); | 2378 | shares = (tg->shares * load); |
2373 | if (tg_weight) | 2379 | if (tg_weight) |
@@ -2429,14 +2435,6 @@ static inline void update_cfs_shares(struct cfs_rq *cfs_rq) | |||
2429 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 2435 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
2430 | 2436 | ||
2431 | #ifdef CONFIG_SMP | 2437 | #ifdef CONFIG_SMP |
2432 | /* | ||
2433 | * We choose a half-life close to 1 scheduling period. | ||
2434 | * Note: The tables below are dependent on this value. | ||
2435 | */ | ||
2436 | #define LOAD_AVG_PERIOD 32 | ||
2437 | #define LOAD_AVG_MAX 47742 /* maximum possible load avg */ | ||
2438 | #define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */ | ||
2439 | |||
2440 | /* Precomputed fixed inverse multiplies for multiplication by y^n */ | 2438 | /* Precomputed fixed inverse multiplies for multiplication by y^n */ |
2441 | static const u32 runnable_avg_yN_inv[] = { | 2439 | static const u32 runnable_avg_yN_inv[] = { |
2442 | 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, | 2440 | 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, |
@@ -2485,9 +2483,8 @@ static __always_inline u64 decay_load(u64 val, u64 n) | |||
2485 | local_n %= LOAD_AVG_PERIOD; | 2483 | local_n %= LOAD_AVG_PERIOD; |
2486 | } | 2484 | } |
2487 | 2485 | ||
2488 | val *= runnable_avg_yN_inv[local_n]; | 2486 | val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32); |
2489 | /* We don't use SRR here since we always want to round down. */ | 2487 | return val; |
2490 | return val >> 32; | ||
2491 | } | 2488 | } |
2492 | 2489 | ||
2493 | /* | 2490 | /* |
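decay_load() still multiplies by the runnable_avg_yN_inv[] entries, which hold y^n in 32.32 fixed point, but now goes through mul_u64_u32_shr() so the intermediate product is not truncated to 64 bits, which matters once the weighted sums can exceed 32 bits. The userspace check below reproduces the arithmetic with the first table entries quoted above; the 128-bit multiply stands in for mul_u64_u32_shr() and is a GCC/Clang extension.

#include <inttypes.h>
#include <stdio.h>

static const uint32_t yN_inv[] = {               /* first entries of the table */
        0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
};

static uint64_t decay(uint64_t val, unsigned int n)   /* n < 6 in this demo */
{
        return (uint64_t)(((unsigned __int128)val * yN_inv[n]) >> 32);
}

int main(void)
{
        /* one period:  ~1024 * 0.97857 -> 1002 */
        printf("decay(1024, 1) = %" PRIu64 "\n", decay(1024, 1));
        /* five periods: ~1024 * 0.897 -> 918 (truncated) */
        printf("decay(1024, 5) = %" PRIu64 "\n", decay(1024, 5));
        return 0;
}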
@@ -2546,23 +2543,22 @@ static u32 __compute_runnable_contrib(u64 n) | |||
2546 | * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) | 2543 | * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) |
2547 | * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] | 2544 | * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] |
2548 | */ | 2545 | */ |
2549 | static __always_inline int __update_entity_runnable_avg(u64 now, int cpu, | 2546 | static __always_inline int |
2550 | struct sched_avg *sa, | 2547 | __update_load_avg(u64 now, int cpu, struct sched_avg *sa, |
2551 | int runnable, | 2548 | unsigned long weight, int running, struct cfs_rq *cfs_rq) |
2552 | int running) | ||
2553 | { | 2549 | { |
2554 | u64 delta, periods; | 2550 | u64 delta, periods; |
2555 | u32 runnable_contrib; | 2551 | u32 contrib; |
2556 | int delta_w, decayed = 0; | 2552 | int delta_w, decayed = 0; |
2557 | unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu); | 2553 | unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu); |
2558 | 2554 | ||
2559 | delta = now - sa->last_runnable_update; | 2555 | delta = now - sa->last_update_time; |
2560 | /* | 2556 | /* |
2561 | * This should only happen when time goes backwards, which it | 2557 | * This should only happen when time goes backwards, which it |
2562 | * unfortunately does during sched clock init when we swap over to TSC. | 2558 | * unfortunately does during sched clock init when we swap over to TSC. |
2563 | */ | 2559 | */ |
2564 | if ((s64)delta < 0) { | 2560 | if ((s64)delta < 0) { |
2565 | sa->last_runnable_update = now; | 2561 | sa->last_update_time = now; |
2566 | return 0; | 2562 | return 0; |
2567 | } | 2563 | } |
2568 | 2564 | ||
@@ -2573,26 +2569,29 @@ static __always_inline int __update_entity_runnable_avg(u64 now, int cpu, | |||
2573 | delta >>= 10; | 2569 | delta >>= 10; |
2574 | if (!delta) | 2570 | if (!delta) |
2575 | return 0; | 2571 | return 0; |
2576 | sa->last_runnable_update = now; | 2572 | sa->last_update_time = now; |
2577 | 2573 | ||
2578 | /* delta_w is the amount already accumulated against our next period */ | 2574 | /* delta_w is the amount already accumulated against our next period */ |
2579 | delta_w = sa->avg_period % 1024; | 2575 | delta_w = sa->period_contrib; |
2580 | if (delta + delta_w >= 1024) { | 2576 | if (delta + delta_w >= 1024) { |
2581 | /* period roll-over */ | ||
2582 | decayed = 1; | 2577 | decayed = 1; |
2583 | 2578 | ||
2579 | /* how much left for next period will start over, we don't know yet */ | ||
2580 | sa->period_contrib = 0; | ||
2581 | |||
2584 | /* | 2582 | /* |
2585 | * Now that we know we're crossing a period boundary, figure | 2583 | * Now that we know we're crossing a period boundary, figure |
2586 | * out how much from delta we need to complete the current | 2584 | * out how much from delta we need to complete the current |
2587 | * period and accrue it. | 2585 | * period and accrue it. |
2588 | */ | 2586 | */ |
2589 | delta_w = 1024 - delta_w; | 2587 | delta_w = 1024 - delta_w; |
2590 | if (runnable) | 2588 | if (weight) { |
2591 | sa->runnable_avg_sum += delta_w; | 2589 | sa->load_sum += weight * delta_w; |
2590 | if (cfs_rq) | ||
2591 | cfs_rq->runnable_load_sum += weight * delta_w; | ||
2592 | } | ||
2592 | if (running) | 2593 | if (running) |
2593 | sa->running_avg_sum += delta_w * scale_freq | 2594 | sa->util_sum += delta_w * scale_freq >> SCHED_CAPACITY_SHIFT; |
2594 | >> SCHED_CAPACITY_SHIFT; | ||
2595 | sa->avg_period += delta_w; | ||
2596 | 2595 | ||
2597 | delta -= delta_w; | 2596 | delta -= delta_w; |
2598 | 2597 | ||
@@ -2600,341 +2599,186 @@ static __always_inline int __update_entity_runnable_avg(u64 now, int cpu, | |||
2600 | periods = delta / 1024; | 2599 | periods = delta / 1024; |
2601 | delta %= 1024; | 2600 | delta %= 1024; |
2602 | 2601 | ||
2603 | sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum, | 2602 | sa->load_sum = decay_load(sa->load_sum, periods + 1); |
2604 | periods + 1); | 2603 | if (cfs_rq) { |
2605 | sa->running_avg_sum = decay_load(sa->running_avg_sum, | 2604 | cfs_rq->runnable_load_sum = |
2606 | periods + 1); | 2605 | decay_load(cfs_rq->runnable_load_sum, periods + 1); |
2607 | sa->avg_period = decay_load(sa->avg_period, | 2606 | } |
2608 | periods + 1); | 2607 | sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1); |
2609 | 2608 | ||
2610 | /* Efficiently calculate \sum (1..n_period) 1024*y^i */ | 2609 | /* Efficiently calculate \sum (1..n_period) 1024*y^i */ |
2611 | runnable_contrib = __compute_runnable_contrib(periods); | 2610 | contrib = __compute_runnable_contrib(periods); |
2612 | if (runnable) | 2611 | if (weight) { |
2613 | sa->runnable_avg_sum += runnable_contrib; | 2612 | sa->load_sum += weight * contrib; |
2613 | if (cfs_rq) | ||
2614 | cfs_rq->runnable_load_sum += weight * contrib; | ||
2615 | } | ||
2614 | if (running) | 2616 | if (running) |
2615 | sa->running_avg_sum += runnable_contrib * scale_freq | 2617 | sa->util_sum += contrib * scale_freq >> SCHED_CAPACITY_SHIFT; |
2616 | >> SCHED_CAPACITY_SHIFT; | ||
2617 | sa->avg_period += runnable_contrib; | ||
2618 | } | 2618 | } |
2619 | 2619 | ||
2620 | /* Remainder of delta accrued against u_0` */ | 2620 | /* Remainder of delta accrued against u_0` */ |
2621 | if (runnable) | 2621 | if (weight) { |
2622 | sa->runnable_avg_sum += delta; | 2622 | sa->load_sum += weight * delta; |
2623 | if (cfs_rq) | ||
2624 | cfs_rq->runnable_load_sum += weight * delta; | ||
2625 | } | ||
2623 | if (running) | 2626 | if (running) |
2624 | sa->running_avg_sum += delta * scale_freq | 2627 | sa->util_sum += delta * scale_freq >> SCHED_CAPACITY_SHIFT; |
2625 | >> SCHED_CAPACITY_SHIFT; | ||
2626 | sa->avg_period += delta; | ||
2627 | |||
2628 | return decayed; | ||
2629 | } | ||
2630 | 2628 | ||
2631 | /* Synchronize an entity's decay with its parenting cfs_rq.*/ | 2629 | sa->period_contrib += delta; |
2632 | static inline u64 __synchronize_entity_decay(struct sched_entity *se) | ||
2633 | { | ||
2634 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
2635 | u64 decays = atomic64_read(&cfs_rq->decay_counter); | ||
2636 | |||
2637 | decays -= se->avg.decay_count; | ||
2638 | se->avg.decay_count = 0; | ||
2639 | if (!decays) | ||
2640 | return 0; | ||
2641 | 2630 | ||
2642 | se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); | 2631 | if (decayed) { |
2643 | se->avg.utilization_avg_contrib = | 2632 | sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX); |
2644 | decay_load(se->avg.utilization_avg_contrib, decays); | 2633 | if (cfs_rq) { |
2634 | cfs_rq->runnable_load_avg = | ||
2635 | div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX); | ||
2636 | } | ||
2637 | sa->util_avg = (sa->util_sum << SCHED_LOAD_SHIFT) / LOAD_AVG_MAX; | ||
2638 | } | ||
2645 | 2639 | ||
2646 | return decays; | 2640 | return decayed; |
2647 | } | 2641 | } |
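
For readers following the new __update_load_avg() above: the accrual it performs — finish the partially filled 1024us period, decay everything older by one halving per 32 periods, then open a new partial period — can be modelled in a few lines of ordinary C. The sketch below covers the arithmetic only (weight and frequency scaling are omitted, and decay() uses floating point instead of the kernel's fixed-point tables); the helper names are made up for illustration.

#include <math.h>
#include <stdio.h>

/* y is chosen so that y^32 == 0.5, as in the kernel's PELT tables */
static double decay(double val, unsigned int periods)
{
        return val * pow(0.5, periods / 32.0);
}

int main(void)
{
        double load_sum = 0.0, period_contrib = 0.0;
        double deltas_us[] = { 300.0, 900.0, 5000.0 };  /* example runtimes */

        for (int i = 0; i < 3; i++) {
                double delta = deltas_us[i];
                double delta_w = period_contrib;

                if (delta + delta_w >= 1024.0) {
                        /* complete the partially filled period */
                        delta_w = 1024.0 - delta_w;
                        load_sum += delta_w;
                        delta -= delta_w;

                        /* decay across the fully elapsed periods */
                        unsigned int periods = (unsigned int)(delta / 1024.0);
                        delta = fmod(delta, 1024.0);
                        load_sum = decay(load_sum, periods + 1);

                        /* contribution of the full periods: sum of 1024*y^i */
                        for (unsigned int p = 1; p <= periods; p++)
                                load_sum += decay(1024.0, p);
                        period_contrib = 0.0;
                }
                /* remainder accrues against the still-open period */
                load_sum += delta;
                period_contrib += delta;
                printf("after %.0f us: load_sum = %.1f\n", deltas_us[i], load_sum);
        }
        return 0;
}
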
2648 | 2642 | ||
2649 | #ifdef CONFIG_FAIR_GROUP_SCHED | 2643 | #ifdef CONFIG_FAIR_GROUP_SCHED |
2650 | static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, | ||
2651 | int force_update) | ||
2652 | { | ||
2653 | struct task_group *tg = cfs_rq->tg; | ||
2654 | long tg_contrib; | ||
2655 | |||
2656 | tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg; | ||
2657 | tg_contrib -= cfs_rq->tg_load_contrib; | ||
2658 | |||
2659 | if (!tg_contrib) | ||
2660 | return; | ||
2661 | |||
2662 | if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) { | ||
2663 | atomic_long_add(tg_contrib, &tg->load_avg); | ||
2664 | cfs_rq->tg_load_contrib += tg_contrib; | ||
2665 | } | ||
2666 | } | ||
2667 | |||
2668 | /* | 2644 | /* |
2669 | * Aggregate cfs_rq runnable averages into an equivalent task_group | 2645 | * Updating tg's load_avg is necessary before update_cfs_shares() (which is done) |
2670 | * representation for computing load contributions. | 2646 | * and effective_load (which is not done because it is too costly). |
2671 | */ | 2647 | */ |
2672 | static inline void __update_tg_runnable_avg(struct sched_avg *sa, | 2648 | static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) |
2673 | struct cfs_rq *cfs_rq) | ||
2674 | { | 2649 | { |
2675 | struct task_group *tg = cfs_rq->tg; | 2650 | long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib; |
2676 | long contrib; | ||
2677 | 2651 | ||
2678 | /* The fraction of a cpu used by this cfs_rq */ | 2652 | if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) { |
2679 | contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT, | 2653 | atomic_long_add(delta, &cfs_rq->tg->load_avg); |
2680 | sa->avg_period + 1); | 2654 | cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg; |
2681 | contrib -= cfs_rq->tg_runnable_contrib; | ||
2682 | |||
2683 | if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) { | ||
2684 | atomic_add(contrib, &tg->runnable_avg); | ||
2685 | cfs_rq->tg_runnable_contrib += contrib; | ||
2686 | } | 2655 | } |
2687 | } | 2656 | } |
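
The abs(delta) > tg_load_avg_contrib / 64 test above is a fold-only-significant-deltas pattern: each cfs_rq publishes into the shared tg->load_avg only once its local average has drifted noticeably from what it last published, keeping cross-CPU cacheline traffic down. A minimal user-space sketch of the same idea (maybe_publish() and the field names are invented for illustration, not kernel APIs):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

static atomic_long shared_load = 0;     /* stands in for tg->load_avg */

struct local_rq {
        long load_avg;                  /* freshly computed local average */
        long published;                 /* last value folded into shared_load */
};

static void maybe_publish(struct local_rq *rq, int force)
{
        long delta = rq->load_avg - rq->published;

        if (force || labs(delta) > rq->published / 64) {
                atomic_fetch_add(&shared_load, delta);
                rq->published = rq->load_avg;
        }
}

int main(void)
{
        struct local_rq rq = { .load_avg = 1000, .published = 0 };

        maybe_publish(&rq, 0);          /* first publish: delta is large */
        rq.load_avg = 1005;
        maybe_publish(&rq, 0);          /* small drift: skipped */
        rq.load_avg = 1200;
        maybe_publish(&rq, 0);          /* big drift: folded in */
        printf("shared=%ld published=%ld\n",
               atomic_load(&shared_load), rq.published);
        return 0;
}
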
2688 | 2657 | ||
2689 | static inline void __update_group_entity_contrib(struct sched_entity *se) | ||
2690 | { | ||
2691 | struct cfs_rq *cfs_rq = group_cfs_rq(se); | ||
2692 | struct task_group *tg = cfs_rq->tg; | ||
2693 | int runnable_avg; | ||
2694 | |||
2695 | u64 contrib; | ||
2696 | |||
2697 | contrib = cfs_rq->tg_load_contrib * tg->shares; | ||
2698 | se->avg.load_avg_contrib = div_u64(contrib, | ||
2699 | atomic_long_read(&tg->load_avg) + 1); | ||
2700 | |||
2701 | /* | ||
2702 | * For group entities we need to compute a correction term in the case | ||
2703 | * that they are consuming <1 cpu so that we would contribute the same | ||
2704 | * load as a task of equal weight. | ||
2705 | * | ||
2706 | * Explicitly co-ordinating this measurement would be expensive, but | ||
2707 | * fortunately the sum of each cpus contribution forms a usable | ||
2708 | * lower-bound on the true value. | ||
2709 | * | ||
2710 | * Consider the aggregate of 2 contributions. Either they are disjoint | ||
2711 | * (and the sum represents true value) or they are disjoint and we are | ||
2712 | * understating by the aggregate of their overlap. | ||
2713 | * | ||
2714 | * Extending this to N cpus, for a given overlap, the maximum amount we | ||
2715 | * understand is then n_i(n_i+1)/2 * w_i where n_i is the number of | ||
2716 | * cpus that overlap for this interval and w_i is the interval width. | ||
2717 | * | ||
2718 | * On a small machine; the first term is well-bounded which bounds the | ||
2719 | * total error since w_i is a subset of the period. Whereas on a | ||
2720 | * larger machine, while this first term can be larger, if w_i is the | ||
2721 | * of consequential size guaranteed to see n_i*w_i quickly converge to | ||
2722 | * our upper bound of 1-cpu. | ||
2723 | */ | ||
2724 | runnable_avg = atomic_read(&tg->runnable_avg); | ||
2725 | if (runnable_avg < NICE_0_LOAD) { | ||
2726 | se->avg.load_avg_contrib *= runnable_avg; | ||
2727 | se->avg.load_avg_contrib >>= NICE_0_SHIFT; | ||
2728 | } | ||
2729 | } | ||
2730 | |||
2731 | static inline void update_rq_runnable_avg(struct rq *rq, int runnable) | ||
2732 | { | ||
2733 | __update_entity_runnable_avg(rq_clock_task(rq), cpu_of(rq), &rq->avg, | ||
2734 | runnable, runnable); | ||
2735 | __update_tg_runnable_avg(&rq->avg, &rq->cfs); | ||
2736 | } | ||
2737 | #else /* CONFIG_FAIR_GROUP_SCHED */ | 2658 | #else /* CONFIG_FAIR_GROUP_SCHED */ |
2738 | static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, | 2659 | static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} |
2739 | int force_update) {} | ||
2740 | static inline void __update_tg_runnable_avg(struct sched_avg *sa, | ||
2741 | struct cfs_rq *cfs_rq) {} | ||
2742 | static inline void __update_group_entity_contrib(struct sched_entity *se) {} | ||
2743 | static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} | ||
2744 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 2660 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
2745 | 2661 | ||
2746 | static inline void __update_task_entity_contrib(struct sched_entity *se) | 2662 | static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); |
2747 | { | ||
2748 | u32 contrib; | ||
2749 | |||
2750 | /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ | ||
2751 | contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight); | ||
2752 | contrib /= (se->avg.avg_period + 1); | ||
2753 | se->avg.load_avg_contrib = scale_load(contrib); | ||
2754 | } | ||
2755 | 2663 | ||
2756 | /* Compute the current contribution to load_avg by se, return any delta */ | 2664 | /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */ |
2757 | static long __update_entity_load_avg_contrib(struct sched_entity *se) | 2665 | static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) |
2758 | { | 2666 | { |
2759 | long old_contrib = se->avg.load_avg_contrib; | 2667 | int decayed; |
2668 | struct sched_avg *sa = &cfs_rq->avg; | ||
2760 | 2669 | ||
2761 | if (entity_is_task(se)) { | 2670 | if (atomic_long_read(&cfs_rq->removed_load_avg)) { |
2762 | __update_task_entity_contrib(se); | 2671 | long r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); |
2763 | } else { | 2672 | sa->load_avg = max_t(long, sa->load_avg - r, 0); |
2764 | __update_tg_runnable_avg(&se->avg, group_cfs_rq(se)); | 2673 | sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0); |
2765 | __update_group_entity_contrib(se); | ||
2766 | } | 2674 | } |
2767 | 2675 | ||
2768 | return se->avg.load_avg_contrib - old_contrib; | 2676 | if (atomic_long_read(&cfs_rq->removed_util_avg)) { |
2769 | } | 2677 | long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0); |
2770 | 2678 | sa->util_avg = max_t(long, sa->util_avg - r, 0); | |
2771 | 2679 | sa->util_sum = max_t(s32, sa->util_sum - | |
2772 | static inline void __update_task_entity_utilization(struct sched_entity *se) | 2680 | ((r * LOAD_AVG_MAX) >> SCHED_LOAD_SHIFT), 0); |
2773 | { | 2681 | } |
2774 | u32 contrib; | ||
2775 | 2682 | ||
2776 | /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ | 2683 | decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, |
2777 | contrib = se->avg.running_avg_sum * scale_load_down(SCHED_LOAD_SCALE); | 2684 | scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq); |
2778 | contrib /= (se->avg.avg_period + 1); | ||
2779 | se->avg.utilization_avg_contrib = scale_load(contrib); | ||
2780 | } | ||
2781 | 2685 | ||
2782 | static long __update_entity_utilization_avg_contrib(struct sched_entity *se) | 2686 | #ifndef CONFIG_64BIT |
2783 | { | 2687 | smp_wmb(); |
2784 | long old_contrib = se->avg.utilization_avg_contrib; | 2688 | cfs_rq->load_last_update_time_copy = sa->last_update_time; |
2785 | 2689 | #endif | |
2786 | if (entity_is_task(se)) | ||
2787 | __update_task_entity_utilization(se); | ||
2788 | else | ||
2789 | se->avg.utilization_avg_contrib = | ||
2790 | group_cfs_rq(se)->utilization_load_avg; | ||
2791 | 2690 | ||
2792 | return se->avg.utilization_avg_contrib - old_contrib; | 2691 | return decayed; |
2793 | } | 2692 | } |
2794 | 2693 | ||
2795 | static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq, | 2694 | /* Update task and its cfs_rq load average */ |
2796 | long load_contrib) | 2695 | static inline void update_load_avg(struct sched_entity *se, int update_tg) |
2797 | { | ||
2798 | if (likely(load_contrib < cfs_rq->blocked_load_avg)) | ||
2799 | cfs_rq->blocked_load_avg -= load_contrib; | ||
2800 | else | ||
2801 | cfs_rq->blocked_load_avg = 0; | ||
2802 | } | ||
2803 | |||
2804 | static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); | ||
2805 | |||
2806 | /* Update a sched_entity's runnable average */ | ||
2807 | static inline void update_entity_load_avg(struct sched_entity *se, | ||
2808 | int update_cfs_rq) | ||
2809 | { | 2696 | { |
2810 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 2697 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
2811 | long contrib_delta, utilization_delta; | ||
2812 | int cpu = cpu_of(rq_of(cfs_rq)); | 2698 | int cpu = cpu_of(rq_of(cfs_rq)); |
2813 | u64 now; | 2699 | u64 now = cfs_rq_clock_task(cfs_rq); |
2814 | 2700 | ||
2815 | /* | 2701 | /* |
2816 | * For a group entity we need to use their owned cfs_rq_clock_task() in | 2702 | * Track the task load average so it can be carried to the new CPU after |
2817 | * case they are the parent of a throttled hierarchy. | 2703 | * migration, and track the group sched_entity load average for the task_h_load calculation during migration |
2818 | */ | 2704 | */ |
2819 | if (entity_is_task(se)) | 2705 | __update_load_avg(now, cpu, &se->avg, |
2820 | now = cfs_rq_clock_task(cfs_rq); | 2706 | se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL); |
2821 | else | ||
2822 | now = cfs_rq_clock_task(group_cfs_rq(se)); | ||
2823 | |||
2824 | if (!__update_entity_runnable_avg(now, cpu, &se->avg, se->on_rq, | ||
2825 | cfs_rq->curr == se)) | ||
2826 | return; | ||
2827 | |||
2828 | contrib_delta = __update_entity_load_avg_contrib(se); | ||
2829 | utilization_delta = __update_entity_utilization_avg_contrib(se); | ||
2830 | |||
2831 | if (!update_cfs_rq) | ||
2832 | return; | ||
2833 | 2707 | ||
2834 | if (se->on_rq) { | 2708 | if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg) |
2835 | cfs_rq->runnable_load_avg += contrib_delta; | 2709 | update_tg_load_avg(cfs_rq, 0); |
2836 | cfs_rq->utilization_load_avg += utilization_delta; | ||
2837 | } else { | ||
2838 | subtract_blocked_load_contrib(cfs_rq, -contrib_delta); | ||
2839 | } | ||
2840 | } | 2710 | } |
2841 | 2711 | ||
2842 | /* | 2712 | /* Add the load generated by se into cfs_rq's load average */ |
2843 | * Decay the load contributed by all blocked children and account this so that | 2713 | static inline void |
2844 | * their contribution may appropriately discounted when they wake up. | 2714 | enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) |
2845 | */ | ||
2846 | static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) | ||
2847 | { | 2715 | { |
2848 | u64 now = cfs_rq_clock_task(cfs_rq) >> 20; | 2716 | struct sched_avg *sa = &se->avg; |
2849 | u64 decays; | 2717 | u64 now = cfs_rq_clock_task(cfs_rq); |
2850 | 2718 | int migrated = 0, decayed; | |
2851 | decays = now - cfs_rq->last_decay; | ||
2852 | if (!decays && !force_update) | ||
2853 | return; | ||
2854 | 2719 | ||
2855 | if (atomic_long_read(&cfs_rq->removed_load)) { | 2720 | if (sa->last_update_time == 0) { |
2856 | unsigned long removed_load; | 2721 | sa->last_update_time = now; |
2857 | removed_load = atomic_long_xchg(&cfs_rq->removed_load, 0); | 2722 | migrated = 1; |
2858 | subtract_blocked_load_contrib(cfs_rq, removed_load); | ||
2859 | } | 2723 | } |
2724 | else { | ||
2725 | __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, | ||
2726 | se->on_rq * scale_load_down(se->load.weight), | ||
2727 | cfs_rq->curr == se, NULL); | ||
2728 | } | ||
2729 | |||
2730 | decayed = update_cfs_rq_load_avg(now, cfs_rq); | ||
2860 | 2731 | ||
2861 | if (decays) { | 2732 | cfs_rq->runnable_load_avg += sa->load_avg; |
2862 | cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg, | 2733 | cfs_rq->runnable_load_sum += sa->load_sum; |
2863 | decays); | 2734 | |
2864 | atomic64_add(decays, &cfs_rq->decay_counter); | 2735 | if (migrated) { |
2865 | cfs_rq->last_decay = now; | 2736 | cfs_rq->avg.load_avg += sa->load_avg; |
2737 | cfs_rq->avg.load_sum += sa->load_sum; | ||
2738 | cfs_rq->avg.util_avg += sa->util_avg; | ||
2739 | cfs_rq->avg.util_sum += sa->util_sum; | ||
2866 | } | 2740 | } |
2867 | 2741 | ||
2868 | __update_cfs_rq_tg_load_contrib(cfs_rq, force_update); | 2742 | if (decayed || migrated) |
2743 | update_tg_load_avg(cfs_rq, 0); | ||
2869 | } | 2744 | } |
2870 | 2745 | ||
2871 | /* Add the load generated by se into cfs_rq's child load-average */ | 2746 | /* Remove the runnable load generated by se from cfs_rq's runnable load average */ |
2872 | static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, | 2747 | static inline void |
2873 | struct sched_entity *se, | 2748 | dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) |
2874 | int wakeup) | ||
2875 | { | 2749 | { |
2876 | /* | 2750 | update_load_avg(se, 1); |
2877 | * We track migrations using entity decay_count <= 0, on a wake-up | ||
2878 | * migration we use a negative decay count to track the remote decays | ||
2879 | * accumulated while sleeping. | ||
2880 | * | ||
2881 | * Newly forked tasks are enqueued with se->avg.decay_count == 0, they | ||
2882 | * are seen by enqueue_entity_load_avg() as a migration with an already | ||
2883 | * constructed load_avg_contrib. | ||
2884 | */ | ||
2885 | if (unlikely(se->avg.decay_count <= 0)) { | ||
2886 | se->avg.last_runnable_update = rq_clock_task(rq_of(cfs_rq)); | ||
2887 | if (se->avg.decay_count) { | ||
2888 | /* | ||
2889 | * In a wake-up migration we have to approximate the | ||
2890 | * time sleeping. This is because we can't synchronize | ||
2891 | * clock_task between the two cpus, and it is not | ||
2892 | * guaranteed to be read-safe. Instead, we can | ||
2893 | * approximate this using our carried decays, which are | ||
2894 | * explicitly atomically readable. | ||
2895 | */ | ||
2896 | se->avg.last_runnable_update -= (-se->avg.decay_count) | ||
2897 | << 20; | ||
2898 | update_entity_load_avg(se, 0); | ||
2899 | /* Indicate that we're now synchronized and on-rq */ | ||
2900 | se->avg.decay_count = 0; | ||
2901 | } | ||
2902 | wakeup = 0; | ||
2903 | } else { | ||
2904 | __synchronize_entity_decay(se); | ||
2905 | } | ||
2906 | 2751 | ||
2907 | /* migrated tasks did not contribute to our blocked load */ | 2752 | cfs_rq->runnable_load_avg = |
2908 | if (wakeup) { | 2753 | max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0); |
2909 | subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib); | 2754 | cfs_rq->runnable_load_sum = |
2910 | update_entity_load_avg(se, 0); | 2755 | max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0); |
2911 | } | ||
2912 | |||
2913 | cfs_rq->runnable_load_avg += se->avg.load_avg_contrib; | ||
2914 | cfs_rq->utilization_load_avg += se->avg.utilization_avg_contrib; | ||
2915 | /* we force update consideration on load-balancer moves */ | ||
2916 | update_cfs_rq_blocked_load(cfs_rq, !wakeup); | ||
2917 | } | 2756 | } |
2918 | 2757 | ||
2919 | /* | 2758 | /* |
2920 | * Remove se's load from this cfs_rq child load-average, if the entity is | 2759 | * Task first catches up with cfs_rq, and then subtract |
2921 | * transitioning to a blocked state we track its projected decay using | 2760 | * itself from the cfs_rq (task must be off the queue now). |
2922 | * blocked_load_avg. | ||
2923 | */ | 2761 | */ |
2924 | static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, | 2762 | void remove_entity_load_avg(struct sched_entity *se) |
2925 | struct sched_entity *se, | ||
2926 | int sleep) | ||
2927 | { | 2763 | { |
2928 | update_entity_load_avg(se, 1); | 2764 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
2929 | /* we force update consideration on load-balancer moves */ | 2765 | u64 last_update_time; |
2930 | update_cfs_rq_blocked_load(cfs_rq, !sleep); | 2766 | |
2767 | #ifndef CONFIG_64BIT | ||
2768 | u64 last_update_time_copy; | ||
2931 | 2769 | ||
2932 | cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib; | 2770 | do { |
2933 | cfs_rq->utilization_load_avg -= se->avg.utilization_avg_contrib; | 2771 | last_update_time_copy = cfs_rq->load_last_update_time_copy; |
2934 | if (sleep) { | 2772 | smp_rmb(); |
2935 | cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; | 2773 | last_update_time = cfs_rq->avg.last_update_time; |
2936 | se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); | 2774 | } while (last_update_time != last_update_time_copy); |
2937 | } /* migrations, e.g. sleep=0 leave decay_count == 0 */ | 2775 | #else |
2776 | last_update_time = cfs_rq->avg.last_update_time; | ||
2777 | #endif | ||
2778 | |||
2779 | __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); | ||
2780 | atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg); | ||
2781 | atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg); | ||
2938 | } | 2782 | } |
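
The #ifndef CONFIG_64BIT path in remove_entity_load_avg() above is the usual trick for reading a 64-bit timestamp atomically on a 32-bit machine: the writer (update_cfs_rq_load_avg() earlier in this hunk) stores the value, issues a write barrier, then stores a copy; the reader reads the copy, issues a read barrier, reads the value, and retries until the two match. A user-space approximation, with C11 fences standing in for smp_wmb()/smp_rmb():

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uint64_t last_update_time;
static _Atomic uint64_t last_update_time_copy;

static void writer(uint64_t now)
{
        atomic_store_explicit(&last_update_time, now, memory_order_relaxed);
        atomic_thread_fence(memory_order_release);      /* ~ smp_wmb() */
        atomic_store_explicit(&last_update_time_copy, now, memory_order_relaxed);
}

static uint64_t reader(void)
{
        uint64_t t, copy;

        do {
                copy = atomic_load_explicit(&last_update_time_copy,
                                            memory_order_relaxed);
                atomic_thread_fence(memory_order_acquire);  /* ~ smp_rmb() */
                t = atomic_load_explicit(&last_update_time,
                                         memory_order_relaxed);
        } while (t != copy);            /* retry on a torn/interleaved update */

        return t;
}

int main(void)
{
        writer(123456789ULL);
        printf("read back %llu\n", (unsigned long long)reader());
        return 0;
}
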
2939 | 2783 | ||
2940 | /* | 2784 | /* |
@@ -2944,7 +2788,6 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, | |||
2944 | */ | 2788 | */ |
2945 | void idle_enter_fair(struct rq *this_rq) | 2789 | void idle_enter_fair(struct rq *this_rq) |
2946 | { | 2790 | { |
2947 | update_rq_runnable_avg(this_rq, 1); | ||
2948 | } | 2791 | } |
2949 | 2792 | ||
2950 | /* | 2793 | /* |
@@ -2954,24 +2797,28 @@ void idle_enter_fair(struct rq *this_rq) | |||
2954 | */ | 2797 | */ |
2955 | void idle_exit_fair(struct rq *this_rq) | 2798 | void idle_exit_fair(struct rq *this_rq) |
2956 | { | 2799 | { |
2957 | update_rq_runnable_avg(this_rq, 0); | 2800 | } |
2801 | |||
2802 | static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq) | ||
2803 | { | ||
2804 | return cfs_rq->runnable_load_avg; | ||
2805 | } | ||
2806 | |||
2807 | static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) | ||
2808 | { | ||
2809 | return cfs_rq->avg.load_avg; | ||
2958 | } | 2810 | } |
2959 | 2811 | ||
2960 | static int idle_balance(struct rq *this_rq); | 2812 | static int idle_balance(struct rq *this_rq); |
2961 | 2813 | ||
2962 | #else /* CONFIG_SMP */ | 2814 | #else /* CONFIG_SMP */ |
2963 | 2815 | ||
2964 | static inline void update_entity_load_avg(struct sched_entity *se, | 2816 | static inline void update_load_avg(struct sched_entity *se, int update_tg) {} |
2965 | int update_cfs_rq) {} | 2817 | static inline void |
2966 | static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} | 2818 | enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} |
2967 | static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, | 2819 | static inline void |
2968 | struct sched_entity *se, | 2820 | dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} |
2969 | int wakeup) {} | 2821 | static inline void remove_entity_load_avg(struct sched_entity *se) {} |
2970 | static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, | ||
2971 | struct sched_entity *se, | ||
2972 | int sleep) {} | ||
2973 | static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, | ||
2974 | int force_update) {} | ||
2975 | 2822 | ||
2976 | static inline int idle_balance(struct rq *rq) | 2823 | static inline int idle_balance(struct rq *rq) |
2977 | { | 2824 | { |
@@ -3103,7 +2950,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
3103 | * Update run-time statistics of the 'current'. | 2950 | * Update run-time statistics of the 'current'. |
3104 | */ | 2951 | */ |
3105 | update_curr(cfs_rq); | 2952 | update_curr(cfs_rq); |
3106 | enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP); | 2953 | enqueue_entity_load_avg(cfs_rq, se); |
3107 | account_entity_enqueue(cfs_rq, se); | 2954 | account_entity_enqueue(cfs_rq, se); |
3108 | update_cfs_shares(cfs_rq); | 2955 | update_cfs_shares(cfs_rq); |
3109 | 2956 | ||
@@ -3178,7 +3025,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
3178 | * Update run-time statistics of the 'current'. | 3025 | * Update run-time statistics of the 'current'. |
3179 | */ | 3026 | */ |
3180 | update_curr(cfs_rq); | 3027 | update_curr(cfs_rq); |
3181 | dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP); | 3028 | dequeue_entity_load_avg(cfs_rq, se); |
3182 | 3029 | ||
3183 | update_stats_dequeue(cfs_rq, se); | 3030 | update_stats_dequeue(cfs_rq, se); |
3184 | if (flags & DEQUEUE_SLEEP) { | 3031 | if (flags & DEQUEUE_SLEEP) { |
@@ -3268,7 +3115,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
3268 | */ | 3115 | */ |
3269 | update_stats_wait_end(cfs_rq, se); | 3116 | update_stats_wait_end(cfs_rq, se); |
3270 | __dequeue_entity(cfs_rq, se); | 3117 | __dequeue_entity(cfs_rq, se); |
3271 | update_entity_load_avg(se, 1); | 3118 | update_load_avg(se, 1); |
3272 | } | 3119 | } |
3273 | 3120 | ||
3274 | update_stats_curr_start(cfs_rq, se); | 3121 | update_stats_curr_start(cfs_rq, se); |
@@ -3368,7 +3215,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) | |||
3368 | /* Put 'current' back into the tree. */ | 3215 | /* Put 'current' back into the tree. */ |
3369 | __enqueue_entity(cfs_rq, prev); | 3216 | __enqueue_entity(cfs_rq, prev); |
3370 | /* in !on_rq case, update occurred at dequeue */ | 3217 | /* in !on_rq case, update occurred at dequeue */ |
3371 | update_entity_load_avg(prev, 1); | 3218 | update_load_avg(prev, 0); |
3372 | } | 3219 | } |
3373 | cfs_rq->curr = NULL; | 3220 | cfs_rq->curr = NULL; |
3374 | } | 3221 | } |
@@ -3384,8 +3231,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) | |||
3384 | /* | 3231 | /* |
3385 | * Ensure that runnable average is periodically updated. | 3232 | * Ensure that runnable average is periodically updated. |
3386 | */ | 3233 | */ |
3387 | update_entity_load_avg(curr, 1); | 3234 | update_load_avg(curr, 1); |
3388 | update_cfs_rq_blocked_load(cfs_rq, 1); | ||
3389 | update_cfs_shares(cfs_rq); | 3235 | update_cfs_shares(cfs_rq); |
3390 | 3236 | ||
3391 | #ifdef CONFIG_SCHED_HRTICK | 3237 | #ifdef CONFIG_SCHED_HRTICK |
@@ -3683,7 +3529,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) | |||
3683 | cfs_rq->throttled = 1; | 3529 | cfs_rq->throttled = 1; |
3684 | cfs_rq->throttled_clock = rq_clock(rq); | 3530 | cfs_rq->throttled_clock = rq_clock(rq); |
3685 | raw_spin_lock(&cfs_b->lock); | 3531 | raw_spin_lock(&cfs_b->lock); |
3686 | empty = list_empty(&cfs_rq->throttled_list); | 3532 | empty = list_empty(&cfs_b->throttled_cfs_rq); |
3687 | 3533 | ||
3688 | /* | 3534 | /* |
3689 | * Add to the _head_ of the list, so that an already-started | 3535 | * Add to the _head_ of the list, so that an already-started |
@@ -4258,14 +4104,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
4258 | if (cfs_rq_throttled(cfs_rq)) | 4104 | if (cfs_rq_throttled(cfs_rq)) |
4259 | break; | 4105 | break; |
4260 | 4106 | ||
4107 | update_load_avg(se, 1); | ||
4261 | update_cfs_shares(cfs_rq); | 4108 | update_cfs_shares(cfs_rq); |
4262 | update_entity_load_avg(se, 1); | ||
4263 | } | 4109 | } |
4264 | 4110 | ||
4265 | if (!se) { | 4111 | if (!se) |
4266 | update_rq_runnable_avg(rq, rq->nr_running); | ||
4267 | add_nr_running(rq, 1); | 4112 | add_nr_running(rq, 1); |
4268 | } | 4113 | |
4269 | hrtick_update(rq); | 4114 | hrtick_update(rq); |
4270 | } | 4115 | } |
4271 | 4116 | ||
@@ -4319,14 +4164,13 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
4319 | if (cfs_rq_throttled(cfs_rq)) | 4164 | if (cfs_rq_throttled(cfs_rq)) |
4320 | break; | 4165 | break; |
4321 | 4166 | ||
4167 | update_load_avg(se, 1); | ||
4322 | update_cfs_shares(cfs_rq); | 4168 | update_cfs_shares(cfs_rq); |
4323 | update_entity_load_avg(se, 1); | ||
4324 | } | 4169 | } |
4325 | 4170 | ||
4326 | if (!se) { | 4171 | if (!se) |
4327 | sub_nr_running(rq, 1); | 4172 | sub_nr_running(rq, 1); |
4328 | update_rq_runnable_avg(rq, 1); | 4173 | |
4329 | } | ||
4330 | hrtick_update(rq); | 4174 | hrtick_update(rq); |
4331 | } | 4175 | } |
4332 | 4176 | ||
@@ -4439,6 +4283,12 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, | |||
4439 | sched_avg_update(this_rq); | 4283 | sched_avg_update(this_rq); |
4440 | } | 4284 | } |
4441 | 4285 | ||
4286 | /* Used instead of source_load when we know the type == 0 */ | ||
4287 | static unsigned long weighted_cpuload(const int cpu) | ||
4288 | { | ||
4289 | return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs); | ||
4290 | } | ||
4291 | |||
4442 | #ifdef CONFIG_NO_HZ_COMMON | 4292 | #ifdef CONFIG_NO_HZ_COMMON |
4443 | /* | 4293 | /* |
4444 | * There is no sane way to deal with nohz on smp when using jiffies because the | 4294 | * There is no sane way to deal with nohz on smp when using jiffies because the |
@@ -4460,7 +4310,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, | |||
4460 | static void update_idle_cpu_load(struct rq *this_rq) | 4310 | static void update_idle_cpu_load(struct rq *this_rq) |
4461 | { | 4311 | { |
4462 | unsigned long curr_jiffies = READ_ONCE(jiffies); | 4312 | unsigned long curr_jiffies = READ_ONCE(jiffies); |
4463 | unsigned long load = this_rq->cfs.runnable_load_avg; | 4313 | unsigned long load = weighted_cpuload(cpu_of(this_rq)); |
4464 | unsigned long pending_updates; | 4314 | unsigned long pending_updates; |
4465 | 4315 | ||
4466 | /* | 4316 | /* |
@@ -4506,7 +4356,7 @@ void update_cpu_load_nohz(void) | |||
4506 | */ | 4356 | */ |
4507 | void update_cpu_load_active(struct rq *this_rq) | 4357 | void update_cpu_load_active(struct rq *this_rq) |
4508 | { | 4358 | { |
4509 | unsigned long load = this_rq->cfs.runnable_load_avg; | 4359 | unsigned long load = weighted_cpuload(cpu_of(this_rq)); |
4510 | /* | 4360 | /* |
4511 | * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). | 4361 | * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). |
4512 | */ | 4362 | */ |
@@ -4514,12 +4364,6 @@ void update_cpu_load_active(struct rq *this_rq) | |||
4514 | __update_cpu_load(this_rq, load, 1); | 4364 | __update_cpu_load(this_rq, load, 1); |
4515 | } | 4365 | } |
4516 | 4366 | ||
4517 | /* Used instead of source_load when we know the type == 0 */ | ||
4518 | static unsigned long weighted_cpuload(const int cpu) | ||
4519 | { | ||
4520 | return cpu_rq(cpu)->cfs.runnable_load_avg; | ||
4521 | } | ||
4522 | |||
4523 | /* | 4367 | /* |
4524 | * Return a low guess at the load of a migration-source cpu weighted | 4368 | * Return a low guess at the load of a migration-source cpu weighted |
4525 | * according to the scheduling class and "nice" value. | 4369 | * according to the scheduling class and "nice" value. |
@@ -4567,7 +4411,7 @@ static unsigned long cpu_avg_load_per_task(int cpu) | |||
4567 | { | 4411 | { |
4568 | struct rq *rq = cpu_rq(cpu); | 4412 | struct rq *rq = cpu_rq(cpu); |
4569 | unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running); | 4413 | unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running); |
4570 | unsigned long load_avg = rq->cfs.runnable_load_avg; | 4414 | unsigned long load_avg = weighted_cpuload(cpu); |
4571 | 4415 | ||
4572 | if (nr_running) | 4416 | if (nr_running) |
4573 | return load_avg / nr_running; | 4417 | return load_avg / nr_running; |
@@ -4686,7 +4530,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) | |||
4686 | /* | 4530 | /* |
4687 | * w = rw_i + @wl | 4531 | * w = rw_i + @wl |
4688 | */ | 4532 | */ |
4689 | w = se->my_q->load.weight + wl; | 4533 | w = cfs_rq_load_avg(se->my_q) + wl; |
4690 | 4534 | ||
4691 | /* | 4535 | /* |
4692 | * wl = S * s'_i; see (2) | 4536 | * wl = S * s'_i; see (2) |
@@ -4707,7 +4551,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) | |||
4707 | /* | 4551 | /* |
4708 | * wl = dw_i = S * (s'_i - s_i); see (3) | 4552 | * wl = dw_i = S * (s'_i - s_i); see (3) |
4709 | */ | 4553 | */ |
4710 | wl -= se->load.weight; | 4554 | wl -= se->avg.load_avg; |
4711 | 4555 | ||
4712 | /* | 4556 | /* |
4713 | * Recursively apply this logic to all parent groups to compute | 4557 | * Recursively apply this logic to all parent groups to compute |
@@ -4730,26 +4574,29 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) | |||
4730 | 4574 | ||
4731 | #endif | 4575 | #endif |
4732 | 4576 | ||
4577 | /* | ||
4578 | * Detect M:N waker/wakee relationships via a switching-frequency heuristic. | ||
4579 | * A waker of many should wake a different task than the one last awakened | ||
4580 | * at a frequency roughly N times higher than one of its wakees. In order | ||
4581 | * to determine whether we should let the load spread vs consolodating to | ||
4582 | * shared cache, we look for a minimum 'flip' frequency of llc_size in one | ||
4583 | * partner, and a factor of lls_size higher frequency in the other. With | ||
4584 | * both conditions met, we can be relatively sure that the relationship is | ||
4585 | * non-monogamous, with partner count exceeding socket size. Waker/wakee | ||
4586 | * being client/server, worker/dispatcher, interrupt source or whatever is | ||
4587 | * irrelevant; the spread criterion is that apparent partner count exceeds socket size. | ||
4588 | */ | ||
4733 | static int wake_wide(struct task_struct *p) | 4589 | static int wake_wide(struct task_struct *p) |
4734 | { | 4590 | { |
4591 | unsigned int master = current->wakee_flips; | ||
4592 | unsigned int slave = p->wakee_flips; | ||
4735 | int factor = this_cpu_read(sd_llc_size); | 4593 | int factor = this_cpu_read(sd_llc_size); |
4736 | 4594 | ||
4737 | /* | 4595 | if (master < slave) |
4738 | * Yeah, it's the switching-frequency, could means many wakee or | 4596 | swap(master, slave); |
4739 | * rapidly switch, use factor here will just help to automatically | 4597 | if (slave < factor || master < slave * factor) |
4740 | * adjust the loose-degree, so bigger node will lead to more pull. | 4598 | return 0; |
4741 | */ | 4599 | return 1; |
4742 | if (p->wakee_flips > factor) { | ||
4743 | /* | ||
4744 | * wakee is somewhat hot, it needs certain amount of cpu | ||
4745 | * resource, so if waker is far more hot, prefer to leave | ||
4746 | * it alone. | ||
4747 | */ | ||
4748 | if (current->wakee_flips > (factor * p->wakee_flips)) | ||
4749 | return 1; | ||
4750 | } | ||
4751 | |||
4752 | return 0; | ||
4753 | } | 4600 | } |
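
Put differently, the new wake_wide() spreads wakeups only when the busier of the two flip counters exceeds the other by at least a factor of llc_size and the smaller counter is itself at least llc_size. A stand-alone version of just that test, with the counters and factor passed in explicitly (illustrative, not the kernel function):

#include <stdio.h>

static int wake_wide(unsigned int waker_flips, unsigned int wakee_flips,
                     unsigned int factor /* ~ LLC size in CPUs */)
{
        unsigned int master = waker_flips, slave = wakee_flips;

        if (master < slave) {           /* order so master is the busier flipper */
                unsigned int tmp = master;
                master = slave;
                slave = tmp;
        }
        if (slave < factor || master < slave * factor)
                return 0;               /* keep the affine (shared cache) path */
        return 1;                       /* many partners: spread the load */
}

int main(void)
{
        printf("%d\n", wake_wide(200, 10, 8));  /* 1: server with many clients */
        printf("%d\n", wake_wide(12, 10, 8));   /* 0: roughly 1:1 relationship */
        return 0;
}
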
4754 | 4601 | ||
4755 | static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) | 4602 | static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) |
@@ -4761,13 +4608,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) | |||
4761 | unsigned long weight; | 4608 | unsigned long weight; |
4762 | int balanced; | 4609 | int balanced; |
4763 | 4610 | ||
4764 | /* | ||
4765 | * If we wake multiple tasks be careful to not bounce | ||
4766 | * ourselves around too much. | ||
4767 | */ | ||
4768 | if (wake_wide(p)) | ||
4769 | return 0; | ||
4770 | |||
4771 | idx = sd->wake_idx; | 4611 | idx = sd->wake_idx; |
4772 | this_cpu = smp_processor_id(); | 4612 | this_cpu = smp_processor_id(); |
4773 | prev_cpu = task_cpu(p); | 4613 | prev_cpu = task_cpu(p); |
@@ -4781,14 +4621,14 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) | |||
4781 | */ | 4621 | */ |
4782 | if (sync) { | 4622 | if (sync) { |
4783 | tg = task_group(current); | 4623 | tg = task_group(current); |
4784 | weight = current->se.load.weight; | 4624 | weight = current->se.avg.load_avg; |
4785 | 4625 | ||
4786 | this_load += effective_load(tg, this_cpu, -weight, -weight); | 4626 | this_load += effective_load(tg, this_cpu, -weight, -weight); |
4787 | load += effective_load(tg, prev_cpu, 0, -weight); | 4627 | load += effective_load(tg, prev_cpu, 0, -weight); |
4788 | } | 4628 | } |
4789 | 4629 | ||
4790 | tg = task_group(p); | 4630 | tg = task_group(p); |
4791 | weight = p->se.load.weight; | 4631 | weight = p->se.avg.load_avg; |
4792 | 4632 | ||
4793 | /* | 4633 | /* |
4794 | * In low-load situations, where prev_cpu is idle and this_cpu is idle | 4634 | * In low-load situations, where prev_cpu is idle and this_cpu is idle |
@@ -4981,12 +4821,12 @@ done: | |||
4981 | * tasks. The unit of the return value must be the one of capacity so we can | 4821 | * tasks. The unit of the return value must be the one of capacity so we can |
4982 | * compare the usage with the capacity of the CPU that is available for CFS | 4822 | * compare the usage with the capacity of the CPU that is available for CFS |
4983 | * task (ie cpu_capacity). | 4823 | * task (ie cpu_capacity). |
4984 | * cfs.utilization_load_avg is the sum of running time of runnable tasks on a | 4824 | * cfs.avg.util_avg is the sum of running time of runnable tasks on a |
4985 | * CPU. It represents the amount of utilization of a CPU in the range | 4825 | * CPU. It represents the amount of utilization of a CPU in the range |
4986 | * [0..SCHED_LOAD_SCALE]. The usage of a CPU can't be higher than the full | 4826 | * [0..SCHED_LOAD_SCALE]. The usage of a CPU can't be higher than the full |
4987 | * capacity of the CPU because it's about the running time on this CPU. | 4827 | * capacity of the CPU because it's about the running time on this CPU. |
4988 | * Nevertheless, cfs.utilization_load_avg can be higher than SCHED_LOAD_SCALE | 4828 | * Nevertheless, cfs.avg.util_avg can be higher than SCHED_LOAD_SCALE |
4989 | * because of unfortunate rounding in avg_period and running_load_avg or just | 4829 | * because of unfortunate rounding in util_avg or just |
4990 | * after migrating tasks until the average stabilizes with the new running | 4830 | * after migrating tasks until the average stabilizes with the new running |
4991 | * time. So we need to check that the usage stays into the range | 4831 | * time. So we need to check that the usage stays into the range |
4992 | * [0..cpu_capacity_orig] and cap if necessary. | 4832 | * [0..cpu_capacity_orig] and cap if necessary. |
@@ -4995,7 +4835,7 @@ done: | |||
4995 | */ | 4835 | */ |
4996 | static int get_cpu_usage(int cpu) | 4836 | static int get_cpu_usage(int cpu) |
4997 | { | 4837 | { |
4998 | unsigned long usage = cpu_rq(cpu)->cfs.utilization_load_avg; | 4838 | unsigned long usage = cpu_rq(cpu)->cfs.avg.util_avg; |
4999 | unsigned long capacity = capacity_orig_of(cpu); | 4839 | unsigned long capacity = capacity_orig_of(cpu); |
5000 | 4840 | ||
5001 | if (usage >= SCHED_LOAD_SCALE) | 4841 | if (usage >= SCHED_LOAD_SCALE) |
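
The capping described in the comment above can be shown in isolation: the reported usage never exceeds the CPU's original capacity, even when the tracked average transiently overshoots after a migration. The scaling step in the sketch (usage * capacity >> 10, with SCHED_LOAD_SCALE assumed to be 1024) is an assumption for illustration, since the rest of get_cpu_usage() lies outside this hunk:

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL

static unsigned long cpu_usage(unsigned long util_avg,
                               unsigned long capacity_orig)
{
        if (util_avg >= SCHED_LOAD_SCALE)       /* transient overshoot */
                return capacity_orig;
        return (util_avg * capacity_orig) >> 10;  /* scale into capacity units */
}

int main(void)
{
        printf("%lu\n", cpu_usage(512, 1024));  /* half-utilized full-size CPU */
        printf("%lu\n", cpu_usage(2000, 430));  /* overshoot capped to capacity */
        return 0;
}
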
@@ -5021,17 +4861,17 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f | |||
5021 | { | 4861 | { |
5022 | struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; | 4862 | struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; |
5023 | int cpu = smp_processor_id(); | 4863 | int cpu = smp_processor_id(); |
5024 | int new_cpu = cpu; | 4864 | int new_cpu = prev_cpu; |
5025 | int want_affine = 0; | 4865 | int want_affine = 0; |
5026 | int sync = wake_flags & WF_SYNC; | 4866 | int sync = wake_flags & WF_SYNC; |
5027 | 4867 | ||
5028 | if (sd_flag & SD_BALANCE_WAKE) | 4868 | if (sd_flag & SD_BALANCE_WAKE) |
5029 | want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); | 4869 | want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); |
5030 | 4870 | ||
5031 | rcu_read_lock(); | 4871 | rcu_read_lock(); |
5032 | for_each_domain(cpu, tmp) { | 4872 | for_each_domain(cpu, tmp) { |
5033 | if (!(tmp->flags & SD_LOAD_BALANCE)) | 4873 | if (!(tmp->flags & SD_LOAD_BALANCE)) |
5034 | continue; | 4874 | break; |
5035 | 4875 | ||
5036 | /* | 4876 | /* |
5037 | * If both cpu and prev_cpu are part of this domain, | 4877 | * If both cpu and prev_cpu are part of this domain, |
@@ -5045,17 +4885,21 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f | |||
5045 | 4885 | ||
5046 | if (tmp->flags & sd_flag) | 4886 | if (tmp->flags & sd_flag) |
5047 | sd = tmp; | 4887 | sd = tmp; |
4888 | else if (!want_affine) | ||
4889 | break; | ||
5048 | } | 4890 | } |
5049 | 4891 | ||
5050 | if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync)) | 4892 | if (affine_sd) { |
5051 | prev_cpu = cpu; | 4893 | sd = NULL; /* Prefer wake_affine over balance flags */ |
5052 | 4894 | if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) | |
5053 | if (sd_flag & SD_BALANCE_WAKE) { | 4895 | new_cpu = cpu; |
5054 | new_cpu = select_idle_sibling(p, prev_cpu); | ||
5055 | goto unlock; | ||
5056 | } | 4896 | } |
5057 | 4897 | ||
5058 | while (sd) { | 4898 | if (!sd) { |
4899 | if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ | ||
4900 | new_cpu = select_idle_sibling(p, new_cpu); | ||
4901 | |||
4902 | } else while (sd) { | ||
5059 | struct sched_group *group; | 4903 | struct sched_group *group; |
5060 | int weight; | 4904 | int weight; |
5061 | 4905 | ||
@@ -5089,7 +4933,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f | |||
5089 | } | 4933 | } |
5090 | /* while loop will break here if sd == NULL */ | 4934 | /* while loop will break here if sd == NULL */ |
5091 | } | 4935 | } |
5092 | unlock: | ||
5093 | rcu_read_unlock(); | 4936 | rcu_read_unlock(); |
5094 | 4937 | ||
5095 | return new_cpu; | 4938 | return new_cpu; |
@@ -5101,26 +4944,27 @@ unlock: | |||
5101 | * previous cpu. However, the caller only guarantees p->pi_lock is held; no | 4944 | * previous cpu. However, the caller only guarantees p->pi_lock is held; no |
5102 | * other assumptions, including the state of rq->lock, should be made. | 4945 | * other assumptions, including the state of rq->lock, should be made. |
5103 | */ | 4946 | */ |
5104 | static void | 4947 | static void migrate_task_rq_fair(struct task_struct *p, int next_cpu) |
5105 | migrate_task_rq_fair(struct task_struct *p, int next_cpu) | ||
5106 | { | 4948 | { |
5107 | struct sched_entity *se = &p->se; | ||
5108 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
5109 | |||
5110 | /* | 4949 | /* |
5111 | * Load tracking: accumulate removed load so that it can be processed | 4950 | * We are supposed to update the task to "current" time so that it is up to |
5112 | * when we next update owning cfs_rq under rq->lock. Tasks contribute | 4951 | * date and ready to go to the new CPU/cfs_rq. But there is no easy way to |
5113 | * to blocked load iff they have a positive decay-count. It can never | 4952 | * get the current time here, so simply throw away the out-of-date time. |
5114 | * be negative here since on-rq tasks have decay-count == 0. | 4953 | * This leaves the wakee task less decayed, but giving the wakee a bit more |
4954 | * load is not a bad trade-off. | ||
5115 | */ | 4955 | */ |
5116 | if (se->avg.decay_count) { | 4956 | remove_entity_load_avg(&p->se); |
5117 | se->avg.decay_count = -__synchronize_entity_decay(se); | 4957 | |
5118 | atomic_long_add(se->avg.load_avg_contrib, | 4958 | /* Tell new CPU we are migrated */ |
5119 | &cfs_rq->removed_load); | 4959 | p->se.avg.last_update_time = 0; |
5120 | } | ||
5121 | 4960 | ||
5122 | /* We have migrated, no longer consider this task hot */ | 4961 | /* We have migrated, no longer consider this task hot */ |
5123 | se->exec_start = 0; | 4962 | p->se.exec_start = 0; |
4963 | } | ||
4964 | |||
4965 | static void task_dead_fair(struct task_struct *p) | ||
4966 | { | ||
4967 | remove_entity_load_avg(&p->se); | ||
5124 | } | 4968 | } |
5125 | #endif /* CONFIG_SMP */ | 4969 | #endif /* CONFIG_SMP */ |
5126 | 4970 | ||
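
Taken together with enqueue_entity_load_avg() earlier in this diff, the p->se.avg.last_update_time = 0 store above forms a small handshake: the source CPU clears the timestamp as a "migrated" sentinel, and the destination's enqueue path sees the zero, skips the catch-up decay, and simply attaches the carried averages. A sketch of that handshake with invented stand-in types (not the kernel structures):

#include <stdio.h>

struct avg { unsigned long long last_update_time; long load_avg; };

static void migrate_out(struct avg *se)
{
        /* the real code first folds se->load_avg into removed_load_avg */
        se->last_update_time = 0;       /* tell the new CPU we migrated */
}

static void enqueue(struct avg *se, struct avg *cfs_rq,
                    unsigned long long now)
{
        if (se->last_update_time == 0) {        /* freshly migrated (or new) */
                se->last_update_time = now;
                cfs_rq->load_avg += se->load_avg;  /* attach carried average */
        } else {
                /* otherwise se would first be decayed up to 'now' */
        }
}

int main(void)
{
        struct avg se = { .last_update_time = 1000, .load_avg = 300 };
        struct avg rq = { .last_update_time = 5000, .load_avg = 700 };

        migrate_out(&se);
        enqueue(&se, &rq, rq.last_update_time);
        printf("rq load_avg=%ld, se synced at %llu\n",
               rq.load_avg, se.last_update_time);
        return 0;
}
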
@@ -5670,72 +5514,39 @@ static int task_hot(struct task_struct *p, struct lb_env *env) | |||
5670 | 5514 | ||
5671 | #ifdef CONFIG_NUMA_BALANCING | 5515 | #ifdef CONFIG_NUMA_BALANCING |
5672 | /* | 5516 | /* |
5673 | * Returns true if the destination node is the preferred node. | 5517 | * Returns 1, if task migration degrades locality |
5674 | * Needs to match fbq_classify_rq(): if there is a runnable task | 5518 | * Returns 0, if task migration improves locality, i.e. migration is preferred. |
5675 | * that is not on its preferred node, we should identify it. | 5519 | * Returns -1, if task migration is not affected by locality. |
5676 | */ | 5520 | */ |
5677 | static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) | 5521 | static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) |
5678 | { | 5522 | { |
5679 | struct numa_group *numa_group = rcu_dereference(p->numa_group); | 5523 | struct numa_group *numa_group = rcu_dereference(p->numa_group); |
5680 | unsigned long src_faults, dst_faults; | 5524 | unsigned long src_faults, dst_faults; |
5681 | int src_nid, dst_nid; | 5525 | int src_nid, dst_nid; |
5682 | 5526 | ||
5683 | if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults || | ||
5684 | !(env->sd->flags & SD_NUMA)) { | ||
5685 | return false; | ||
5686 | } | ||
5687 | |||
5688 | src_nid = cpu_to_node(env->src_cpu); | ||
5689 | dst_nid = cpu_to_node(env->dst_cpu); | ||
5690 | |||
5691 | if (src_nid == dst_nid) | ||
5692 | return false; | ||
5693 | |||
5694 | /* Encourage migration to the preferred node. */ | ||
5695 | if (dst_nid == p->numa_preferred_nid) | ||
5696 | return true; | ||
5697 | |||
5698 | /* Migrating away from the preferred node is bad. */ | ||
5699 | if (src_nid == p->numa_preferred_nid) | ||
5700 | return false; | ||
5701 | |||
5702 | if (numa_group) { | ||
5703 | src_faults = group_faults(p, src_nid); | ||
5704 | dst_faults = group_faults(p, dst_nid); | ||
5705 | } else { | ||
5706 | src_faults = task_faults(p, src_nid); | ||
5707 | dst_faults = task_faults(p, dst_nid); | ||
5708 | } | ||
5709 | |||
5710 | return dst_faults > src_faults; | ||
5711 | } | ||
5712 | |||
5713 | |||
5714 | static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) | ||
5715 | { | ||
5716 | struct numa_group *numa_group = rcu_dereference(p->numa_group); | ||
5717 | unsigned long src_faults, dst_faults; | ||
5718 | int src_nid, dst_nid; | ||
5719 | |||
5720 | if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) | ||
5721 | return false; | ||
5722 | |||
5723 | if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) | 5527 | if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) |
5724 | return false; | 5528 | return -1; |
5529 | |||
5530 | if (!sched_feat(NUMA)) | ||
5531 | return -1; | ||
5725 | 5532 | ||
5726 | src_nid = cpu_to_node(env->src_cpu); | 5533 | src_nid = cpu_to_node(env->src_cpu); |
5727 | dst_nid = cpu_to_node(env->dst_cpu); | 5534 | dst_nid = cpu_to_node(env->dst_cpu); |
5728 | 5535 | ||
5729 | if (src_nid == dst_nid) | 5536 | if (src_nid == dst_nid) |
5730 | return false; | 5537 | return -1; |
5731 | 5538 | ||
5732 | /* Migrating away from the preferred node is bad. */ | 5539 | /* Migrating away from the preferred node is always bad. */ |
5733 | if (src_nid == p->numa_preferred_nid) | 5540 | if (src_nid == p->numa_preferred_nid) { |
5734 | return true; | 5541 | if (env->src_rq->nr_running > env->src_rq->nr_preferred_running) |
5542 | return 1; | ||
5543 | else | ||
5544 | return -1; | ||
5545 | } | ||
5735 | 5546 | ||
5736 | /* Encourage migration to the preferred node. */ | 5547 | /* Encourage migration to the preferred node. */ |
5737 | if (dst_nid == p->numa_preferred_nid) | 5548 | if (dst_nid == p->numa_preferred_nid) |
5738 | return false; | 5549 | return 0; |
5739 | 5550 | ||
5740 | if (numa_group) { | 5551 | if (numa_group) { |
5741 | src_faults = group_faults(p, src_nid); | 5552 | src_faults = group_faults(p, src_nid); |
@@ -5749,16 +5560,10 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) | |||
5749 | } | 5560 | } |
5750 | 5561 | ||
5751 | #else | 5562 | #else |
5752 | static inline bool migrate_improves_locality(struct task_struct *p, | 5563 | static inline int migrate_degrades_locality(struct task_struct *p, |
5753 | struct lb_env *env) | 5564 | struct lb_env *env) |
5754 | { | 5565 | { |
5755 | return false; | 5566 | return -1; |
5756 | } | ||
5757 | |||
5758 | static inline bool migrate_degrades_locality(struct task_struct *p, | ||
5759 | struct lb_env *env) | ||
5760 | { | ||
5761 | return false; | ||
5762 | } | 5567 | } |
5763 | #endif | 5568 | #endif |
5764 | 5569 | ||
@@ -5768,7 +5573,7 @@ static inline bool migrate_degrades_locality(struct task_struct *p, | |||
5768 | static | 5573 | static |
5769 | int can_migrate_task(struct task_struct *p, struct lb_env *env) | 5574 | int can_migrate_task(struct task_struct *p, struct lb_env *env) |
5770 | { | 5575 | { |
5771 | int tsk_cache_hot = 0; | 5576 | int tsk_cache_hot; |
5772 | 5577 | ||
5773 | lockdep_assert_held(&env->src_rq->lock); | 5578 | lockdep_assert_held(&env->src_rq->lock); |
5774 | 5579 | ||
@@ -5826,13 +5631,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
5826 | * 2) task is cache cold, or | 5631 | * 2) task is cache cold, or |
5827 | * 3) too many balance attempts have failed. | 5632 | * 3) too many balance attempts have failed. |
5828 | */ | 5633 | */ |
5829 | tsk_cache_hot = task_hot(p, env); | 5634 | tsk_cache_hot = migrate_degrades_locality(p, env); |
5830 | if (!tsk_cache_hot) | 5635 | if (tsk_cache_hot == -1) |
5831 | tsk_cache_hot = migrate_degrades_locality(p, env); | 5636 | tsk_cache_hot = task_hot(p, env); |
5832 | 5637 | ||
5833 | if (migrate_improves_locality(p, env) || !tsk_cache_hot || | 5638 | if (tsk_cache_hot <= 0 || |
5834 | env->sd->nr_balance_failed > env->sd->cache_nice_tries) { | 5639 | env->sd->nr_balance_failed > env->sd->cache_nice_tries) { |
5835 | if (tsk_cache_hot) { | 5640 | if (tsk_cache_hot == 1) { |
5836 | schedstat_inc(env->sd, lb_hot_gained[env->idle]); | 5641 | schedstat_inc(env->sd, lb_hot_gained[env->idle]); |
5837 | schedstat_inc(p, se.statistics.nr_forced_migrations); | 5642 | schedstat_inc(p, se.statistics.nr_forced_migrations); |
5838 | } | 5643 | } |
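
The tri-state return value is what the rewritten can_migrate_task() above consumes: -1 means locality has no opinion and the plain cache-hot test decides, 0 means the move improves locality, and 1 means it degrades it. A condensed stand-alone model of that decision (the nr_preferred_running refinement and the group/task fault split are omitted):

#include <stdio.h>

static int migrate_degrades_locality(int src_nid, int dst_nid,
                                     int preferred_nid,
                                     long src_faults, long dst_faults)
{
        if (src_nid == dst_nid)
                return -1;              /* same node: locality is indifferent */
        if (src_nid == preferred_nid)
                return 1;               /* leaving the preferred node */
        if (dst_nid == preferred_nid)
                return 0;               /* moving onto the preferred node */
        return dst_faults > src_faults ? 0 : 1;
}

static int can_migrate(int degrades, int cache_hot, int balance_failed)
{
        int hot = degrades;

        if (hot == -1)                  /* locality undecided: ask the cache */
                hot = cache_hot;
        return hot <= 0 || balance_failed;  /* allow cold or forced migrations */
}

int main(void)
{
        printf("%d\n", can_migrate(migrate_degrades_locality(0, 1, 1, 10, 50),
                                   1, 0));  /* 1: dst is the preferred node */
        printf("%d\n", can_migrate(migrate_degrades_locality(0, 1, 0, 50, 10),
                                   0, 0));  /* 0: would leave preferred node */
        return 0;
}
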
@@ -5906,6 +5711,13 @@ static int detach_tasks(struct lb_env *env) | |||
5906 | return 0; | 5711 | return 0; |
5907 | 5712 | ||
5908 | while (!list_empty(tasks)) { | 5713 | while (!list_empty(tasks)) { |
5714 | /* | ||
5715 | * We don't want to steal all the tasks, otherwise we may be treated | ||
5716 | * likewise, which could at worst lead to a livelock. | ||
5717 | */ | ||
5718 | if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1) | ||
5719 | break; | ||
5720 | |||
5909 | p = list_first_entry(tasks, struct task_struct, se.group_node); | 5721 | p = list_first_entry(tasks, struct task_struct, se.group_node); |
5910 | 5722 | ||
5911 | env->loop++; | 5723 | env->loop++; |
@@ -6015,39 +5827,6 @@ static void attach_tasks(struct lb_env *env) | |||
6015 | } | 5827 | } |
6016 | 5828 | ||
6017 | #ifdef CONFIG_FAIR_GROUP_SCHED | 5829 | #ifdef CONFIG_FAIR_GROUP_SCHED |
6018 | /* | ||
6019 | * update tg->load_weight by folding this cpu's load_avg | ||
6020 | */ | ||
6021 | static void __update_blocked_averages_cpu(struct task_group *tg, int cpu) | ||
6022 | { | ||
6023 | struct sched_entity *se = tg->se[cpu]; | ||
6024 | struct cfs_rq *cfs_rq = tg->cfs_rq[cpu]; | ||
6025 | |||
6026 | /* throttled entities do not contribute to load */ | ||
6027 | if (throttled_hierarchy(cfs_rq)) | ||
6028 | return; | ||
6029 | |||
6030 | update_cfs_rq_blocked_load(cfs_rq, 1); | ||
6031 | |||
6032 | if (se) { | ||
6033 | update_entity_load_avg(se, 1); | ||
6034 | /* | ||
6035 | * We pivot on our runnable average having decayed to zero for | ||
6036 | * list removal. This generally implies that all our children | ||
6037 | * have also been removed (modulo rounding error or bandwidth | ||
6038 | * control); however, such cases are rare and we can fix these | ||
6039 | * at enqueue. | ||
6040 | * | ||
6041 | * TODO: fix up out-of-order children on enqueue. | ||
6042 | */ | ||
6043 | if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running) | ||
6044 | list_del_leaf_cfs_rq(cfs_rq); | ||
6045 | } else { | ||
6046 | struct rq *rq = rq_of(cfs_rq); | ||
6047 | update_rq_runnable_avg(rq, rq->nr_running); | ||
6048 | } | ||
6049 | } | ||
6050 | |||
6051 | static void update_blocked_averages(int cpu) | 5830 | static void update_blocked_averages(int cpu) |
6052 | { | 5831 | { |
6053 | struct rq *rq = cpu_rq(cpu); | 5832 | struct rq *rq = cpu_rq(cpu); |
@@ -6056,19 +5835,19 @@ static void update_blocked_averages(int cpu) | |||
6056 | 5835 | ||
6057 | raw_spin_lock_irqsave(&rq->lock, flags); | 5836 | raw_spin_lock_irqsave(&rq->lock, flags); |
6058 | update_rq_clock(rq); | 5837 | update_rq_clock(rq); |
5838 | |||
6059 | /* | 5839 | /* |
6060 | * Iterates the task_group tree in a bottom up fashion, see | 5840 | * Iterates the task_group tree in a bottom up fashion, see |
6061 | * list_add_leaf_cfs_rq() for details. | 5841 | * list_add_leaf_cfs_rq() for details. |
6062 | */ | 5842 | */ |
6063 | for_each_leaf_cfs_rq(rq, cfs_rq) { | 5843 | for_each_leaf_cfs_rq(rq, cfs_rq) { |
6064 | /* | 5844 | /* throttled entities do not contribute to load */ |
6065 | * Note: We may want to consider periodically releasing | 5845 | if (throttled_hierarchy(cfs_rq)) |
6066 | * rq->lock about these updates so that creating many task | 5846 | continue; |
6067 | * groups does not result in continually extending hold time. | ||
6068 | */ | ||
6069 | __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu); | ||
6070 | } | ||
6071 | 5847 | ||
5848 | if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq)) | ||
5849 | update_tg_load_avg(cfs_rq, 0); | ||
5850 | } | ||
6072 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 5851 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
6073 | } | 5852 | } |
6074 | 5853 | ||
@@ -6096,14 +5875,14 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq) | |||
6096 | } | 5875 | } |
6097 | 5876 | ||
6098 | if (!se) { | 5877 | if (!se) { |
6099 | cfs_rq->h_load = cfs_rq->runnable_load_avg; | 5878 | cfs_rq->h_load = cfs_rq_load_avg(cfs_rq); |
6100 | cfs_rq->last_h_load_update = now; | 5879 | cfs_rq->last_h_load_update = now; |
6101 | } | 5880 | } |
6102 | 5881 | ||
6103 | while ((se = cfs_rq->h_load_next) != NULL) { | 5882 | while ((se = cfs_rq->h_load_next) != NULL) { |
6104 | load = cfs_rq->h_load; | 5883 | load = cfs_rq->h_load; |
6105 | load = div64_ul(load * se->avg.load_avg_contrib, | 5884 | load = div64_ul(load * se->avg.load_avg, |
6106 | cfs_rq->runnable_load_avg + 1); | 5885 | cfs_rq_load_avg(cfs_rq) + 1); |
6107 | cfs_rq = group_cfs_rq(se); | 5886 | cfs_rq = group_cfs_rq(se); |
6108 | cfs_rq->h_load = load; | 5887 | cfs_rq->h_load = load; |
6109 | cfs_rq->last_h_load_update = now; | 5888 | cfs_rq->last_h_load_update = now; |
@@ -6115,17 +5894,25 @@ static unsigned long task_h_load(struct task_struct *p) | |||
6115 | struct cfs_rq *cfs_rq = task_cfs_rq(p); | 5894 | struct cfs_rq *cfs_rq = task_cfs_rq(p); |
6116 | 5895 | ||
6117 | update_cfs_rq_h_load(cfs_rq); | 5896 | update_cfs_rq_h_load(cfs_rq); |
6118 | return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load, | 5897 | return div64_ul(p->se.avg.load_avg * cfs_rq->h_load, |
6119 | cfs_rq->runnable_load_avg + 1); | 5898 | cfs_rq_load_avg(cfs_rq) + 1); |
6120 | } | 5899 | } |
6121 | #else | 5900 | #else |
6122 | static inline void update_blocked_averages(int cpu) | 5901 | static inline void update_blocked_averages(int cpu) |
6123 | { | 5902 | { |
5903 | struct rq *rq = cpu_rq(cpu); | ||
5904 | struct cfs_rq *cfs_rq = &rq->cfs; | ||
5905 | unsigned long flags; | ||
5906 | |||
5907 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
5908 | update_rq_clock(rq); | ||
5909 | update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); | ||
5910 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
6124 | } | 5911 | } |
6125 | 5912 | ||
6126 | static unsigned long task_h_load(struct task_struct *p) | 5913 | static unsigned long task_h_load(struct task_struct *p) |
6127 | { | 5914 | { |
6128 | return p->se.avg.load_avg_contrib; | 5915 | return p->se.avg.load_avg; |
6129 | } | 5916 | } |
6130 | #endif | 5917 | #endif |
6131 | 5918 | ||
@@ -8025,8 +7812,6 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) | |||
8025 | 7812 | ||
8026 | if (numabalancing_enabled) | 7813 | if (numabalancing_enabled) |
8027 | task_tick_numa(rq, curr); | 7814 | task_tick_numa(rq, curr); |
8028 | |||
8029 | update_rq_runnable_avg(rq, 1); | ||
8030 | } | 7815 | } |
8031 | 7816 | ||
8032 | /* | 7817 | /* |
@@ -8125,15 +7910,18 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) | |||
8125 | } | 7910 | } |
8126 | 7911 | ||
8127 | #ifdef CONFIG_SMP | 7912 | #ifdef CONFIG_SMP |
8128 | /* | 7913 | /* Catch up with the cfs_rq and remove our load when we leave */ |
8129 | * Remove our load from contribution when we leave sched_fair | 7914 | __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq), &se->avg, |
8130 | * and ensure we don't carry in an old decay_count if we | 7915 | se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL); |
8131 | * switch back. | 7916 | |
8132 | */ | 7917 | cfs_rq->avg.load_avg = |
8133 | if (se->avg.decay_count) { | 7918 | max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0); |
8134 | __synchronize_entity_decay(se); | 7919 | cfs_rq->avg.load_sum = |
8135 | subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib); | 7920 | max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0); |
8136 | } | 7921 | cfs_rq->avg.util_avg = |
7922 | max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0); | ||
7923 | cfs_rq->avg.util_sum = | ||
7924 | max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0); | ||
8137 | #endif | 7925 | #endif |
8138 | } | 7926 | } |
8139 | 7927 | ||
@@ -8142,16 +7930,31 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) | |||
8142 | */ | 7930 | */ |
8143 | static void switched_to_fair(struct rq *rq, struct task_struct *p) | 7931 | static void switched_to_fair(struct rq *rq, struct task_struct *p) |
8144 | { | 7932 | { |
8145 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
8146 | struct sched_entity *se = &p->se; | 7933 | struct sched_entity *se = &p->se; |
7934 | |||
7935 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
8147 | /* | 7936 | /* |
8148 | * Since the real-depth could have been changed (only FAIR | 7937 | * Since the real-depth could have been changed (only FAIR |
8149 | * class maintain depth value), reset depth properly. | 7938 | * class maintain depth value), reset depth properly. |
8150 | */ | 7939 | */ |
8151 | se->depth = se->parent ? se->parent->depth + 1 : 0; | 7940 | se->depth = se->parent ? se->parent->depth + 1 : 0; |
8152 | #endif | 7941 | #endif |
8153 | if (!task_on_rq_queued(p)) | 7942 | |
7943 | if (!task_on_rq_queued(p)) { | ||
7944 | |||
7945 | /* | ||
7946 | * Ensure the task has a non-normalized vruntime when it is switched | ||
7947 | * back to the fair class with !queued, so that enqueue_entity() at | ||
7948 | * wake-up time will do the right thing. | ||
7949 | * | ||
7950 | * If it's queued, then the enqueue_entity(.flags=0) makes the task | ||
7951 | * has non-normalized vruntime, if it's !queued, then it still has | ||
7952 | * normalized vruntime. | ||
7953 | */ | ||
7954 | if (p->state != TASK_RUNNING) | ||
7955 | se->vruntime += cfs_rq_of(se)->min_vruntime; | ||
8154 | return; | 7956 | return; |
7957 | } | ||
8155 | 7958 | ||
8156 | /* | 7959 | /* |
8157 | * We were most likely switched from sched_rt, so | 7960 | * We were most likely switched from sched_rt, so |
@@ -8190,8 +7993,8 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) | |||
8190 | cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; | 7993 | cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; |
8191 | #endif | 7994 | #endif |
8192 | #ifdef CONFIG_SMP | 7995 | #ifdef CONFIG_SMP |
8193 | atomic64_set(&cfs_rq->decay_counter, 1); | 7996 | atomic_long_set(&cfs_rq->removed_load_avg, 0); |
8194 | atomic_long_set(&cfs_rq->removed_load, 0); | 7997 | atomic_long_set(&cfs_rq->removed_util_avg, 0); |
8195 | #endif | 7998 | #endif |
8196 | } | 7999 | } |
8197 | 8000 | ||
@@ -8236,14 +8039,14 @@ static void task_move_group_fair(struct task_struct *p, int queued) | |||
8236 | if (!queued) { | 8039 | if (!queued) { |
8237 | cfs_rq = cfs_rq_of(se); | 8040 | cfs_rq = cfs_rq_of(se); |
8238 | se->vruntime += cfs_rq->min_vruntime; | 8041 | se->vruntime += cfs_rq->min_vruntime; |
8042 | |||
8239 | #ifdef CONFIG_SMP | 8043 | #ifdef CONFIG_SMP |
8240 | /* | 8044 | /* Virtually synchronize task with its new cfs_rq */ |
8241 | * migrate_task_rq_fair() will have removed our previous | 8045 | p->se.avg.last_update_time = cfs_rq->avg.last_update_time; |
8242 | * contribution, but we must synchronize for ongoing future | 8046 | cfs_rq->avg.load_avg += p->se.avg.load_avg; |
8243 | * decay. | 8047 | cfs_rq->avg.load_sum += p->se.avg.load_sum; |
8244 | */ | 8048 | cfs_rq->avg.util_avg += p->se.avg.util_avg; |
8245 | se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); | 8049 | cfs_rq->avg.util_sum += p->se.avg.util_sum; |
8246 | cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; | ||
8247 | #endif | 8050 | #endif |
8248 | } | 8051 | } |
8249 | } | 8052 | } |
@@ -8257,8 +8060,11 @@ void free_fair_sched_group(struct task_group *tg) | |||
8257 | for_each_possible_cpu(i) { | 8060 | for_each_possible_cpu(i) { |
8258 | if (tg->cfs_rq) | 8061 | if (tg->cfs_rq) |
8259 | kfree(tg->cfs_rq[i]); | 8062 | kfree(tg->cfs_rq[i]); |
8260 | if (tg->se) | 8063 | if (tg->se) { |
8064 | if (tg->se[i]) | ||
8065 | remove_entity_load_avg(tg->se[i]); | ||
8261 | kfree(tg->se[i]); | 8066 | kfree(tg->se[i]); |
8067 | } | ||
8262 | } | 8068 | } |
8263 | 8069 | ||
8264 | kfree(tg->cfs_rq); | 8070 | kfree(tg->cfs_rq); |
@@ -8295,6 +8101,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
8295 | 8101 | ||
8296 | init_cfs_rq(cfs_rq); | 8102 | init_cfs_rq(cfs_rq); |
8297 | init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); | 8103 | init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); |
8104 | init_entity_runnable_average(se); | ||
8298 | } | 8105 | } |
8299 | 8106 | ||
8300 | return 1; | 8107 | return 1; |
@@ -8444,6 +8251,8 @@ const struct sched_class fair_sched_class = { | |||
8444 | .rq_offline = rq_offline_fair, | 8251 | .rq_offline = rq_offline_fair, |
8445 | 8252 | ||
8446 | .task_waking = task_waking_fair, | 8253 | .task_waking = task_waking_fair, |
8254 | .task_dead = task_dead_fair, | ||
8255 | .set_cpus_allowed = set_cpus_allowed_common, | ||
8447 | #endif | 8256 | #endif |
8448 | 8257 | ||
8449 | .set_curr_task = set_curr_task_fair, | 8258 | .set_curr_task = set_curr_task_fair, |
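
Note on the fair.c hunks above: switched_from_fair() no longer keeps a per-entity decay count; it synchronizes the entity once via __update_load_avg() and then subtracts its load/util contribution from the cfs_rq, clamping each aggregate at zero so a stale, slightly-too-large contribution cannot drive the sums below zero. A minimal standalone C sketch of that clamping pattern (the struct and values here are invented for illustration, not taken from the kernel sources):

	#include <stdio.h>

	struct avg { long load_avg; long util_avg; };

	/* Equivalent of max_t(long, sum - contrib, 0). */
	static long sub_clamped(long sum, long contrib)
	{
		long res = sum - contrib;
		return res > 0 ? res : 0;
	}

	int main(void)
	{
		struct avg rq = { .load_avg = 900,  .util_avg = 300 };
		struct avg se = { .load_avg = 1000, .util_avg = 250 }; /* stale, larger than rq's */

		rq.load_avg = sub_clamped(rq.load_avg, se.load_avg);
		rq.util_avg = sub_clamped(rq.util_avg, se.util_avg);

		printf("load_avg=%ld util_avg=%ld\n", rq.load_avg, rq.util_avg);
		return 0;	/* prints load_avg=0 util_avg=50 */
	}
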
diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 91e33cd485f6..83a50e7ca533 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h | |||
@@ -79,20 +79,12 @@ SCHED_FEAT(LB_MIN, false) | |||
79 | * numa_balancing= | 79 | * numa_balancing= |
80 | */ | 80 | */ |
81 | #ifdef CONFIG_NUMA_BALANCING | 81 | #ifdef CONFIG_NUMA_BALANCING |
82 | SCHED_FEAT(NUMA, false) | ||
83 | 82 | ||
84 | /* | 83 | /* |
85 | * NUMA_FAVOUR_HIGHER will favor moving tasks towards nodes where a | 84 | * NUMA will favor moving tasks towards nodes where a higher number of |
86 | * higher number of hinting faults are recorded during active load | 85 | * hinting faults are recorded during active load balancing. It will |
87 | * balancing. | 86 | * resist moving tasks towards nodes where a lower number of hinting |
87 | * faults have been recorded. | ||
88 | */ | 88 | */ |
89 | SCHED_FEAT(NUMA_FAVOUR_HIGHER, true) | 89 | SCHED_FEAT(NUMA, true) |
90 | |||
91 | /* | ||
92 | * NUMA_RESIST_LOWER will resist moving tasks towards nodes where a | ||
93 | * lower number of hinting faults have been recorded. As this has | ||
94 | * the potential to prevent a task ever migrating to a new node | ||
95 | * due to CPU overload it is disabled by default. | ||
96 | */ | ||
97 | SCHED_FEAT(NUMA_RESIST_LOWER, false) | ||
98 | #endif | 90 | #endif |
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 594275ed2620..8f177c73ae19 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c | |||
@@ -83,10 +83,13 @@ void __weak arch_cpu_idle(void) | |||
83 | */ | 83 | */ |
84 | void default_idle_call(void) | 84 | void default_idle_call(void) |
85 | { | 85 | { |
86 | if (current_clr_polling_and_test()) | 86 | if (current_clr_polling_and_test()) { |
87 | local_irq_enable(); | 87 | local_irq_enable(); |
88 | else | 88 | } else { |
89 | stop_critical_timings(); | ||
89 | arch_cpu_idle(); | 90 | arch_cpu_idle(); |
91 | start_critical_timings(); | ||
92 | } | ||
90 | } | 93 | } |
91 | 94 | ||
92 | static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev, | 95 | static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev, |
@@ -141,12 +144,6 @@ static void cpuidle_idle_call(void) | |||
141 | } | 144 | } |
142 | 145 | ||
143 | /* | 146 | /* |
144 | * During the idle period, stop measuring the disabled irqs | ||
145 | * critical sections latencies | ||
146 | */ | ||
147 | stop_critical_timings(); | ||
148 | |||
149 | /* | ||
150 | * Tell the RCU framework we are entering an idle section, | 147 | * Tell the RCU framework we are entering an idle section, |
151 | * so no more rcu read side critical sections and one more | 148 | * so no more rcu read side critical sections and one more |
152 | * step to the grace period | 149 | * step to the grace period |
@@ -198,7 +195,6 @@ exit_idle: | |||
198 | local_irq_enable(); | 195 | local_irq_enable(); |
199 | 196 | ||
200 | rcu_idle_exit(); | 197 | rcu_idle_exit(); |
201 | start_critical_timings(); | ||
202 | } | 198 | } |
203 | 199 | ||
204 | DEFINE_PER_CPU(bool, cpu_dead_idle); | 200 | DEFINE_PER_CPU(bool, cpu_dead_idle); |
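
The idle.c change narrows the window during which the irqsoff/preemptoff latency tracers are suspended: stop_critical_timings()/start_critical_timings() now bracket only the arch_cpu_idle() call inside default_idle_call(), instead of the whole cpuidle_idle_call() path. A self-contained sketch of the resulting shape, with the kernel helpers reduced to empty stubs so it compiles on its own (the stubs are placeholders, not the real implementations):

	/* Stubs standing in for the real kernel helpers. */
	static int  current_clr_polling_and_test(void) { return 0; }
	static void local_irq_enable(void)             { }
	static void stop_critical_timings(void)        { /* pause irqs-off latency measurement */ }
	static void start_critical_timings(void)       { /* resume it */ }
	static void arch_cpu_idle(void)                { /* architecture low-power wait */ }

	void default_idle_call(void)
	{
		if (current_clr_polling_and_test()) {
			local_irq_enable();
		} else {
			/* Idle residency must not be reported as an irqs-off latency. */
			stop_critical_timings();
			arch_cpu_idle();
			start_critical_timings();
		}
	}
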
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index c65dac8c97cd..c4ae0f1fdf9b 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c | |||
@@ -96,6 +96,7 @@ const struct sched_class idle_sched_class = { | |||
96 | 96 | ||
97 | #ifdef CONFIG_SMP | 97 | #ifdef CONFIG_SMP |
98 | .select_task_rq = select_task_rq_idle, | 98 | .select_task_rq = select_task_rq_idle, |
99 | .set_cpus_allowed = set_cpus_allowed_common, | ||
99 | #endif | 100 | #endif |
100 | 101 | ||
101 | .set_curr_task = set_curr_task_idle, | 102 | .set_curr_task = set_curr_task_idle, |
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 0d193a243e96..d2ea59364a1c 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
@@ -2069,7 +2069,6 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) | |||
2069 | { | 2069 | { |
2070 | if (!task_running(rq, p) && | 2070 | if (!task_running(rq, p) && |
2071 | !test_tsk_need_resched(rq->curr) && | 2071 | !test_tsk_need_resched(rq->curr) && |
2072 | has_pushable_tasks(rq) && | ||
2073 | p->nr_cpus_allowed > 1 && | 2072 | p->nr_cpus_allowed > 1 && |
2074 | (dl_task(rq->curr) || rt_task(rq->curr)) && | 2073 | (dl_task(rq->curr) || rt_task(rq->curr)) && |
2075 | (rq->curr->nr_cpus_allowed < 2 || | 2074 | (rq->curr->nr_cpus_allowed < 2 || |
@@ -2077,45 +2076,6 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) | |||
2077 | push_rt_tasks(rq); | 2076 | push_rt_tasks(rq); |
2078 | } | 2077 | } |
2079 | 2078 | ||
2080 | static void set_cpus_allowed_rt(struct task_struct *p, | ||
2081 | const struct cpumask *new_mask) | ||
2082 | { | ||
2083 | struct rq *rq; | ||
2084 | int weight; | ||
2085 | |||
2086 | BUG_ON(!rt_task(p)); | ||
2087 | |||
2088 | if (!task_on_rq_queued(p)) | ||
2089 | return; | ||
2090 | |||
2091 | weight = cpumask_weight(new_mask); | ||
2092 | |||
2093 | /* | ||
2094 | * Only update if the process changes its state from whether it | ||
2095 | * can migrate or not. | ||
2096 | */ | ||
2097 | if ((p->nr_cpus_allowed > 1) == (weight > 1)) | ||
2098 | return; | ||
2099 | |||
2100 | rq = task_rq(p); | ||
2101 | |||
2102 | /* | ||
2103 | * The process used to be able to migrate OR it can now migrate | ||
2104 | */ | ||
2105 | if (weight <= 1) { | ||
2106 | if (!task_current(rq, p)) | ||
2107 | dequeue_pushable_task(rq, p); | ||
2108 | BUG_ON(!rq->rt.rt_nr_migratory); | ||
2109 | rq->rt.rt_nr_migratory--; | ||
2110 | } else { | ||
2111 | if (!task_current(rq, p)) | ||
2112 | enqueue_pushable_task(rq, p); | ||
2113 | rq->rt.rt_nr_migratory++; | ||
2114 | } | ||
2115 | |||
2116 | update_rt_migration(&rq->rt); | ||
2117 | } | ||
2118 | |||
2119 | /* Assumes rq->lock is held */ | 2079 | /* Assumes rq->lock is held */ |
2120 | static void rq_online_rt(struct rq *rq) | 2080 | static void rq_online_rt(struct rq *rq) |
2121 | { | 2081 | { |
@@ -2324,7 +2284,7 @@ const struct sched_class rt_sched_class = { | |||
2324 | #ifdef CONFIG_SMP | 2284 | #ifdef CONFIG_SMP |
2325 | .select_task_rq = select_task_rq_rt, | 2285 | .select_task_rq = select_task_rq_rt, |
2326 | 2286 | ||
2327 | .set_cpus_allowed = set_cpus_allowed_rt, | 2287 | .set_cpus_allowed = set_cpus_allowed_common, |
2328 | .rq_online = rq_online_rt, | 2288 | .rq_online = rq_online_rt, |
2329 | .rq_offline = rq_offline_rt, | 2289 | .rq_offline = rq_offline_rt, |
2330 | .task_woken = task_woken_rt, | 2290 | .task_woken = task_woken_rt, |
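
With this hunk the RT class drops its private set_cpus_allowed_rt() and points the hook at the shared set_cpus_allowed_common() instead; the migratory-task accounting it used to do here is handled elsewhere now. The common helper lives in kernel/sched/core.c and is not part of this diff, so the sketch below is an assumption about its shape rather than a quote of the real code:

	/* Assumed shape of the shared hook: record the new mask on the task and
	 * cache its weight; anything class-specific happens elsewhere. */
	void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
	{
		cpumask_copy(&p->cpus_allowed, new_mask);
		p->nr_cpus_allowed = cpumask_weight(new_mask);
	}
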
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 84d48790bb6d..68cda117574c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -245,7 +245,6 @@ struct task_group { | |||
245 | 245 | ||
246 | #ifdef CONFIG_SMP | 246 | #ifdef CONFIG_SMP |
247 | atomic_long_t load_avg; | 247 | atomic_long_t load_avg; |
248 | atomic_t runnable_avg; | ||
249 | #endif | 248 | #endif |
250 | #endif | 249 | #endif |
251 | 250 | ||
@@ -366,27 +365,20 @@ struct cfs_rq { | |||
366 | 365 | ||
367 | #ifdef CONFIG_SMP | 366 | #ifdef CONFIG_SMP |
368 | /* | 367 | /* |
369 | * CFS Load tracking | 368 | * CFS load tracking |
370 | * Under CFS, load is tracked on a per-entity basis and aggregated up. | ||
371 | * This allows for the description of both thread and group usage (in | ||
372 | * the FAIR_GROUP_SCHED case). | ||
373 | * runnable_load_avg is the sum of the load_avg_contrib of the | ||
374 | * sched_entities on the rq. | ||
375 | * blocked_load_avg is similar to runnable_load_avg except that its | ||
376 | * the blocked sched_entities on the rq. | ||
377 | * utilization_load_avg is the sum of the average running time of the | ||
378 | * sched_entities on the rq. | ||
379 | */ | 369 | */ |
380 | unsigned long runnable_load_avg, blocked_load_avg, utilization_load_avg; | 370 | struct sched_avg avg; |
381 | atomic64_t decay_counter; | 371 | u64 runnable_load_sum; |
382 | u64 last_decay; | 372 | unsigned long runnable_load_avg; |
383 | atomic_long_t removed_load; | ||
384 | |||
385 | #ifdef CONFIG_FAIR_GROUP_SCHED | 373 | #ifdef CONFIG_FAIR_GROUP_SCHED |
386 | /* Required to track per-cpu representation of a task_group */ | 374 | unsigned long tg_load_avg_contrib; |
387 | u32 tg_runnable_contrib; | 375 | #endif |
388 | unsigned long tg_load_contrib; | 376 | atomic_long_t removed_load_avg, removed_util_avg; |
377 | #ifndef CONFIG_64BIT | ||
378 | u64 load_last_update_time_copy; | ||
379 | #endif | ||
389 | 380 | ||
381 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
390 | /* | 382 | /* |
391 | * h_load = weight * f(tg) | 383 | * h_load = weight * f(tg) |
392 | * | 384 | * |
@@ -595,8 +587,6 @@ struct rq { | |||
595 | #ifdef CONFIG_FAIR_GROUP_SCHED | 587 | #ifdef CONFIG_FAIR_GROUP_SCHED |
596 | /* list of leaf cfs_rq on this cpu: */ | 588 | /* list of leaf cfs_rq on this cpu: */ |
597 | struct list_head leaf_cfs_rq_list; | 589 | struct list_head leaf_cfs_rq_list; |
598 | |||
599 | struct sched_avg avg; | ||
600 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 590 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
601 | 591 | ||
602 | /* | 592 | /* |
@@ -1065,9 +1055,6 @@ static inline int task_on_rq_migrating(struct task_struct *p) | |||
1065 | #ifndef prepare_arch_switch | 1055 | #ifndef prepare_arch_switch |
1066 | # define prepare_arch_switch(next) do { } while (0) | 1056 | # define prepare_arch_switch(next) do { } while (0) |
1067 | #endif | 1057 | #endif |
1068 | #ifndef finish_arch_switch | ||
1069 | # define finish_arch_switch(prev) do { } while (0) | ||
1070 | #endif | ||
1071 | #ifndef finish_arch_post_lock_switch | 1058 | #ifndef finish_arch_post_lock_switch |
1072 | # define finish_arch_post_lock_switch() do { } while (0) | 1059 | # define finish_arch_post_lock_switch() do { } while (0) |
1073 | #endif | 1060 | #endif |
@@ -1268,6 +1255,8 @@ extern void trigger_load_balance(struct rq *rq); | |||
1268 | extern void idle_enter_fair(struct rq *this_rq); | 1255 | extern void idle_enter_fair(struct rq *this_rq); |
1269 | extern void idle_exit_fair(struct rq *this_rq); | 1256 | extern void idle_exit_fair(struct rq *this_rq); |
1270 | 1257 | ||
1258 | extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask); | ||
1259 | |||
1271 | #else | 1260 | #else |
1272 | 1261 | ||
1273 | static inline void idle_enter_fair(struct rq *rq) { } | 1262 | static inline void idle_enter_fair(struct rq *rq) { } |
@@ -1319,7 +1308,7 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se); | |||
1319 | 1308 | ||
1320 | unsigned long to_ratio(u64 period, u64 runtime); | 1309 | unsigned long to_ratio(u64 period, u64 runtime); |
1321 | 1310 | ||
1322 | extern void init_task_runnable_average(struct task_struct *p); | 1311 | extern void init_entity_runnable_average(struct sched_entity *se); |
1323 | 1312 | ||
1324 | static inline void add_nr_running(struct rq *rq, unsigned count) | 1313 | static inline void add_nr_running(struct rq *rq, unsigned count) |
1325 | { | 1314 | { |
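
The cfs_rq now embeds a struct sched_avg directly, and on 32-bit kernels it keeps a load_last_update_time_copy shadow of the 64-bit timestamp, because a 32-bit CPU cannot load a u64 atomically. Readers are expected to use the usual copy/retry pairing; the helper below is a sketch of that read side under the assumption that the writer updates the shadow after the real field with a write barrier in between (the function name is illustrative):

	#ifndef CONFIG_64BIT
	/* Retry until the timestamp and its shadow copy agree, i.e. no torn read. */
	static u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
	{
		u64 copy, val;

		do {
			copy = cfs_rq->load_last_update_time_copy;
			smp_rmb();			/* pairs with the writer's smp_wmb() */
			val = cfs_rq->avg.last_update_time;
		} while (val != copy);

		return val;
	}
	#endif
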
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 79ffec45a6ac..cbc67da10954 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c | |||
@@ -123,6 +123,7 @@ const struct sched_class stop_sched_class = { | |||
123 | 123 | ||
124 | #ifdef CONFIG_SMP | 124 | #ifdef CONFIG_SMP |
125 | .select_task_rq = select_task_rq_stop, | 125 | .select_task_rq = select_task_rq_stop, |
126 | .set_cpus_allowed = set_cpus_allowed_common, | ||
126 | #endif | 127 | #endif |
127 | 128 | ||
128 | .set_curr_task = set_curr_task_stop, | 129 | .set_curr_task = set_curr_task_stop, |
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 052e02672d12..272d9322bc5d 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c | |||
@@ -106,9 +106,10 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr) | |||
106 | } | 106 | } |
107 | EXPORT_SYMBOL_GPL(__wake_up_locked); | 107 | EXPORT_SYMBOL_GPL(__wake_up_locked); |
108 | 108 | ||
109 | void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) | 109 | void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, int nr, |
110 | void *key) | ||
110 | { | 111 | { |
111 | __wake_up_common(q, mode, 1, 0, key); | 112 | __wake_up_common(q, mode, nr, 0, key); |
112 | } | 113 | } |
113 | EXPORT_SYMBOL_GPL(__wake_up_locked_key); | 114 | EXPORT_SYMBOL_GPL(__wake_up_locked_key); |
114 | 115 | ||
@@ -283,7 +284,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, | |||
283 | if (!list_empty(&wait->task_list)) | 284 | if (!list_empty(&wait->task_list)) |
284 | list_del_init(&wait->task_list); | 285 | list_del_init(&wait->task_list); |
285 | else if (waitqueue_active(q)) | 286 | else if (waitqueue_active(q)) |
286 | __wake_up_locked_key(q, mode, key); | 287 | __wake_up_locked_key(q, mode, 1, key); |
287 | spin_unlock_irqrestore(&q->lock, flags); | 288 | spin_unlock_irqrestore(&q->lock, flags); |
288 | } | 289 | } |
289 | EXPORT_SYMBOL(abort_exclusive_wait); | 290 | EXPORT_SYMBOL(abort_exclusive_wait); |
diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 245df6b32b81..5bd4779282df 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c | |||
@@ -175,17 +175,16 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) | |||
175 | */ | 175 | */ |
176 | static u32 seccomp_run_filters(struct seccomp_data *sd) | 176 | static u32 seccomp_run_filters(struct seccomp_data *sd) |
177 | { | 177 | { |
178 | struct seccomp_filter *f = ACCESS_ONCE(current->seccomp.filter); | ||
179 | struct seccomp_data sd_local; | 178 | struct seccomp_data sd_local; |
180 | u32 ret = SECCOMP_RET_ALLOW; | 179 | u32 ret = SECCOMP_RET_ALLOW; |
180 | /* Make sure cross-thread synced filter points somewhere sane. */ | ||
181 | struct seccomp_filter *f = | ||
182 | lockless_dereference(current->seccomp.filter); | ||
181 | 183 | ||
182 | /* Ensure unexpected behavior doesn't result in failing open. */ | 184 | /* Ensure unexpected behavior doesn't result in failing open. */ |
183 | if (unlikely(WARN_ON(f == NULL))) | 185 | if (unlikely(WARN_ON(f == NULL))) |
184 | return SECCOMP_RET_KILL; | 186 | return SECCOMP_RET_KILL; |
185 | 187 | ||
186 | /* Make sure cross-thread synced filter points somewhere sane. */ | ||
187 | smp_read_barrier_depends(); | ||
188 | |||
189 | if (!sd) { | 188 | if (!sd) { |
190 | populate_seccomp_data(&sd_local); | 189 | populate_seccomp_data(&sd_local); |
191 | sd = &sd_local; | 190 | sd = &sd_local; |
@@ -549,7 +548,11 @@ void secure_computing_strict(int this_syscall) | |||
549 | { | 548 | { |
550 | int mode = current->seccomp.mode; | 549 | int mode = current->seccomp.mode; |
551 | 550 | ||
552 | if (mode == 0) | 551 | if (config_enabled(CONFIG_CHECKPOINT_RESTORE) && |
552 | unlikely(current->ptrace & PT_SUSPEND_SECCOMP)) | ||
553 | return; | ||
554 | |||
555 | if (mode == SECCOMP_MODE_DISABLED) | ||
553 | return; | 556 | return; |
554 | else if (mode == SECCOMP_MODE_STRICT) | 557 | else if (mode == SECCOMP_MODE_STRICT) |
555 | __secure_computing_strict(this_syscall); | 558 | __secure_computing_strict(this_syscall); |
@@ -650,6 +653,10 @@ u32 seccomp_phase1(struct seccomp_data *sd) | |||
650 | int this_syscall = sd ? sd->nr : | 653 | int this_syscall = sd ? sd->nr : |
651 | syscall_get_nr(current, task_pt_regs(current)); | 654 | syscall_get_nr(current, task_pt_regs(current)); |
652 | 655 | ||
656 | if (config_enabled(CONFIG_CHECKPOINT_RESTORE) && | ||
657 | unlikely(current->ptrace & PT_SUSPEND_SECCOMP)) | ||
658 | return SECCOMP_PHASE1_OK; | ||
659 | |||
653 | switch (mode) { | 660 | switch (mode) { |
654 | case SECCOMP_MODE_STRICT: | 661 | case SECCOMP_MODE_STRICT: |
655 | __secure_computing_strict(this_syscall); /* may call do_exit */ | 662 | __secure_computing_strict(this_syscall); /* may call do_exit */ |
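
Two things change in seccomp.c: the filter pointer is now fetched with lockless_dereference(), which folds the old ACCESS_ONCE() load and the separate smp_read_barrier_depends() into one annotated operation, and both entry points bail out early when the CHECKPOINT_RESTORE ptrace-suspend flag is set. The before/after of the first change, pulled out of the hunk above for side-by-side reading (fragment only, not a standalone unit):

	/* Before: raw load plus an easy-to-forget dependency barrier. */
	struct seccomp_filter *f = ACCESS_ONCE(current->seccomp.filter);
	smp_read_barrier_depends();	/* order later *f dereferences after the load */

	/* After: one helper expresses the same consume-style ordering. */
	struct seccomp_filter *f = lockless_dereference(current->seccomp.filter);
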
diff --git a/kernel/signal.c b/kernel/signal.c index 836df8dac6cc..0f6bbbe77b46 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -2748,12 +2748,15 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) | |||
2748 | * Other callers might not initialize the si_lsb field, | 2748 | * Other callers might not initialize the si_lsb field, |
2749 | * so check explicitly for the right codes here. | 2749 | * so check explicitly for the right codes here. |
2750 | */ | 2750 | */ |
2751 | if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO) | 2751 | if (from->si_signo == SIGBUS && |
2752 | (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO)) | ||
2752 | err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); | 2753 | err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); |
2753 | #endif | 2754 | #endif |
2754 | #ifdef SEGV_BNDERR | 2755 | #ifdef SEGV_BNDERR |
2755 | err |= __put_user(from->si_lower, &to->si_lower); | 2756 | if (from->si_signo == SIGSEGV && from->si_code == SEGV_BNDERR) { |
2756 | err |= __put_user(from->si_upper, &to->si_upper); | 2757 | err |= __put_user(from->si_lower, &to->si_lower); |
2758 | err |= __put_user(from->si_upper, &to->si_upper); | ||
2759 | } | ||
2757 | #endif | 2760 | #endif |
2758 | break; | 2761 | break; |
2759 | case __SI_CHLD: | 2762 | case __SI_CHLD: |
@@ -3017,7 +3020,7 @@ COMPAT_SYSCALL_DEFINE3(rt_sigqueueinfo, | |||
3017 | int, sig, | 3020 | int, sig, |
3018 | struct compat_siginfo __user *, uinfo) | 3021 | struct compat_siginfo __user *, uinfo) |
3019 | { | 3022 | { |
3020 | siginfo_t info; | 3023 | siginfo_t info = {}; |
3021 | int ret = copy_siginfo_from_user32(&info, uinfo); | 3024 | int ret = copy_siginfo_from_user32(&info, uinfo); |
3022 | if (unlikely(ret)) | 3025 | if (unlikely(ret)) |
3023 | return ret; | 3026 | return ret; |
@@ -3061,7 +3064,7 @@ COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo, | |||
3061 | int, sig, | 3064 | int, sig, |
3062 | struct compat_siginfo __user *, uinfo) | 3065 | struct compat_siginfo __user *, uinfo) |
3063 | { | 3066 | { |
3064 | siginfo_t info; | 3067 | siginfo_t info = {}; |
3065 | 3068 | ||
3066 | if (copy_siginfo_from_user32(&info, uinfo)) | 3069 | if (copy_siginfo_from_user32(&info, uinfo)) |
3067 | return -EFAULT; | 3070 | return -EFAULT; |
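
The signal.c hunks are info-leak fixes: copy_siginfo_to_user() only copies si_addr_lsb for SIGBUS machine-check codes and si_lower/si_upper for SEGV_BNDERR, and the compat rt_sigqueueinfo paths zero-initialize the siginfo before copy_siginfo_from_user32() fills in only part of it. The standalone userspace demo below shows why the "= {}" matters when a helper writes only some fields; the struct and names are invented for the demo, and a byte pattern stands in for stale stack contents:

	#include <stdio.h>
	#include <string.h>

	struct fake_siginfo { int si_signo; int si_code; unsigned long si_lower; unsigned long si_upper; };

	/* Pretend this is the compat copy helper: it writes only the common part. */
	static void fill_common_only(struct fake_siginfo *si)
	{
		si->si_signo = 11;
		si->si_code  = 1;
	}

	int main(void)
	{
		struct fake_siginfo leak, safe = {0};

		memset(&leak, 0xAA, sizeof(leak));	/* simulate stale stack bytes */
		fill_common_only(&leak);
		fill_common_only(&safe);

		printf("unwritten field without init: 0x%lx\n", leak.si_lower);
		printf("unwritten field with  = {}:   0x%lx\n", safe.si_lower);
		return 0;
	}
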
diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 7c434c39f02a..a818cbc73e14 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c | |||
@@ -113,7 +113,8 @@ static int smpboot_thread_fn(void *data) | |||
113 | if (kthread_should_stop()) { | 113 | if (kthread_should_stop()) { |
114 | __set_current_state(TASK_RUNNING); | 114 | __set_current_state(TASK_RUNNING); |
115 | preempt_enable(); | 115 | preempt_enable(); |
116 | if (ht->cleanup) | 116 | /* cleanup must mirror setup */ |
117 | if (ht->cleanup && td->status != HP_THREAD_NONE) | ||
117 | ht->cleanup(td->cpu, cpu_online(td->cpu)); | 118 | ht->cleanup(td->cpu, cpu_online(td->cpu)); |
118 | kfree(td); | 119 | kfree(td); |
119 | return 0; | 120 | return 0; |
@@ -259,15 +260,6 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht) | |||
259 | { | 260 | { |
260 | unsigned int cpu; | 261 | unsigned int cpu; |
261 | 262 | ||
262 | /* Unpark any threads that were voluntarily parked. */ | ||
263 | for_each_cpu_not(cpu, ht->cpumask) { | ||
264 | if (cpu_online(cpu)) { | ||
265 | struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); | ||
266 | if (tsk) | ||
267 | kthread_unpark(tsk); | ||
268 | } | ||
269 | } | ||
270 | |||
271 | /* We need to destroy also the parked threads of offline cpus */ | 263 | /* We need to destroy also the parked threads of offline cpus */ |
272 | for_each_possible_cpu(cpu) { | 264 | for_each_possible_cpu(cpu) { |
273 | struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); | 265 | struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); |
@@ -281,19 +273,22 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht) | |||
281 | } | 273 | } |
282 | 274 | ||
283 | /** | 275 | /** |
284 | * smpboot_register_percpu_thread - Register a per_cpu thread related to hotplug | 276 | * smpboot_register_percpu_thread_cpumask - Register a per_cpu thread related |
277 | * to hotplug | ||
285 | * @plug_thread: Hotplug thread descriptor | 278 | * @plug_thread: Hotplug thread descriptor |
279 | * @cpumask: The cpumask where threads run | ||
286 | * | 280 | * |
287 | * Creates and starts the threads on all online cpus. | 281 | * Creates and starts the threads on all online cpus. |
288 | */ | 282 | */ |
289 | int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread) | 283 | int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_thread, |
284 | const struct cpumask *cpumask) | ||
290 | { | 285 | { |
291 | unsigned int cpu; | 286 | unsigned int cpu; |
292 | int ret = 0; | 287 | int ret = 0; |
293 | 288 | ||
294 | if (!alloc_cpumask_var(&plug_thread->cpumask, GFP_KERNEL)) | 289 | if (!alloc_cpumask_var(&plug_thread->cpumask, GFP_KERNEL)) |
295 | return -ENOMEM; | 290 | return -ENOMEM; |
296 | cpumask_copy(plug_thread->cpumask, cpu_possible_mask); | 291 | cpumask_copy(plug_thread->cpumask, cpumask); |
297 | 292 | ||
298 | get_online_cpus(); | 293 | get_online_cpus(); |
299 | mutex_lock(&smpboot_threads_lock); | 294 | mutex_lock(&smpboot_threads_lock); |
@@ -301,9 +296,11 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread) | |||
301 | ret = __smpboot_create_thread(plug_thread, cpu); | 296 | ret = __smpboot_create_thread(plug_thread, cpu); |
302 | if (ret) { | 297 | if (ret) { |
303 | smpboot_destroy_threads(plug_thread); | 298 | smpboot_destroy_threads(plug_thread); |
299 | free_cpumask_var(plug_thread->cpumask); | ||
304 | goto out; | 300 | goto out; |
305 | } | 301 | } |
306 | smpboot_unpark_thread(plug_thread, cpu); | 302 | if (cpumask_test_cpu(cpu, cpumask)) |
303 | smpboot_unpark_thread(plug_thread, cpu); | ||
307 | } | 304 | } |
308 | list_add(&plug_thread->list, &hotplug_threads); | 305 | list_add(&plug_thread->list, &hotplug_threads); |
309 | out: | 306 | out: |
@@ -311,7 +308,7 @@ out: | |||
311 | put_online_cpus(); | 308 | put_online_cpus(); |
312 | return ret; | 309 | return ret; |
313 | } | 310 | } |
314 | EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread); | 311 | EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread_cpumask); |
315 | 312 | ||
316 | /** | 313 | /** |
317 | * smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug | 314 | * smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug |
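
smpboot_register_percpu_thread() becomes smpboot_register_percpu_thread_cpumask(): threads are still created for every possible CPU, but only the CPUs present in the supplied mask are unparked, and the cpumask allocation is now freed on the error path. Below is a hypothetical caller, modelled on the cpu_stop_threads descriptor that appears later in this patch; the my_* names and the choice of mask are illustrative, not taken from the kernel sources:

	static DEFINE_PER_CPU(struct task_struct *, my_worker);

	static struct smp_hotplug_thread my_worker_threads = {
		.store			= &my_worker,
		.thread_should_run	= my_worker_should_run,	/* hypothetical */
		.thread_fn		= my_worker_fn,		/* hypothetical */
		.thread_comm		= "myworker/%u",
	};

	static int __init my_worker_init(void)
	{
		/* Threads are created on all possible CPUs; only the ones in the
		 * mask start unparked, the others remain parked. */
		return smpboot_register_percpu_thread_cpumask(&my_worker_threads,
							       cpu_online_mask);
	}
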
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index fd643d8c4b42..12484e5d5c88 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c | |||
@@ -35,13 +35,16 @@ struct cpu_stop_done { | |||
35 | 35 | ||
36 | /* the actual stopper, one per every possible cpu, enabled on online cpus */ | 36 | /* the actual stopper, one per every possible cpu, enabled on online cpus */ |
37 | struct cpu_stopper { | 37 | struct cpu_stopper { |
38 | struct task_struct *thread; | ||
39 | |||
38 | spinlock_t lock; | 40 | spinlock_t lock; |
39 | bool enabled; /* is this stopper enabled? */ | 41 | bool enabled; /* is this stopper enabled? */ |
40 | struct list_head works; /* list of pending works */ | 42 | struct list_head works; /* list of pending works */ |
43 | |||
44 | struct cpu_stop_work stop_work; /* for stop_cpus */ | ||
41 | }; | 45 | }; |
42 | 46 | ||
43 | static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); | 47 | static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); |
44 | static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task); | ||
45 | static bool stop_machine_initialized = false; | 48 | static bool stop_machine_initialized = false; |
46 | 49 | ||
47 | /* | 50 | /* |
@@ -74,7 +77,6 @@ static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed) | |||
74 | static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) | 77 | static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) |
75 | { | 78 | { |
76 | struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); | 79 | struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); |
77 | struct task_struct *p = per_cpu(cpu_stopper_task, cpu); | ||
78 | 80 | ||
79 | unsigned long flags; | 81 | unsigned long flags; |
80 | 82 | ||
@@ -82,7 +84,7 @@ static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) | |||
82 | 84 | ||
83 | if (stopper->enabled) { | 85 | if (stopper->enabled) { |
84 | list_add_tail(&work->list, &stopper->works); | 86 | list_add_tail(&work->list, &stopper->works); |
85 | wake_up_process(p); | 87 | wake_up_process(stopper->thread); |
86 | } else | 88 | } else |
87 | cpu_stop_signal_done(work->done, false); | 89 | cpu_stop_signal_done(work->done, false); |
88 | 90 | ||
@@ -139,7 +141,7 @@ enum multi_stop_state { | |||
139 | }; | 141 | }; |
140 | 142 | ||
141 | struct multi_stop_data { | 143 | struct multi_stop_data { |
142 | int (*fn)(void *); | 144 | cpu_stop_fn_t fn; |
143 | void *data; | 145 | void *data; |
144 | /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */ | 146 | /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */ |
145 | unsigned int num_threads; | 147 | unsigned int num_threads; |
@@ -293,7 +295,6 @@ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, | |||
293 | 295 | ||
294 | /* static data for stop_cpus */ | 296 | /* static data for stop_cpus */ |
295 | static DEFINE_MUTEX(stop_cpus_mutex); | 297 | static DEFINE_MUTEX(stop_cpus_mutex); |
296 | static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work); | ||
297 | 298 | ||
298 | static void queue_stop_cpus_work(const struct cpumask *cpumask, | 299 | static void queue_stop_cpus_work(const struct cpumask *cpumask, |
299 | cpu_stop_fn_t fn, void *arg, | 300 | cpu_stop_fn_t fn, void *arg, |
@@ -302,22 +303,19 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask, | |||
302 | struct cpu_stop_work *work; | 303 | struct cpu_stop_work *work; |
303 | unsigned int cpu; | 304 | unsigned int cpu; |
304 | 305 | ||
305 | /* initialize works and done */ | ||
306 | for_each_cpu(cpu, cpumask) { | ||
307 | work = &per_cpu(stop_cpus_work, cpu); | ||
308 | work->fn = fn; | ||
309 | work->arg = arg; | ||
310 | work->done = done; | ||
311 | } | ||
312 | |||
313 | /* | 306 | /* |
314 | * Disable preemption while queueing to avoid getting | 307 | * Disable preemption while queueing to avoid getting |
315 | * preempted by a stopper which might wait for other stoppers | 308 | * preempted by a stopper which might wait for other stoppers |
316 | * to enter @fn which can lead to deadlock. | 309 | * to enter @fn which can lead to deadlock. |
317 | */ | 310 | */ |
318 | lg_global_lock(&stop_cpus_lock); | 311 | lg_global_lock(&stop_cpus_lock); |
319 | for_each_cpu(cpu, cpumask) | 312 | for_each_cpu(cpu, cpumask) { |
320 | cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu)); | 313 | work = &per_cpu(cpu_stopper.stop_work, cpu); |
314 | work->fn = fn; | ||
315 | work->arg = arg; | ||
316 | work->done = done; | ||
317 | cpu_stop_queue_work(cpu, work); | ||
318 | } | ||
321 | lg_global_unlock(&stop_cpus_lock); | 319 | lg_global_unlock(&stop_cpus_lock); |
322 | } | 320 | } |
323 | 321 | ||
@@ -458,19 +456,21 @@ extern void sched_set_stop_task(int cpu, struct task_struct *stop); | |||
458 | 456 | ||
459 | static void cpu_stop_create(unsigned int cpu) | 457 | static void cpu_stop_create(unsigned int cpu) |
460 | { | 458 | { |
461 | sched_set_stop_task(cpu, per_cpu(cpu_stopper_task, cpu)); | 459 | sched_set_stop_task(cpu, per_cpu(cpu_stopper.thread, cpu)); |
462 | } | 460 | } |
463 | 461 | ||
464 | static void cpu_stop_park(unsigned int cpu) | 462 | static void cpu_stop_park(unsigned int cpu) |
465 | { | 463 | { |
466 | struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); | 464 | struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); |
467 | struct cpu_stop_work *work; | 465 | struct cpu_stop_work *work, *tmp; |
468 | unsigned long flags; | 466 | unsigned long flags; |
469 | 467 | ||
470 | /* drain remaining works */ | 468 | /* drain remaining works */ |
471 | spin_lock_irqsave(&stopper->lock, flags); | 469 | spin_lock_irqsave(&stopper->lock, flags); |
472 | list_for_each_entry(work, &stopper->works, list) | 470 | list_for_each_entry_safe(work, tmp, &stopper->works, list) { |
471 | list_del_init(&work->list); | ||
473 | cpu_stop_signal_done(work->done, false); | 472 | cpu_stop_signal_done(work->done, false); |
473 | } | ||
474 | stopper->enabled = false; | 474 | stopper->enabled = false; |
475 | spin_unlock_irqrestore(&stopper->lock, flags); | 475 | spin_unlock_irqrestore(&stopper->lock, flags); |
476 | } | 476 | } |
@@ -485,7 +485,7 @@ static void cpu_stop_unpark(unsigned int cpu) | |||
485 | } | 485 | } |
486 | 486 | ||
487 | static struct smp_hotplug_thread cpu_stop_threads = { | 487 | static struct smp_hotplug_thread cpu_stop_threads = { |
488 | .store = &cpu_stopper_task, | 488 | .store = &cpu_stopper.thread, |
489 | .thread_should_run = cpu_stop_should_run, | 489 | .thread_should_run = cpu_stop_should_run, |
490 | .thread_fn = cpu_stopper_thread, | 490 | .thread_fn = cpu_stopper_thread, |
491 | .thread_comm = "migration/%u", | 491 | .thread_comm = "migration/%u", |
@@ -515,7 +515,7 @@ early_initcall(cpu_stop_init); | |||
515 | 515 | ||
516 | #ifdef CONFIG_STOP_MACHINE | 516 | #ifdef CONFIG_STOP_MACHINE |
517 | 517 | ||
518 | int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) | 518 | static int __stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) |
519 | { | 519 | { |
520 | struct multi_stop_data msdata = { | 520 | struct multi_stop_data msdata = { |
521 | .fn = fn, | 521 | .fn = fn, |
@@ -548,7 +548,7 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) | |||
548 | return stop_cpus(cpu_online_mask, multi_cpu_stop, &msdata); | 548 | return stop_cpus(cpu_online_mask, multi_cpu_stop, &msdata); |
549 | } | 549 | } |
550 | 550 | ||
551 | int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) | 551 | int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) |
552 | { | 552 | { |
553 | int ret; | 553 | int ret; |
554 | 554 | ||
@@ -582,7 +582,7 @@ EXPORT_SYMBOL_GPL(stop_machine); | |||
582 | * 0 if all executions of @fn returned 0, any non zero return value if any | 582 | * 0 if all executions of @fn returned 0, any non zero return value if any |
583 | * returned non zero. | 583 | * returned non zero. |
584 | */ | 584 | */ |
585 | int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data, | 585 | int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data, |
586 | const struct cpumask *cpus) | 586 | const struct cpumask *cpus) |
587 | { | 587 | { |
588 | struct multi_stop_data msdata = { .fn = fn, .data = data, | 588 | struct multi_stop_data msdata = { .fn = fn, .data = data, |
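
Besides folding the stopper task pointer and the per-cpu stop_work into struct cpu_stopper, this patch makes cpu_stop_park() drain pending works with list_for_each_entry_safe() and list_del_init(), because completing a work can release it while a plain iterator would still chase its ->next pointer. A plain userspace analogue of the "cache next before completing" rule that the _safe macro implements:

	#include <stdio.h>
	#include <stdlib.h>

	struct work { int id; struct work *next; };

	/* Complete (and free) every queued work.  Reading 'next' before the work
	 * is released is exactly the bookkeeping list_for_each_entry_safe() does. */
	static void drain(struct work *head)
	{
		struct work *w = head, *next;

		while (w) {
			next = w->next;		/* must happen before w goes away */
			printf("completing work %d\n", w->id);
			free(w);
			w = next;
		}
	}

	int main(void)
	{
		struct work *head = NULL;

		for (int i = 1; i <= 3; i++) {
			struct work *w = malloc(sizeof(*w));
			w->id = i;
			w->next = head;
			head = w;
		}
		drain(head);
		return 0;
	}
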
diff --git a/kernel/sys.c b/kernel/sys.c index 259fda25eb6b..fa2f2f671a5c 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -1668,8 +1668,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) | |||
1668 | * overall picture. | 1668 | * overall picture. |
1669 | */ | 1669 | */ |
1670 | err = -EACCES; | 1670 | err = -EACCES; |
1671 | if (!S_ISREG(inode->i_mode) || | 1671 | if (!S_ISREG(inode->i_mode) || path_noexec(&exe.file->f_path)) |
1672 | exe.file->f_path.mnt->mnt_flags & MNT_NOEXEC) | ||
1673 | goto exit; | 1672 | goto exit; |
1674 | 1673 | ||
1675 | err = inode_permission(inode, MAY_EXEC); | 1674 | err = inode_permission(inode, MAY_EXEC); |
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 7995ef5868d8..a02decf15583 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c | |||
@@ -140,6 +140,7 @@ cond_syscall(sys_sgetmask); | |||
140 | cond_syscall(sys_ssetmask); | 140 | cond_syscall(sys_ssetmask); |
141 | cond_syscall(sys_vm86old); | 141 | cond_syscall(sys_vm86old); |
142 | cond_syscall(sys_vm86); | 142 | cond_syscall(sys_vm86); |
143 | cond_syscall(sys_modify_ldt); | ||
143 | cond_syscall(sys_ipc); | 144 | cond_syscall(sys_ipc); |
144 | cond_syscall(compat_sys_ipc); | 145 | cond_syscall(compat_sys_ipc); |
145 | cond_syscall(compat_sys_sysctl); | 146 | cond_syscall(compat_sys_sysctl); |
@@ -218,6 +219,7 @@ cond_syscall(compat_sys_timerfd_gettime); | |||
218 | cond_syscall(sys_eventfd); | 219 | cond_syscall(sys_eventfd); |
219 | cond_syscall(sys_eventfd2); | 220 | cond_syscall(sys_eventfd2); |
220 | cond_syscall(sys_memfd_create); | 221 | cond_syscall(sys_memfd_create); |
222 | cond_syscall(sys_userfaultfd); | ||
221 | 223 | ||
222 | /* performance counters: */ | 224 | /* performance counters: */ |
223 | cond_syscall(sys_perf_event_open); | 225 | cond_syscall(sys_perf_event_open); |
@@ -243,3 +245,6 @@ cond_syscall(sys_bpf); | |||
243 | 245 | ||
244 | /* execveat */ | 246 | /* execveat */ |
245 | cond_syscall(sys_execveat); | 247 | cond_syscall(sys_execveat); |
248 | |||
249 | /* membarrier */ | ||
250 | cond_syscall(sys_membarrier); | ||
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 19b62b522158..e69201d8094e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -621,7 +621,7 @@ static struct ctl_table kern_table[] = { | |||
621 | .proc_handler = proc_dointvec, | 621 | .proc_handler = proc_dointvec, |
622 | }, | 622 | }, |
623 | #endif | 623 | #endif |
624 | #ifdef CONFIG_KEXEC | 624 | #ifdef CONFIG_KEXEC_CORE |
625 | { | 625 | { |
626 | .procname = "kexec_load_disabled", | 626 | .procname = "kexec_load_disabled", |
627 | .data = &kexec_load_disabled, | 627 | .data = &kexec_load_disabled, |
@@ -1995,7 +1995,7 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp, | |||
1995 | int val = *valp; | 1995 | int val = *valp; |
1996 | if (val < 0) { | 1996 | if (val < 0) { |
1997 | *negp = true; | 1997 | *negp = true; |
1998 | *lvalp = (unsigned long)-val; | 1998 | *lvalp = -(unsigned long)val; |
1999 | } else { | 1999 | } else { |
2000 | *negp = false; | 2000 | *negp = false; |
2001 | *lvalp = (unsigned long)val; | 2001 | *lvalp = (unsigned long)val; |
@@ -2201,7 +2201,7 @@ static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp, | |||
2201 | int val = *valp; | 2201 | int val = *valp; |
2202 | if (val < 0) { | 2202 | if (val < 0) { |
2203 | *negp = true; | 2203 | *negp = true; |
2204 | *lvalp = (unsigned long)-val; | 2204 | *lvalp = -(unsigned long)val; |
2205 | } else { | 2205 | } else { |
2206 | *negp = false; | 2206 | *negp = false; |
2207 | *lvalp = (unsigned long)val; | 2207 | *lvalp = (unsigned long)val; |
@@ -2436,7 +2436,7 @@ static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp, | |||
2436 | unsigned long lval; | 2436 | unsigned long lval; |
2437 | if (val < 0) { | 2437 | if (val < 0) { |
2438 | *negp = true; | 2438 | *negp = true; |
2439 | lval = (unsigned long)-val; | 2439 | lval = -(unsigned long)val; |
2440 | } else { | 2440 | } else { |
2441 | *negp = false; | 2441 | *negp = false; |
2442 | lval = (unsigned long)val; | 2442 | lval = (unsigned long)val; |
@@ -2459,7 +2459,7 @@ static int do_proc_dointvec_userhz_jiffies_conv(bool *negp, unsigned long *lvalp | |||
2459 | unsigned long lval; | 2459 | unsigned long lval; |
2460 | if (val < 0) { | 2460 | if (val < 0) { |
2461 | *negp = true; | 2461 | *negp = true; |
2462 | lval = (unsigned long)-val; | 2462 | lval = -(unsigned long)val; |
2463 | } else { | 2463 | } else { |
2464 | *negp = false; | 2464 | *negp = false; |
2465 | lval = (unsigned long)val; | 2465 | lval = (unsigned long)val; |
@@ -2484,7 +2484,7 @@ static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp, | |||
2484 | unsigned long lval; | 2484 | unsigned long lval; |
2485 | if (val < 0) { | 2485 | if (val < 0) { |
2486 | *negp = true; | 2486 | *negp = true; |
2487 | lval = (unsigned long)-val; | 2487 | lval = -(unsigned long)val; |
2488 | } else { | 2488 | } else { |
2489 | *negp = false; | 2489 | *negp = false; |
2490 | lval = (unsigned long)val; | 2490 | lval = (unsigned long)val; |
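
The repeated one-character change in sysctl.c, "(unsigned long)-val" becoming "-(unsigned long)val", is an undefined-behaviour fix: when val is INT_MIN, negating it while it is still an int overflows, whereas converting to unsigned long first and then negating is well defined (modulo 2^N) and yields the intended magnitude. A standalone demonstration:

	#include <limits.h>
	#include <stdio.h>

	int main(void)
	{
		int val = INT_MIN;

		/* (unsigned long)-val would negate INT_MIN as an int first, which is
		 * signed overflow and therefore undefined behaviour.  Converting to
		 * unsigned long before negating is well defined. */
		unsigned long magnitude = -(unsigned long)val;

		printf("|INT_MIN| as unsigned long: %lu\n", magnitude);
		return 0;
	}
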
diff --git a/kernel/system_certificates.S b/kernel/system_certificates.S deleted file mode 100644 index 3e9868d47535..000000000000 --- a/kernel/system_certificates.S +++ /dev/null | |||
@@ -1,20 +0,0 @@ | |||
1 | #include <linux/export.h> | ||
2 | #include <linux/init.h> | ||
3 | |||
4 | __INITRODATA | ||
5 | |||
6 | .align 8 | ||
7 | .globl VMLINUX_SYMBOL(system_certificate_list) | ||
8 | VMLINUX_SYMBOL(system_certificate_list): | ||
9 | __cert_list_start: | ||
10 | .incbin "kernel/x509_certificate_list" | ||
11 | __cert_list_end: | ||
12 | |||
13 | .align 8 | ||
14 | .globl VMLINUX_SYMBOL(system_certificate_list_size) | ||
15 | VMLINUX_SYMBOL(system_certificate_list_size): | ||
16 | #ifdef CONFIG_64BIT | ||
17 | .quad __cert_list_end - __cert_list_start | ||
18 | #else | ||
19 | .long __cert_list_end - __cert_list_start | ||
20 | #endif | ||
diff --git a/kernel/system_keyring.c b/kernel/system_keyring.c deleted file mode 100644 index 875f64e8935b..000000000000 --- a/kernel/system_keyring.c +++ /dev/null | |||
@@ -1,106 +0,0 @@ | |||
1 | /* System trusted keyring for trusted public keys | ||
2 | * | ||
3 | * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. | ||
4 | * Written by David Howells (dhowells@redhat.com) | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public Licence | ||
8 | * as published by the Free Software Foundation; either version | ||
9 | * 2 of the Licence, or (at your option) any later version. | ||
10 | */ | ||
11 | |||
12 | #include <linux/export.h> | ||
13 | #include <linux/kernel.h> | ||
14 | #include <linux/sched.h> | ||
15 | #include <linux/cred.h> | ||
16 | #include <linux/err.h> | ||
17 | #include <keys/asymmetric-type.h> | ||
18 | #include <keys/system_keyring.h> | ||
19 | #include "module-internal.h" | ||
20 | |||
21 | struct key *system_trusted_keyring; | ||
22 | EXPORT_SYMBOL_GPL(system_trusted_keyring); | ||
23 | |||
24 | extern __initconst const u8 system_certificate_list[]; | ||
25 | extern __initconst const unsigned long system_certificate_list_size; | ||
26 | |||
27 | /* | ||
28 | * Load the compiled-in keys | ||
29 | */ | ||
30 | static __init int system_trusted_keyring_init(void) | ||
31 | { | ||
32 | pr_notice("Initialise system trusted keyring\n"); | ||
33 | |||
34 | system_trusted_keyring = | ||
35 | keyring_alloc(".system_keyring", | ||
36 | KUIDT_INIT(0), KGIDT_INIT(0), current_cred(), | ||
37 | ((KEY_POS_ALL & ~KEY_POS_SETATTR) | | ||
38 | KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH), | ||
39 | KEY_ALLOC_NOT_IN_QUOTA, NULL); | ||
40 | if (IS_ERR(system_trusted_keyring)) | ||
41 | panic("Can't allocate system trusted keyring\n"); | ||
42 | |||
43 | set_bit(KEY_FLAG_TRUSTED_ONLY, &system_trusted_keyring->flags); | ||
44 | return 0; | ||
45 | } | ||
46 | |||
47 | /* | ||
48 | * Must be initialised before we try and load the keys into the keyring. | ||
49 | */ | ||
50 | device_initcall(system_trusted_keyring_init); | ||
51 | |||
52 | /* | ||
53 | * Load the compiled-in list of X.509 certificates. | ||
54 | */ | ||
55 | static __init int load_system_certificate_list(void) | ||
56 | { | ||
57 | key_ref_t key; | ||
58 | const u8 *p, *end; | ||
59 | size_t plen; | ||
60 | |||
61 | pr_notice("Loading compiled-in X.509 certificates\n"); | ||
62 | |||
63 | p = system_certificate_list; | ||
64 | end = p + system_certificate_list_size; | ||
65 | while (p < end) { | ||
66 | /* Each cert begins with an ASN.1 SEQUENCE tag and must be more | ||
67 | * than 256 bytes in size. | ||
68 | */ | ||
69 | if (end - p < 4) | ||
70 | goto dodgy_cert; | ||
71 | if (p[0] != 0x30 && | ||
72 | p[1] != 0x82) | ||
73 | goto dodgy_cert; | ||
74 | plen = (p[2] << 8) | p[3]; | ||
75 | plen += 4; | ||
76 | if (plen > end - p) | ||
77 | goto dodgy_cert; | ||
78 | |||
79 | key = key_create_or_update(make_key_ref(system_trusted_keyring, 1), | ||
80 | "asymmetric", | ||
81 | NULL, | ||
82 | p, | ||
83 | plen, | ||
84 | ((KEY_POS_ALL & ~KEY_POS_SETATTR) | | ||
85 | KEY_USR_VIEW | KEY_USR_READ), | ||
86 | KEY_ALLOC_NOT_IN_QUOTA | | ||
87 | KEY_ALLOC_TRUSTED); | ||
88 | if (IS_ERR(key)) { | ||
89 | pr_err("Problem loading in-kernel X.509 certificate (%ld)\n", | ||
90 | PTR_ERR(key)); | ||
91 | } else { | ||
92 | set_bit(KEY_FLAG_BUILTIN, &key_ref_to_ptr(key)->flags); | ||
93 | pr_notice("Loaded X.509 cert '%s'\n", | ||
94 | key_ref_to_ptr(key)->description); | ||
95 | key_ref_put(key); | ||
96 | } | ||
97 | p += plen; | ||
98 | } | ||
99 | |||
100 | return 0; | ||
101 | |||
102 | dodgy_cert: | ||
103 | pr_err("Problem parsing in-kernel X.509 certificate list\n"); | ||
104 | return 0; | ||
105 | } | ||
106 | late_initcall(load_system_certificate_list); | ||
diff --git a/kernel/task_work.c b/kernel/task_work.c index 8727032e3a6f..53fa971d000d 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c | |||
@@ -18,6 +18,8 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */ | |||
18 | * This is like the signal handler which runs in kernel mode, but it doesn't | 18 | * This is like the signal handler which runs in kernel mode, but it doesn't |
19 | * try to wake up the @task. | 19 | * try to wake up the @task. |
20 | * | 20 | * |
21 | * Note: there is no ordering guarantee on works queued here. | ||
22 | * | ||
21 | * RETURNS: | 23 | * RETURNS: |
22 | * 0 if succeeds or -ESRCH. | 24 | * 0 if succeeds or -ESRCH. |
23 | */ | 25 | */ |
@@ -108,16 +110,6 @@ void task_work_run(void) | |||
108 | raw_spin_unlock_wait(&task->pi_lock); | 110 | raw_spin_unlock_wait(&task->pi_lock); |
109 | smp_mb(); | 111 | smp_mb(); |
110 | 112 | ||
111 | /* Reverse the list to run the works in fifo order */ | ||
112 | head = NULL; | ||
113 | do { | ||
114 | next = work->next; | ||
115 | work->next = head; | ||
116 | head = work; | ||
117 | work = next; | ||
118 | } while (work); | ||
119 | |||
120 | work = head; | ||
121 | do { | 113 | do { |
122 | next = work->next; | 114 | next = work->next; |
123 | work->func(work); | 115 | work->func(work); |
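
Dropping the list-reversal loop means task_work_run() now executes callbacks in the reverse of the order they were queued (task_work_add() pushes at the head), which is why the new kerneldoc line explicitly disclaims any ordering guarantee. A small standalone illustration of the resulting run order; the struct below only mimics the kernel's callback_head for the demo:

	#include <stdio.h>

	struct cb { const char *name; struct cb *next; };

	int main(void)
	{
		struct cb a = { "first queued" }, b = { "second queued" }, c = { "third queued" };
		struct cb *head = NULL, *work, *next;

		/* task_work_add() prepends ... */
		a.next = head; head = &a;
		b.next = head; head = &b;
		c.next = head; head = &c;

		/* ... and with the reversal gone, the run order is simply the list
		 * order: third, second, first. */
		for (work = head; work; work = next) {
			next = work->next;
			printf("running: %s\n", work->name);
		}
		return 0;
	}
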
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 579ce1b929af..4008d9f95dd7 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig | |||
@@ -92,12 +92,10 @@ config NO_HZ_FULL | |||
92 | depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS | 92 | depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS |
93 | # We need at least one periodic CPU for timekeeping | 93 | # We need at least one periodic CPU for timekeeping |
94 | depends on SMP | 94 | depends on SMP |
95 | # RCU_USER_QS dependency | ||
96 | depends on HAVE_CONTEXT_TRACKING | 95 | depends on HAVE_CONTEXT_TRACKING |
97 | # VIRT_CPU_ACCOUNTING_GEN dependency | 96 | # VIRT_CPU_ACCOUNTING_GEN dependency |
98 | depends on HAVE_VIRT_CPU_ACCOUNTING_GEN | 97 | depends on HAVE_VIRT_CPU_ACCOUNTING_GEN |
99 | select NO_HZ_COMMON | 98 | select NO_HZ_COMMON |
100 | select RCU_USER_QS | ||
101 | select RCU_NOCB_CPU | 99 | select RCU_NOCB_CPU |
102 | select VIRT_CPU_ACCOUNTING_GEN | 100 | select VIRT_CPU_ACCOUNTING_GEN |
103 | select IRQ_WORK | 101 | select IRQ_WORK |
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 5c7ae4b641c4..457a373e2181 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c | |||
@@ -183,7 +183,7 @@ struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, | |||
183 | int pinned) | 183 | int pinned) |
184 | { | 184 | { |
185 | if (pinned || !base->migration_enabled) | 185 | if (pinned || !base->migration_enabled) |
186 | return this_cpu_ptr(&hrtimer_bases); | 186 | return base; |
187 | return &per_cpu(hrtimer_bases, get_nohz_timer_target()); | 187 | return &per_cpu(hrtimer_bases, get_nohz_timer_target()); |
188 | } | 188 | } |
189 | #else | 189 | #else |
@@ -191,23 +191,32 @@ static inline | |||
191 | struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, | 191 | struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, |
192 | int pinned) | 192 | int pinned) |
193 | { | 193 | { |
194 | return this_cpu_ptr(&hrtimer_bases); | 194 | return base; |
195 | } | 195 | } |
196 | #endif | 196 | #endif |
197 | 197 | ||
198 | /* | 198 | /* |
199 | * Switch the timer base to the current CPU when possible. | 199 | * We switch the timer base to a power-optimized selected CPU target, |
200 | * if: | ||
201 | * - NO_HZ_COMMON is enabled | ||
202 | * - timer migration is enabled | ||
203 | * - the timer callback is not running | ||
204 | * - the timer is not the first expiring timer on the new target | ||
205 | * | ||
206 | * If one of the above requirements is not fulfilled we move the timer | ||
207 | * to the current CPU or leave it on the previously assigned CPU if | ||
208 | * the timer callback is currently running. | ||
200 | */ | 209 | */ |
201 | static inline struct hrtimer_clock_base * | 210 | static inline struct hrtimer_clock_base * |
202 | switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, | 211 | switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, |
203 | int pinned) | 212 | int pinned) |
204 | { | 213 | { |
205 | struct hrtimer_cpu_base *new_cpu_base, *this_base; | 214 | struct hrtimer_cpu_base *new_cpu_base, *this_cpu_base; |
206 | struct hrtimer_clock_base *new_base; | 215 | struct hrtimer_clock_base *new_base; |
207 | int basenum = base->index; | 216 | int basenum = base->index; |
208 | 217 | ||
209 | this_base = this_cpu_ptr(&hrtimer_bases); | 218 | this_cpu_base = this_cpu_ptr(&hrtimer_bases); |
210 | new_cpu_base = get_target_base(this_base, pinned); | 219 | new_cpu_base = get_target_base(this_cpu_base, pinned); |
211 | again: | 220 | again: |
212 | new_base = &new_cpu_base->clock_base[basenum]; | 221 | new_base = &new_cpu_base->clock_base[basenum]; |
213 | 222 | ||
@@ -229,19 +238,19 @@ again: | |||
229 | raw_spin_unlock(&base->cpu_base->lock); | 238 | raw_spin_unlock(&base->cpu_base->lock); |
230 | raw_spin_lock(&new_base->cpu_base->lock); | 239 | raw_spin_lock(&new_base->cpu_base->lock); |
231 | 240 | ||
232 | if (new_cpu_base != this_base && | 241 | if (new_cpu_base != this_cpu_base && |
233 | hrtimer_check_target(timer, new_base)) { | 242 | hrtimer_check_target(timer, new_base)) { |
234 | raw_spin_unlock(&new_base->cpu_base->lock); | 243 | raw_spin_unlock(&new_base->cpu_base->lock); |
235 | raw_spin_lock(&base->cpu_base->lock); | 244 | raw_spin_lock(&base->cpu_base->lock); |
236 | new_cpu_base = this_base; | 245 | new_cpu_base = this_cpu_base; |
237 | timer->base = base; | 246 | timer->base = base; |
238 | goto again; | 247 | goto again; |
239 | } | 248 | } |
240 | timer->base = new_base; | 249 | timer->base = new_base; |
241 | } else { | 250 | } else { |
242 | if (new_cpu_base != this_base && | 251 | if (new_cpu_base != this_cpu_base && |
243 | hrtimer_check_target(timer, new_base)) { | 252 | hrtimer_check_target(timer, new_base)) { |
244 | new_cpu_base = this_base; | 253 | new_cpu_base = this_cpu_base; |
245 | goto again; | 254 | goto again; |
246 | } | 255 | } |
247 | } | 256 | } |
@@ -679,14 +688,14 @@ static void retrigger_next_event(void *arg) | |||
679 | /* | 688 | /* |
680 | * Switch to high resolution mode | 689 | * Switch to high resolution mode |
681 | */ | 690 | */ |
682 | static int hrtimer_switch_to_hres(void) | 691 | static void hrtimer_switch_to_hres(void) |
683 | { | 692 | { |
684 | struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); | 693 | struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); |
685 | 694 | ||
686 | if (tick_init_highres()) { | 695 | if (tick_init_highres()) { |
687 | printk(KERN_WARNING "Could not switch to high resolution " | 696 | printk(KERN_WARNING "Could not switch to high resolution " |
688 | "mode on CPU %d\n", base->cpu); | 697 | "mode on CPU %d\n", base->cpu); |
689 | return 0; | 698 | return; |
690 | } | 699 | } |
691 | base->hres_active = 1; | 700 | base->hres_active = 1; |
692 | hrtimer_resolution = HIGH_RES_NSEC; | 701 | hrtimer_resolution = HIGH_RES_NSEC; |
@@ -694,7 +703,6 @@ static int hrtimer_switch_to_hres(void) | |||
694 | tick_setup_sched_timer(); | 703 | tick_setup_sched_timer(); |
695 | /* "Retrigger" the interrupt to get things going */ | 704 | /* "Retrigger" the interrupt to get things going */ |
696 | retrigger_next_event(NULL); | 705 | retrigger_next_event(NULL); |
697 | return 1; | ||
698 | } | 706 | } |
699 | 707 | ||
700 | static void clock_was_set_work(struct work_struct *work) | 708 | static void clock_was_set_work(struct work_struct *work) |
@@ -718,7 +726,7 @@ void clock_was_set_delayed(void) | |||
718 | static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *b) { return 0; } | 726 | static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *b) { return 0; } |
719 | static inline int hrtimer_hres_active(void) { return 0; } | 727 | static inline int hrtimer_hres_active(void) { return 0; } |
720 | static inline int hrtimer_is_hres_enabled(void) { return 0; } | 728 | static inline int hrtimer_is_hres_enabled(void) { return 0; } |
721 | static inline int hrtimer_switch_to_hres(void) { return 0; } | 729 | static inline void hrtimer_switch_to_hres(void) { } |
722 | static inline void | 730 | static inline void |
723 | hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { } | 731 | hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { } |
724 | static inline int hrtimer_reprogram(struct hrtimer *timer, | 732 | static inline int hrtimer_reprogram(struct hrtimer *timer, |
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index fb4d98c7fd43..df68cb875248 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c | |||
@@ -487,6 +487,11 @@ out: | |||
487 | } | 487 | } |
488 | 488 | ||
489 | #ifdef CONFIG_GENERIC_CMOS_UPDATE | 489 | #ifdef CONFIG_GENERIC_CMOS_UPDATE |
490 | int __weak update_persistent_clock(struct timespec now) | ||
491 | { | ||
492 | return -ENODEV; | ||
493 | } | ||
494 | |||
490 | int __weak update_persistent_clock64(struct timespec64 now64) | 495 | int __weak update_persistent_clock64(struct timespec64 now64) |
491 | { | 496 | { |
492 | struct timespec now; | 497 | struct timespec now; |
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index 3e7db49a2381..53d7184da0be 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c | |||
@@ -18,30 +18,23 @@ | |||
18 | 18 | ||
19 | static struct hrtimer bctimer; | 19 | static struct hrtimer bctimer; |
20 | 20 | ||
21 | static void bc_set_mode(enum clock_event_mode mode, | 21 | static int bc_shutdown(struct clock_event_device *evt) |
22 | struct clock_event_device *bc) | ||
23 | { | 22 | { |
24 | switch (mode) { | 23 | /* |
25 | case CLOCK_EVT_MODE_UNUSED: | 24 | * Note, we cannot cancel the timer here as we might |
26 | case CLOCK_EVT_MODE_SHUTDOWN: | 25 | * run into the following live lock scenario: |
27 | /* | 26 | * |
28 | * Note, we cannot cancel the timer here as we might | 27 | * cpu 0 cpu1 |
29 | * run into the following live lock scenario: | 28 | * lock(broadcast_lock); |
30 | * | 29 | * hrtimer_interrupt() |
31 | * cpu 0 cpu1 | 30 | * bc_handler() |
32 | * lock(broadcast_lock); | 31 | * tick_handle_oneshot_broadcast(); |
33 | * hrtimer_interrupt() | 32 | * lock(broadcast_lock); |
34 | * bc_handler() | 33 | * hrtimer_cancel() |
35 | * tick_handle_oneshot_broadcast(); | 34 | * wait_for_callback() |
36 | * lock(broadcast_lock); | 35 | */ |
37 | * hrtimer_cancel() | 36 | hrtimer_try_to_cancel(&bctimer); |
38 | * wait_for_callback() | 37 | return 0; |
39 | */ | ||
40 | hrtimer_try_to_cancel(&bctimer); | ||
41 | break; | ||
42 | default: | ||
43 | break; | ||
44 | } | ||
45 | } | 38 | } |
46 | 39 | ||
47 | /* | 40 | /* |
@@ -82,7 +75,7 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc) | |||
82 | } | 75 | } |
83 | 76 | ||
84 | static struct clock_event_device ce_broadcast_hrtimer = { | 77 | static struct clock_event_device ce_broadcast_hrtimer = { |
85 | .set_mode = bc_set_mode, | 78 | .set_state_shutdown = bc_shutdown, |
86 | .set_next_ktime = bc_set_next, | 79 | .set_next_ktime = bc_set_next, |
87 | .features = CLOCK_EVT_FEAT_ONESHOT | | 80 | .features = CLOCK_EVT_FEAT_ONESHOT | |
88 | CLOCK_EVT_FEAT_KTIME | | 81 | CLOCK_EVT_FEAT_KTIME | |
@@ -102,13 +95,11 @@ static enum hrtimer_restart bc_handler(struct hrtimer *t) | |||
102 | { | 95 | { |
103 | ce_broadcast_hrtimer.event_handler(&ce_broadcast_hrtimer); | 96 | ce_broadcast_hrtimer.event_handler(&ce_broadcast_hrtimer); |
104 | 97 | ||
105 | switch (ce_broadcast_hrtimer.mode) { | 98 | if (clockevent_state_oneshot(&ce_broadcast_hrtimer)) |
106 | case CLOCK_EVT_MODE_ONESHOT: | ||
107 | if (ce_broadcast_hrtimer.next_event.tv64 != KTIME_MAX) | 99 | if (ce_broadcast_hrtimer.next_event.tv64 != KTIME_MAX) |
108 | return HRTIMER_RESTART; | 100 | return HRTIMER_RESTART; |
109 | default: | 101 | |
110 | return HRTIMER_NORESTART; | 102 | return HRTIMER_NORESTART; |
111 | } | ||
112 | } | 103 | } |
113 | 104 | ||
114 | void tick_setup_hrtimer_broadcast(void) | 105 | void tick_setup_hrtimer_broadcast(void) |
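
This driver is converted from the legacy set_mode() callback to the per-state clockevents callbacks: shutdown handling moves into a dedicated set_state_shutdown hook, and the handler queries clockevent_state_oneshot() instead of peeking at the old mode field. A sketch of what the new-style descriptor looks like for a driver that only needs shutdown handling; the my_* names are placeholders and the feature flags are an illustrative subset:

	static int my_shutdown(struct clock_event_device *evt)
	{
		/* Stop the underlying timer; returning 0 reports success. */
		return 0;
	}

	static struct clock_event_device my_broadcast_dev = {
		.set_state_shutdown	= my_shutdown,
		.set_next_ktime		= my_set_next,	/* hypothetical oneshot programming hook */
		.features		= CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_KTIME,
	};
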
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 52b9e199b5ac..f6aae7977824 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c | |||
@@ -839,7 +839,6 @@ out: | |||
839 | raw_spin_unlock(&tick_broadcast_lock); | 839 | raw_spin_unlock(&tick_broadcast_lock); |
840 | return ret; | 840 | return ret; |
841 | } | 841 | } |
842 | EXPORT_SYMBOL_GPL(tick_broadcast_oneshot_control); | ||
843 | 842 | ||
844 | /* | 843 | /* |
845 | * Reset the one shot broadcast for a cpu | 844 | * Reset the one shot broadcast for a cpu |
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 55e13efff1ab..d11c55b6ab7d 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c | |||
@@ -304,9 +304,6 @@ void tick_check_new_device(struct clock_event_device *newdev) | |||
304 | int cpu; | 304 | int cpu; |
305 | 305 | ||
306 | cpu = smp_processor_id(); | 306 | cpu = smp_processor_id(); |
307 | if (!cpumask_test_cpu(cpu, newdev->cpumask)) | ||
308 | goto out_bc; | ||
309 | |||
310 | td = &per_cpu(tick_cpu_device, cpu); | 307 | td = &per_cpu(tick_cpu_device, cpu); |
311 | curdev = td->evtdev; | 308 | curdev = td->evtdev; |
312 | 309 | ||
@@ -363,6 +360,7 @@ int tick_broadcast_oneshot_control(enum tick_broadcast_state state) | |||
363 | 360 | ||
364 | return __tick_broadcast_oneshot_control(state); | 361 | return __tick_broadcast_oneshot_control(state); |
365 | } | 362 | } |
363 | EXPORT_SYMBOL_GPL(tick_broadcast_oneshot_control); | ||
366 | 364 | ||
367 | #ifdef CONFIG_HOTPLUG_CPU | 365 | #ifdef CONFIG_HOTPLUG_CPU |
368 | /* | 366 | /* |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index c792429e98c6..3319e16f31e5 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -197,27 +197,9 @@ static bool can_stop_full_tick(void) | |||
197 | return true; | 197 | return true; |
198 | } | 198 | } |
199 | 199 | ||
200 | static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now); | ||
201 | |||
202 | /* | ||
203 | * Re-evaluate the need for the tick on the current CPU | ||
204 | * and restart it if necessary. | ||
205 | */ | ||
206 | void __tick_nohz_full_check(void) | ||
207 | { | ||
208 | struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); | ||
209 | |||
210 | if (tick_nohz_full_cpu(smp_processor_id())) { | ||
211 | if (ts->tick_stopped && !is_idle_task(current)) { | ||
212 | if (!can_stop_full_tick()) | ||
213 | tick_nohz_restart_sched_tick(ts, ktime_get()); | ||
214 | } | ||
215 | } | ||
216 | } | ||
217 | |||
218 | static void nohz_full_kick_work_func(struct irq_work *work) | 200 | static void nohz_full_kick_work_func(struct irq_work *work) |
219 | { | 201 | { |
220 | __tick_nohz_full_check(); | 202 | /* Empty, the tick restart happens on tick_nohz_irq_exit() */ |
221 | } | 203 | } |
222 | 204 | ||
223 | static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { | 205 | static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { |
@@ -252,7 +234,7 @@ void tick_nohz_full_kick_cpu(int cpu) | |||
252 | 234 | ||
253 | static void nohz_full_kick_ipi(void *info) | 235 | static void nohz_full_kick_ipi(void *info) |
254 | { | 236 | { |
255 | __tick_nohz_full_check(); | 237 | /* Empty, the tick restart happens on tick_nohz_irq_exit() */ |
256 | } | 238 | } |
257 | 239 | ||
258 | /* | 240 | /* |
@@ -276,7 +258,7 @@ void tick_nohz_full_kick_all(void) | |||
276 | * It might need the tick due to per task/process properties: | 258 | * It might need the tick due to per task/process properties: |
277 | * perf events, posix cpu timers, ... | 259 | * perf events, posix cpu timers, ... |
278 | */ | 260 | */ |
279 | void __tick_nohz_task_switch(struct task_struct *tsk) | 261 | void __tick_nohz_task_switch(void) |
280 | { | 262 | { |
281 | unsigned long flags; | 263 | unsigned long flags; |
282 | 264 | ||
@@ -705,21 +687,38 @@ out: | |||
705 | return tick; | 687 | return tick; |
706 | } | 688 | } |
707 | 689 | ||
708 | static void tick_nohz_full_stop_tick(struct tick_sched *ts) | 690 | static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) |
691 | { | ||
692 | /* Update jiffies first */ | ||
693 | tick_do_update_jiffies64(now); | ||
694 | update_cpu_load_nohz(); | ||
695 | |||
696 | calc_load_exit_idle(); | ||
697 | touch_softlockup_watchdog(); | ||
698 | /* | ||
699 | * Cancel the scheduled timer and restore the tick | ||
700 | */ | ||
701 | ts->tick_stopped = 0; | ||
702 | ts->idle_exittime = now; | ||
703 | |||
704 | tick_nohz_restart(ts, now); | ||
705 | } | ||
706 | |||
707 | static void tick_nohz_full_update_tick(struct tick_sched *ts) | ||
709 | { | 708 | { |
710 | #ifdef CONFIG_NO_HZ_FULL | 709 | #ifdef CONFIG_NO_HZ_FULL |
711 | int cpu = smp_processor_id(); | 710 | int cpu = smp_processor_id(); |
712 | 711 | ||
713 | if (!tick_nohz_full_cpu(cpu) || is_idle_task(current)) | 712 | if (!tick_nohz_full_cpu(cpu)) |
714 | return; | 713 | return; |
715 | 714 | ||
716 | if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE) | 715 | if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE) |
717 | return; | 716 | return; |
718 | 717 | ||
719 | if (!can_stop_full_tick()) | 718 | if (can_stop_full_tick()) |
720 | return; | 719 | tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); |
721 | 720 | else if (ts->tick_stopped) | |
722 | tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); | 721 | tick_nohz_restart_sched_tick(ts, ktime_get()); |
723 | #endif | 722 | #endif |
724 | } | 723 | } |
725 | 724 | ||
@@ -849,7 +848,7 @@ void tick_nohz_irq_exit(void) | |||
849 | if (ts->inidle) | 848 | if (ts->inidle) |
850 | __tick_nohz_idle_enter(ts); | 849 | __tick_nohz_idle_enter(ts); |
851 | else | 850 | else |
852 | tick_nohz_full_stop_tick(ts); | 851 | tick_nohz_full_update_tick(ts); |
853 | } | 852 | } |
854 | 853 | ||
855 | /** | 854 | /** |
@@ -864,23 +863,6 @@ ktime_t tick_nohz_get_sleep_length(void) | |||
864 | return ts->sleep_length; | 863 | return ts->sleep_length; |
865 | } | 864 | } |
866 | 865 | ||
867 | static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) | ||
868 | { | ||
869 | /* Update jiffies first */ | ||
870 | tick_do_update_jiffies64(now); | ||
871 | update_cpu_load_nohz(); | ||
872 | |||
873 | calc_load_exit_idle(); | ||
874 | touch_softlockup_watchdog(); | ||
875 | /* | ||
876 | * Cancel the scheduled timer and restore the tick | ||
877 | */ | ||
878 | ts->tick_stopped = 0; | ||
879 | ts->idle_exittime = now; | ||
880 | |||
881 | tick_nohz_restart(ts, now); | ||
882 | } | ||
883 | |||
884 | static void tick_nohz_account_idle_ticks(struct tick_sched *ts) | 866 | static void tick_nohz_account_idle_ticks(struct tick_sched *ts) |
885 | { | 867 | { |
886 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE | 868 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE |
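
The tick-sched.c hunks fold the old stop-only path and the separate __tick_nohz_full_check() restart path into one tick_nohz_full_update_tick() called from tick_nohz_irq_exit(): stop the tick if nothing depends on it, restart it if it is stopped but a dependency appeared, and let the kick IPI/irq_work handlers stay empty. A minimal userspace sketch of that decision shape (none of these names are the kernel API, and the dependency check is reduced to a single flag):

#include <stdbool.h>
#include <stdio.h>

struct fake_tick_sched {
        bool tick_stopped;
};

static bool can_stop_full_tick(bool cpu_has_tick_dependency)
{
        /* stand-in for the real checks (perf events, posix cpu timers, ...) */
        return !cpu_has_tick_dependency;
}

static void full_update_tick(struct fake_tick_sched *ts, bool has_dependency)
{
        if (can_stop_full_tick(has_dependency)) {
                ts->tick_stopped = true;        /* tick_nohz_stop_sched_tick() */
                printf("tick stopped\n");
        } else if (ts->tick_stopped) {
                ts->tick_stopped = false;       /* tick_nohz_restart_sched_tick() */
                printf("tick restarted\n");
        }
}

int main(void)
{
        struct fake_tick_sched ts = { .tick_stopped = false };

        full_update_tick(&ts, false);   /* nothing needs the tick: stop it    */
        full_update_tick(&ts, true);    /* a dependency appeared: restart it  */
        return 0;
}
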
diff --git a/kernel/time/time.c b/kernel/time/time.c index 85d5bb1d67eb..86751c68e08d 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c | |||
@@ -268,10 +268,14 @@ EXPORT_SYMBOL(jiffies_to_msecs); | |||
268 | 268 | ||
269 | unsigned int jiffies_to_usecs(const unsigned long j) | 269 | unsigned int jiffies_to_usecs(const unsigned long j) |
270 | { | 270 | { |
271 | #if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) | 271 | /* |
272 | * Hz usually doesn't go much further MSEC_PER_SEC. | ||
273 | * jiffies_to_usecs() and usecs_to_jiffies() depend on that. | ||
274 | */ | ||
275 | BUILD_BUG_ON(HZ > USEC_PER_SEC); | ||
276 | |||
277 | #if !(USEC_PER_SEC % HZ) | ||
272 | return (USEC_PER_SEC / HZ) * j; | 278 | return (USEC_PER_SEC / HZ) * j; |
273 | #elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) | ||
274 | return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC); | ||
275 | #else | 279 | #else |
276 | # if BITS_PER_LONG == 32 | 280 | # if BITS_PER_LONG == 32 |
277 | return (HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32; | 281 | return (HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32; |
@@ -287,26 +291,20 @@ EXPORT_SYMBOL(jiffies_to_usecs); | |||
287 | * @t: Timespec | 291 | * @t: Timespec |
288 | * @gran: Granularity in ns. | 292 | * @gran: Granularity in ns. |
289 | * | 293 | * |
290 | * Truncate a timespec to a granularity. gran must be smaller than a second. | 294 | * Truncate a timespec to a granularity. Always rounds down. gran must |
291 | * Always rounds down. | 295 | * not be 0 nor greater than a second (NSEC_PER_SEC, or 10^9 ns). |
292 | * | ||
293 | * This function should be only used for timestamps returned by | ||
294 | * current_kernel_time() or CURRENT_TIME, not with do_gettimeofday() because | ||
295 | * it doesn't handle the better resolution of the latter. | ||
296 | */ | 296 | */ |
297 | struct timespec timespec_trunc(struct timespec t, unsigned gran) | 297 | struct timespec timespec_trunc(struct timespec t, unsigned gran) |
298 | { | 298 | { |
299 | /* | 299 | /* Avoid division in the common cases 1 ns and 1 s. */ |
300 | * Division is pretty slow so avoid it for common cases. | 300 | if (gran == 1) { |
301 | * Currently current_kernel_time() never returns better than | ||
302 | * jiffies resolution. Exploit that. | ||
303 | */ | ||
304 | if (gran <= jiffies_to_usecs(1) * 1000) { | ||
305 | /* nothing */ | 301 | /* nothing */ |
306 | } else if (gran == 1000000000) { | 302 | } else if (gran == NSEC_PER_SEC) { |
307 | t.tv_nsec = 0; | 303 | t.tv_nsec = 0; |
308 | } else { | 304 | } else if (gran > 1 && gran < NSEC_PER_SEC) { |
309 | t.tv_nsec -= t.tv_nsec % gran; | 305 | t.tv_nsec -= t.tv_nsec % gran; |
306 | } else { | ||
307 | WARN(1, "illegal file time granularity: %u", gran); | ||
310 | } | 308 | } |
311 | return t; | 309 | return t; |
312 | } | 310 | } |
@@ -546,7 +544,7 @@ EXPORT_SYMBOL(__usecs_to_jiffies); | |||
546 | * value to a scaled second value. | 544 | * value to a scaled second value. |
547 | */ | 545 | */ |
548 | static unsigned long | 546 | static unsigned long |
549 | __timespec_to_jiffies(unsigned long sec, long nsec) | 547 | __timespec64_to_jiffies(u64 sec, long nsec) |
550 | { | 548 | { |
551 | nsec = nsec + TICK_NSEC - 1; | 549 | nsec = nsec + TICK_NSEC - 1; |
552 | 550 | ||
@@ -554,22 +552,27 @@ __timespec_to_jiffies(unsigned long sec, long nsec) | |||
554 | sec = MAX_SEC_IN_JIFFIES; | 552 | sec = MAX_SEC_IN_JIFFIES; |
555 | nsec = 0; | 553 | nsec = 0; |
556 | } | 554 | } |
557 | return (((u64)sec * SEC_CONVERSION) + | 555 | return ((sec * SEC_CONVERSION) + |
558 | (((u64)nsec * NSEC_CONVERSION) >> | 556 | (((u64)nsec * NSEC_CONVERSION) >> |
559 | (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; | 557 | (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; |
560 | 558 | ||
561 | } | 559 | } |
562 | 560 | ||
563 | unsigned long | 561 | static unsigned long |
564 | timespec_to_jiffies(const struct timespec *value) | 562 | __timespec_to_jiffies(unsigned long sec, long nsec) |
565 | { | 563 | { |
566 | return __timespec_to_jiffies(value->tv_sec, value->tv_nsec); | 564 | return __timespec64_to_jiffies((u64)sec, nsec); |
567 | } | 565 | } |
568 | 566 | ||
569 | EXPORT_SYMBOL(timespec_to_jiffies); | 567 | unsigned long |
568 | timespec64_to_jiffies(const struct timespec64 *value) | ||
569 | { | ||
570 | return __timespec64_to_jiffies(value->tv_sec, value->tv_nsec); | ||
571 | } | ||
572 | EXPORT_SYMBOL(timespec64_to_jiffies); | ||
570 | 573 | ||
571 | void | 574 | void |
572 | jiffies_to_timespec(const unsigned long jiffies, struct timespec *value) | 575 | jiffies_to_timespec64(const unsigned long jiffies, struct timespec64 *value) |
573 | { | 576 | { |
574 | /* | 577 | /* |
575 | * Convert jiffies to nanoseconds and separate with | 578 | * Convert jiffies to nanoseconds and separate with |
@@ -580,7 +583,7 @@ jiffies_to_timespec(const unsigned long jiffies, struct timespec *value) | |||
580 | NSEC_PER_SEC, &rem); | 583 | NSEC_PER_SEC, &rem); |
581 | value->tv_nsec = rem; | 584 | value->tv_nsec = rem; |
582 | } | 585 | } |
583 | EXPORT_SYMBOL(jiffies_to_timespec); | 586 | EXPORT_SYMBOL(jiffies_to_timespec64); |
584 | 587 | ||
585 | /* | 588 | /* |
586 | * We could use a similar algorithm to timespec_to_jiffies (with a | 589 | * We could use a similar algorithm to timespec_to_jiffies (with a |
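
The time.c hunks tighten two conversions: jiffies_to_usecs() now build-asserts HZ <= USEC_PER_SEC and keeps only the exact-division fast path, and timespec_trunc() validates that the granularity lies between 1 ns and one second instead of silently accepting anything. A small userspace sketch of both, using HZ = 250 purely as an example value (demo_* names are illustrative, not the kernel functions):

#include <assert.h>
#include <stdio.h>

#define HZ            250UL             /* example; any HZ that divides USEC_PER_SEC */
#define USEC_PER_SEC  1000000UL
#define NSEC_PER_SEC  1000000000UL

struct demo_timespec {
        long tv_sec;
        long tv_nsec;
};

/* Fast path only: valid because USEC_PER_SEC % HZ == 0 for this HZ. */
static unsigned long demo_jiffies_to_usecs(unsigned long j)
{
        assert(HZ <= USEC_PER_SEC && USEC_PER_SEC % HZ == 0);
        return (USEC_PER_SEC / HZ) * j;         /* 250 Hz -> 4000 us per jiffy */
}

/* Round a timestamp down to a granularity in ns; gran must be in [1, 1s]. */
static struct demo_timespec demo_timespec_trunc(struct demo_timespec t, unsigned gran)
{
        if (gran == 1) {
                /* nothing to do */
        } else if (gran == NSEC_PER_SEC) {
                t.tv_nsec = 0;
        } else if (gran > 1 && gran < NSEC_PER_SEC) {
                t.tv_nsec -= t.tv_nsec % gran;
        } else {
                fprintf(stderr, "illegal granularity: %u\n", gran);
        }
        return t;
}

int main(void)
{
        struct demo_timespec t = { .tv_sec = 5, .tv_nsec = 1234567 };

        printf("10 jiffies = %lu us\n", demo_jiffies_to_usecs(10));     /* 40000 */
        t = demo_timespec_trunc(t, 1000000);                            /* 1 ms gran */
        printf("truncated: %ld.%09ld\n", t.tv_sec, t.tv_nsec);          /* 5.001000000 */
        return 0;
}
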
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index bca3667a2de1..f6ee2e6b6f5d 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -911,6 +911,7 @@ int do_settimeofday64(const struct timespec64 *ts) | |||
911 | struct timekeeper *tk = &tk_core.timekeeper; | 911 | struct timekeeper *tk = &tk_core.timekeeper; |
912 | struct timespec64 ts_delta, xt; | 912 | struct timespec64 ts_delta, xt; |
913 | unsigned long flags; | 913 | unsigned long flags; |
914 | int ret = 0; | ||
914 | 915 | ||
915 | if (!timespec64_valid_strict(ts)) | 916 | if (!timespec64_valid_strict(ts)) |
916 | return -EINVAL; | 917 | return -EINVAL; |
@@ -924,10 +925,15 @@ int do_settimeofday64(const struct timespec64 *ts) | |||
924 | ts_delta.tv_sec = ts->tv_sec - xt.tv_sec; | 925 | ts_delta.tv_sec = ts->tv_sec - xt.tv_sec; |
925 | ts_delta.tv_nsec = ts->tv_nsec - xt.tv_nsec; | 926 | ts_delta.tv_nsec = ts->tv_nsec - xt.tv_nsec; |
926 | 927 | ||
928 | if (timespec64_compare(&tk->wall_to_monotonic, &ts_delta) > 0) { | ||
929 | ret = -EINVAL; | ||
930 | goto out; | ||
931 | } | ||
932 | |||
927 | tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts_delta)); | 933 | tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts_delta)); |
928 | 934 | ||
929 | tk_set_xtime(tk, ts); | 935 | tk_set_xtime(tk, ts); |
930 | 936 | out: | |
931 | timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); | 937 | timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); |
932 | 938 | ||
933 | write_seqcount_end(&tk_core.seq); | 939 | write_seqcount_end(&tk_core.seq); |
@@ -936,7 +942,7 @@ int do_settimeofday64(const struct timespec64 *ts) | |||
936 | /* signal hrtimers about time change */ | 942 | /* signal hrtimers about time change */ |
937 | clock_was_set(); | 943 | clock_was_set(); |
938 | 944 | ||
939 | return 0; | 945 | return ret; |
940 | } | 946 | } |
941 | EXPORT_SYMBOL(do_settimeofday64); | 947 | EXPORT_SYMBOL(do_settimeofday64); |
942 | 948 | ||
@@ -965,7 +971,8 @@ int timekeeping_inject_offset(struct timespec *ts) | |||
965 | 971 | ||
966 | /* Make sure the proposed value is valid */ | 972 | /* Make sure the proposed value is valid */ |
967 | tmp = timespec64_add(tk_xtime(tk), ts64); | 973 | tmp = timespec64_add(tk_xtime(tk), ts64); |
968 | if (!timespec64_valid_strict(&tmp)) { | 974 | if (timespec64_compare(&tk->wall_to_monotonic, &ts64) > 0 || |
975 | !timespec64_valid_strict(&tmp)) { | ||
969 | ret = -EINVAL; | 976 | ret = -EINVAL; |
970 | goto error; | 977 | goto error; |
971 | } | 978 | } |
@@ -1874,7 +1881,7 @@ struct timespec __current_kernel_time(void) | |||
1874 | return timespec64_to_timespec(tk_xtime(tk)); | 1881 | return timespec64_to_timespec(tk_xtime(tk)); |
1875 | } | 1882 | } |
1876 | 1883 | ||
1877 | struct timespec current_kernel_time(void) | 1884 | struct timespec64 current_kernel_time64(void) |
1878 | { | 1885 | { |
1879 | struct timekeeper *tk = &tk_core.timekeeper; | 1886 | struct timekeeper *tk = &tk_core.timekeeper; |
1880 | struct timespec64 now; | 1887 | struct timespec64 now; |
@@ -1886,9 +1893,9 @@ struct timespec current_kernel_time(void) | |||
1886 | now = tk_xtime(tk); | 1893 | now = tk_xtime(tk); |
1887 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 1894 | } while (read_seqcount_retry(&tk_core.seq, seq)); |
1888 | 1895 | ||
1889 | return timespec64_to_timespec(now); | 1896 | return now; |
1890 | } | 1897 | } |
1891 | EXPORT_SYMBOL(current_kernel_time); | 1898 | EXPORT_SYMBOL(current_kernel_time64); |
1892 | 1899 | ||
1893 | struct timespec64 get_monotonic_coarse64(void) | 1900 | struct timespec64 get_monotonic_coarse64(void) |
1894 | { | 1901 | { |
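
The timekeeping hunks add the same sanity check to do_settimeofday64() and timekeeping_inject_offset(): a requested wall-clock change is rejected with -EINVAL when it would compare past wall_to_monotonic, which roughly means setting the clock to before the boot-relative monotonic origin. A userspace sketch of just that comparison (the demo_* names and the simplified timespec type are illustrative):

#include <stdio.h>

struct demo_ts64 {
        long long tv_sec;
        long tv_nsec;
};

/* lexicographic compare in the style of timespec64_compare(): <0, 0, >0 */
static int demo_ts_compare(const struct demo_ts64 *lhs, const struct demo_ts64 *rhs)
{
        if (lhs->tv_sec != rhs->tv_sec)
                return lhs->tv_sec < rhs->tv_sec ? -1 : 1;
        return (int)(lhs->tv_nsec - rhs->tv_nsec);
}

/*
 * wall_to_monotonic is kept negative (monotonic = wall + wall_to_monotonic).
 * Moving the wall clock back by more than the current monotonic offset
 * would flip it positive, so such a delta is rejected.
 */
static int demo_settimeofday_check(struct demo_ts64 wall_to_mono,
                                   struct demo_ts64 delta)
{
        if (demo_ts_compare(&wall_to_mono, &delta) > 0)
                return -22;     /* -EINVAL */
        return 0;
}

int main(void)
{
        struct demo_ts64 w2m    = { .tv_sec = -100, .tv_nsec = 0 }; /* ~100s of uptime */
        struct demo_ts64 ok     = { .tv_sec =  -50, .tv_nsec = 0 }; /* move back 50s   */
        struct demo_ts64 toofar = { .tv_sec = -200, .tv_nsec = 0 }; /* move back 200s  */

        printf("small step back: %d\n", demo_settimeofday_check(w2m, ok));      /* 0   */
        printf("before boot:     %d\n", demo_settimeofday_check(w2m, toofar));  /* -22 */
        return 0;
}
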
diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 5e097fa9faf7..84190f02b521 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c | |||
@@ -807,8 +807,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, | |||
807 | spin_unlock(&base->lock); | 807 | spin_unlock(&base->lock); |
808 | base = new_base; | 808 | base = new_base; |
809 | spin_lock(&base->lock); | 809 | spin_lock(&base->lock); |
810 | timer->flags &= ~TIMER_BASEMASK; | 810 | WRITE_ONCE(timer->flags, |
811 | timer->flags |= base->cpu; | 811 | (timer->flags & ~TIMER_BASEMASK) | base->cpu); |
812 | } | 812 | } |
813 | } | 813 | } |
814 | 814 | ||
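
The timer.c hunk folds two read-modify-write statements on timer->flags into a single WRITE_ONCE() store, so a concurrent lockless reader can never observe the intermediate value with the base bits cleared. A minimal sketch of the idea; the macro below is a simplified stand-in for the kernel's WRITE_ONCE(), and the flag layout is invented for the example:

#include <stdio.h>

/* simplified stand-in: one compiler-visible store through a volatile lvalue */
#define DEMO_WRITE_ONCE(x, val) (*(volatile __typeof__(x) *)&(x) = (val))

#define TIMER_BASEMASK 0x000fffffu

static unsigned int timer_flags = 0x00100007u;  /* some flag bit + old cpu 7 */

static void migrate_to_cpu(unsigned int new_cpu)
{
        /*
         * Old pattern (two stores):
         *     timer_flags &= ~TIMER_BASEMASK;
         *     timer_flags |= new_cpu;
         * leaves a window where a reader sees flags with no CPU bits at all.
         * New pattern: compute the final value, publish it with one store.
         */
        DEMO_WRITE_ONCE(timer_flags, (timer_flags & ~TIMER_BASEMASK) | new_cpu);
}

int main(void)
{
        migrate_to_cpu(3);
        printf("flags = %#x\n", timer_flags);   /* 0x100003 */
        return 0;
}
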
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index a4536e1e3e2a..129c96033e46 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c | |||
@@ -137,7 +137,7 @@ print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) | |||
137 | (unsigned long long) ktime_to_ns(base->offset)); | 137 | (unsigned long long) ktime_to_ns(base->offset)); |
138 | #endif | 138 | #endif |
139 | SEQ_printf(m, "active timers:\n"); | 139 | SEQ_printf(m, "active timers:\n"); |
140 | print_active_timers(m, base, now); | 140 | print_active_timers(m, base, now + ktime_to_ns(base->offset)); |
141 | } | 141 | } |
142 | 142 | ||
143 | static void print_cpu(struct seq_file *m, int cpu, u64 now) | 143 | static void print_cpu(struct seq_file *m, int cpu, u64 now) |
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 3b9a48ae153a..1153c43428f3 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig | |||
@@ -434,7 +434,7 @@ config UPROBE_EVENT | |||
434 | 434 | ||
435 | config BPF_EVENTS | 435 | config BPF_EVENTS |
436 | depends on BPF_SYSCALL | 436 | depends on BPF_SYSCALL |
437 | depends on KPROBE_EVENT | 437 | depends on KPROBE_EVENT || UPROBE_EVENT |
438 | bool | 438 | bool |
439 | default y | 439 | default y |
440 | help | 440 | help |
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index b3e6b39b6cf9..90e72a0c3047 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c | |||
@@ -778,9 +778,6 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, | |||
778 | if (likely(!bt)) | 778 | if (likely(!bt)) |
779 | return; | 779 | return; |
780 | 780 | ||
781 | if (!error && !bio_flagged(bio, BIO_UPTODATE)) | ||
782 | error = EIO; | ||
783 | |||
784 | __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, | 781 | __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, |
785 | bio->bi_rw, what, error, 0, NULL); | 782 | bio->bi_rw, what, error, 0, NULL); |
786 | } | 783 | } |
@@ -887,8 +884,7 @@ static void blk_add_trace_split(void *ignore, | |||
887 | 884 | ||
888 | __blk_add_trace(bt, bio->bi_iter.bi_sector, | 885 | __blk_add_trace(bt, bio->bi_iter.bi_sector, |
889 | bio->bi_iter.bi_size, bio->bi_rw, BLK_TA_SPLIT, | 886 | bio->bi_iter.bi_size, bio->bi_rw, BLK_TA_SPLIT, |
890 | !bio_flagged(bio, BIO_UPTODATE), | 887 | bio->bi_error, sizeof(rpdu), &rpdu); |
891 | sizeof(rpdu), &rpdu); | ||
892 | } | 888 | } |
893 | } | 889 | } |
894 | 890 | ||
@@ -920,8 +916,8 @@ static void blk_add_trace_bio_remap(void *ignore, | |||
920 | r.sector_from = cpu_to_be64(from); | 916 | r.sector_from = cpu_to_be64(from); |
921 | 917 | ||
922 | __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, | 918 | __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, |
923 | bio->bi_rw, BLK_TA_REMAP, | 919 | bio->bi_rw, BLK_TA_REMAP, bio->bi_error, |
924 | !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r); | 920 | sizeof(r), &r); |
925 | } | 921 | } |
926 | 922 | ||
927 | /** | 923 | /** |
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 88a041adee90..0fe96c7c8803 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c | |||
@@ -81,13 +81,16 @@ static const struct bpf_func_proto bpf_probe_read_proto = { | |||
81 | 81 | ||
82 | /* | 82 | /* |
83 | * limited trace_printk() | 83 | * limited trace_printk() |
84 | * only %d %u %x %ld %lu %lx %lld %llu %llx %p conversion specifiers allowed | 84 | * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed |
85 | */ | 85 | */ |
86 | static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) | 86 | static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) |
87 | { | 87 | { |
88 | char *fmt = (char *) (long) r1; | 88 | char *fmt = (char *) (long) r1; |
89 | bool str_seen = false; | ||
89 | int mod[3] = {}; | 90 | int mod[3] = {}; |
90 | int fmt_cnt = 0; | 91 | int fmt_cnt = 0; |
92 | u64 unsafe_addr; | ||
93 | char buf[64]; | ||
91 | int i; | 94 | int i; |
92 | 95 | ||
93 | /* | 96 | /* |
@@ -114,12 +117,37 @@ static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) | |||
114 | if (fmt[i] == 'l') { | 117 | if (fmt[i] == 'l') { |
115 | mod[fmt_cnt]++; | 118 | mod[fmt_cnt]++; |
116 | i++; | 119 | i++; |
117 | } else if (fmt[i] == 'p') { | 120 | } else if (fmt[i] == 'p' || fmt[i] == 's') { |
118 | mod[fmt_cnt]++; | 121 | mod[fmt_cnt]++; |
119 | i++; | 122 | i++; |
120 | if (!isspace(fmt[i]) && !ispunct(fmt[i]) && fmt[i] != 0) | 123 | if (!isspace(fmt[i]) && !ispunct(fmt[i]) && fmt[i] != 0) |
121 | return -EINVAL; | 124 | return -EINVAL; |
122 | fmt_cnt++; | 125 | fmt_cnt++; |
126 | if (fmt[i - 1] == 's') { | ||
127 | if (str_seen) | ||
128 | /* allow only one '%s' per fmt string */ | ||
129 | return -EINVAL; | ||
130 | str_seen = true; | ||
131 | |||
132 | switch (fmt_cnt) { | ||
133 | case 1: | ||
134 | unsafe_addr = r3; | ||
135 | r3 = (long) buf; | ||
136 | break; | ||
137 | case 2: | ||
138 | unsafe_addr = r4; | ||
139 | r4 = (long) buf; | ||
140 | break; | ||
141 | case 3: | ||
142 | unsafe_addr = r5; | ||
143 | r5 = (long) buf; | ||
144 | break; | ||
145 | } | ||
146 | buf[0] = 0; | ||
147 | strncpy_from_unsafe(buf, | ||
148 | (void *) (long) unsafe_addr, | ||
149 | sizeof(buf)); | ||
150 | } | ||
123 | continue; | 151 | continue; |
124 | } | 152 | } |
125 | 153 | ||
@@ -158,6 +186,35 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void) | |||
158 | return &bpf_trace_printk_proto; | 186 | return &bpf_trace_printk_proto; |
159 | } | 187 | } |
160 | 188 | ||
189 | static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5) | ||
190 | { | ||
191 | struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; | ||
192 | struct bpf_array *array = container_of(map, struct bpf_array, map); | ||
193 | struct perf_event *event; | ||
194 | |||
195 | if (unlikely(index >= array->map.max_entries)) | ||
196 | return -E2BIG; | ||
197 | |||
198 | event = (struct perf_event *)array->ptrs[index]; | ||
199 | if (!event) | ||
200 | return -ENOENT; | ||
201 | |||
202 | /* | ||
203 | * we don't know if the function is run successfully by the | ||
204 | * return value. It can be judged in other places, such as | ||
205 | * eBPF programs. | ||
206 | */ | ||
207 | return perf_event_read_local(event); | ||
208 | } | ||
209 | |||
210 | const struct bpf_func_proto bpf_perf_event_read_proto = { | ||
211 | .func = bpf_perf_event_read, | ||
212 | .gpl_only = false, | ||
213 | .ret_type = RET_INTEGER, | ||
214 | .arg1_type = ARG_CONST_MAP_PTR, | ||
215 | .arg2_type = ARG_ANYTHING, | ||
216 | }; | ||
217 | |||
161 | static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id) | 218 | static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id) |
162 | { | 219 | { |
163 | switch (func_id) { | 220 | switch (func_id) { |
@@ -183,6 +240,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func | |||
183 | return bpf_get_trace_printk_proto(); | 240 | return bpf_get_trace_printk_proto(); |
184 | case BPF_FUNC_get_smp_processor_id: | 241 | case BPF_FUNC_get_smp_processor_id: |
185 | return &bpf_get_smp_processor_id_proto; | 242 | return &bpf_get_smp_processor_id_proto; |
243 | case BPF_FUNC_perf_event_read: | ||
244 | return &bpf_perf_event_read_proto; | ||
186 | default: | 245 | default: |
187 | return NULL; | 246 | return NULL; |
188 | } | 247 | } |
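
The bpf_trace.c hunks let bpf_trace_printk() accept a single %s by copying the untrusted pointer into an on-stack buffer with strncpy_from_unsafe() before the real printk runs, and add the bpf_perf_event_read helper. The userspace sketch below mirrors only the format-validation rule (supported specifiers, at most one %s); it deliberately omits the unsafe-copy step, and the demo_* names are not the kernel API:

#include <stdbool.h>
#include <stdio.h>

/* Return 0 if fmt uses only supported specifiers and at most one %s,
 * otherwise -EINVAL. */
static int demo_check_trace_printk_fmt(const char *fmt)
{
        bool str_seen = false;
        int i;

        for (i = 0; fmt[i]; i++) {
                if (fmt[i] != '%')
                        continue;
                i++;
                while (fmt[i] == 'l')           /* allow %l / %ll prefixes */
                        i++;
                switch (fmt[i]) {
                case 'd': case 'u': case 'x': case 'p':
                        break;
                case 's':
                        if (str_seen)           /* only one %s per format */
                                return -22;     /* -EINVAL */
                        str_seen = true;
                        break;
                default:
                        return -22;
                }
        }
        return 0;
}

int main(void)
{
        printf("%d\n", demo_check_trace_printk_fmt("pid %d comm %s\n"));     /* 0   */
        printf("%d\n", demo_check_trace_printk_fmt("two strings %s %s\n"));  /* -22 */
        printf("%d\n", demo_check_trace_printk_fmt("float %f\n"));           /* -22 */
        return 0;
}
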
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 02bece4a99ea..b0623ac785a2 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
@@ -98,6 +98,13 @@ struct ftrace_pid { | |||
98 | struct pid *pid; | 98 | struct pid *pid; |
99 | }; | 99 | }; |
100 | 100 | ||
101 | static bool ftrace_pids_enabled(void) | ||
102 | { | ||
103 | return !list_empty(&ftrace_pids); | ||
104 | } | ||
105 | |||
106 | static void ftrace_update_trampoline(struct ftrace_ops *ops); | ||
107 | |||
101 | /* | 108 | /* |
102 | * ftrace_disabled is set when an anomaly is discovered. | 109 | * ftrace_disabled is set when an anomaly is discovered. |
103 | * ftrace_disabled is much stronger than ftrace_enabled. | 110 | * ftrace_disabled is much stronger than ftrace_enabled. |
@@ -109,7 +116,6 @@ static DEFINE_MUTEX(ftrace_lock); | |||
109 | static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end; | 116 | static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end; |
110 | static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; | 117 | static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; |
111 | ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; | 118 | ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; |
112 | ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; | ||
113 | static struct ftrace_ops global_ops; | 119 | static struct ftrace_ops global_ops; |
114 | static struct ftrace_ops control_ops; | 120 | static struct ftrace_ops control_ops; |
115 | 121 | ||
@@ -183,14 +189,7 @@ static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, | |||
183 | if (!test_tsk_trace_trace(current)) | 189 | if (!test_tsk_trace_trace(current)) |
184 | return; | 190 | return; |
185 | 191 | ||
186 | ftrace_pid_function(ip, parent_ip, op, regs); | 192 | op->saved_func(ip, parent_ip, op, regs); |
187 | } | ||
188 | |||
189 | static void set_ftrace_pid_function(ftrace_func_t func) | ||
190 | { | ||
191 | /* do not set ftrace_pid_function to itself! */ | ||
192 | if (func != ftrace_pid_func) | ||
193 | ftrace_pid_function = func; | ||
194 | } | 193 | } |
195 | 194 | ||
196 | /** | 195 | /** |
@@ -202,7 +201,6 @@ static void set_ftrace_pid_function(ftrace_func_t func) | |||
202 | void clear_ftrace_function(void) | 201 | void clear_ftrace_function(void) |
203 | { | 202 | { |
204 | ftrace_trace_function = ftrace_stub; | 203 | ftrace_trace_function = ftrace_stub; |
205 | ftrace_pid_function = ftrace_stub; | ||
206 | } | 204 | } |
207 | 205 | ||
208 | static void control_ops_disable_all(struct ftrace_ops *ops) | 206 | static void control_ops_disable_all(struct ftrace_ops *ops) |
@@ -436,6 +434,12 @@ static int __register_ftrace_function(struct ftrace_ops *ops) | |||
436 | } else | 434 | } else |
437 | add_ftrace_ops(&ftrace_ops_list, ops); | 435 | add_ftrace_ops(&ftrace_ops_list, ops); |
438 | 436 | ||
437 | /* Always save the function, and reset at unregistering */ | ||
438 | ops->saved_func = ops->func; | ||
439 | |||
440 | if (ops->flags & FTRACE_OPS_FL_PID && ftrace_pids_enabled()) | ||
441 | ops->func = ftrace_pid_func; | ||
442 | |||
439 | ftrace_update_trampoline(ops); | 443 | ftrace_update_trampoline(ops); |
440 | 444 | ||
441 | if (ftrace_enabled) | 445 | if (ftrace_enabled) |
@@ -463,15 +467,28 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) | |||
463 | if (ftrace_enabled) | 467 | if (ftrace_enabled) |
464 | update_ftrace_function(); | 468 | update_ftrace_function(); |
465 | 469 | ||
470 | ops->func = ops->saved_func; | ||
471 | |||
466 | return 0; | 472 | return 0; |
467 | } | 473 | } |
468 | 474 | ||
469 | static void ftrace_update_pid_func(void) | 475 | static void ftrace_update_pid_func(void) |
470 | { | 476 | { |
477 | bool enabled = ftrace_pids_enabled(); | ||
478 | struct ftrace_ops *op; | ||
479 | |||
471 | /* Only do something if we are tracing something */ | 480 | /* Only do something if we are tracing something */ |
472 | if (ftrace_trace_function == ftrace_stub) | 481 | if (ftrace_trace_function == ftrace_stub) |
473 | return; | 482 | return; |
474 | 483 | ||
484 | do_for_each_ftrace_op(op, ftrace_ops_list) { | ||
485 | if (op->flags & FTRACE_OPS_FL_PID) { | ||
486 | op->func = enabled ? ftrace_pid_func : | ||
487 | op->saved_func; | ||
488 | ftrace_update_trampoline(op); | ||
489 | } | ||
490 | } while_for_each_ftrace_op(op); | ||
491 | |||
475 | update_ftrace_function(); | 492 | update_ftrace_function(); |
476 | } | 493 | } |
477 | 494 | ||
@@ -613,13 +630,18 @@ static int function_stat_show(struct seq_file *m, void *v) | |||
613 | goto out; | 630 | goto out; |
614 | } | 631 | } |
615 | 632 | ||
633 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | ||
634 | avg = rec->time; | ||
635 | do_div(avg, rec->counter); | ||
636 | if (tracing_thresh && (avg < tracing_thresh)) | ||
637 | goto out; | ||
638 | #endif | ||
639 | |||
616 | kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); | 640 | kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); |
617 | seq_printf(m, " %-30.30s %10lu", str, rec->counter); | 641 | seq_printf(m, " %-30.30s %10lu", str, rec->counter); |
618 | 642 | ||
619 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | 643 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER |
620 | seq_puts(m, " "); | 644 | seq_puts(m, " "); |
621 | avg = rec->time; | ||
622 | do_div(avg, rec->counter); | ||
623 | 645 | ||
624 | /* Sample standard deviation (s^2) */ | 646 | /* Sample standard deviation (s^2) */ |
625 | if (rec->counter <= 1) | 647 | if (rec->counter <= 1) |
@@ -1133,7 +1155,8 @@ static struct ftrace_ops global_ops = { | |||
1133 | .local_hash.filter_hash = EMPTY_HASH, | 1155 | .local_hash.filter_hash = EMPTY_HASH, |
1134 | INIT_OPS_HASH(global_ops) | 1156 | INIT_OPS_HASH(global_ops) |
1135 | .flags = FTRACE_OPS_FL_RECURSION_SAFE | | 1157 | .flags = FTRACE_OPS_FL_RECURSION_SAFE | |
1136 | FTRACE_OPS_FL_INITIALIZED, | 1158 | FTRACE_OPS_FL_INITIALIZED | |
1159 | FTRACE_OPS_FL_PID, | ||
1137 | }; | 1160 | }; |
1138 | 1161 | ||
1139 | /* | 1162 | /* |
@@ -5023,7 +5046,9 @@ static void ftrace_update_trampoline(struct ftrace_ops *ops) | |||
5023 | 5046 | ||
5024 | static struct ftrace_ops global_ops = { | 5047 | static struct ftrace_ops global_ops = { |
5025 | .func = ftrace_stub, | 5048 | .func = ftrace_stub, |
5026 | .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, | 5049 | .flags = FTRACE_OPS_FL_RECURSION_SAFE | |
5050 | FTRACE_OPS_FL_INITIALIZED | | ||
5051 | FTRACE_OPS_FL_PID, | ||
5027 | }; | 5052 | }; |
5028 | 5053 | ||
5029 | static int __init ftrace_nodyn_init(void) | 5054 | static int __init ftrace_nodyn_init(void) |
@@ -5080,11 +5105,6 @@ void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func) | |||
5080 | if (WARN_ON(tr->ops->func != ftrace_stub)) | 5105 | if (WARN_ON(tr->ops->func != ftrace_stub)) |
5081 | printk("ftrace ops had %pS for function\n", | 5106 | printk("ftrace ops had %pS for function\n", |
5082 | tr->ops->func); | 5107 | tr->ops->func); |
5083 | /* Only the top level instance does pid tracing */ | ||
5084 | if (!list_empty(&ftrace_pids)) { | ||
5085 | set_ftrace_pid_function(func); | ||
5086 | func = ftrace_pid_func; | ||
5087 | } | ||
5088 | } | 5108 | } |
5089 | tr->ops->func = func; | 5109 | tr->ops->func = func; |
5090 | tr->ops->private = tr; | 5110 | tr->ops->private = tr; |
@@ -5371,7 +5391,7 @@ static void *fpid_start(struct seq_file *m, loff_t *pos) | |||
5371 | { | 5391 | { |
5372 | mutex_lock(&ftrace_lock); | 5392 | mutex_lock(&ftrace_lock); |
5373 | 5393 | ||
5374 | if (list_empty(&ftrace_pids) && (!*pos)) | 5394 | if (!ftrace_pids_enabled() && (!*pos)) |
5375 | return (void *) 1; | 5395 | return (void *) 1; |
5376 | 5396 | ||
5377 | return seq_list_start(&ftrace_pids, *pos); | 5397 | return seq_list_start(&ftrace_pids, *pos); |
@@ -5610,6 +5630,7 @@ static struct ftrace_ops graph_ops = { | |||
5610 | .func = ftrace_stub, | 5630 | .func = ftrace_stub, |
5611 | .flags = FTRACE_OPS_FL_RECURSION_SAFE | | 5631 | .flags = FTRACE_OPS_FL_RECURSION_SAFE | |
5612 | FTRACE_OPS_FL_INITIALIZED | | 5632 | FTRACE_OPS_FL_INITIALIZED | |
5633 | FTRACE_OPS_FL_PID | | ||
5613 | FTRACE_OPS_FL_STUB, | 5634 | FTRACE_OPS_FL_STUB, |
5614 | #ifdef FTRACE_GRAPH_TRAMP_ADDR | 5635 | #ifdef FTRACE_GRAPH_TRAMP_ADDR |
5615 | .trampoline = FTRACE_GRAPH_TRAMP_ADDR, | 5636 | .trampoline = FTRACE_GRAPH_TRAMP_ADDR, |
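
The ftrace hunks drop the single global ftrace_pid_function in favour of a per-ops saved_func: every ops flagged FTRACE_OPS_FL_PID keeps its original callback in saved_func, pid filtering swaps op->func to the filtering wrapper, and unregistering (or disabling filtering) restores the saved callback. A stripped-down sketch of that save/swap/restore pattern (all types and names here are illustrative, and the pid check is reduced to a flag):

#include <stdbool.h>
#include <stdio.h>

struct demo_ops;
typedef void (*demo_func_t)(struct demo_ops *op, const char *event);

struct demo_ops {
        demo_func_t func;        /* what the tracer actually calls   */
        demo_func_t saved_func;  /* the callback registered by users */
};

static bool pid_filtering_enabled;
static bool current_task_is_traced;

static void demo_callback(struct demo_ops *op, const char *event)
{
        printf("traced: %s\n", event);
}

/* wrapper installed while pid filtering is on; forwards conditionally */
static void demo_pid_func(struct demo_ops *op, const char *event)
{
        if (!current_task_is_traced)
                return;
        op->saved_func(op, event);
}

static void demo_update_pid_func(struct demo_ops *op)
{
        op->func = pid_filtering_enabled ? demo_pid_func : op->saved_func;
}

int main(void)
{
        struct demo_ops op = { .func = demo_callback, .saved_func = demo_callback };

        op.func(&op, "no filtering");            /* printed    */

        pid_filtering_enabled = true;
        demo_update_pid_func(&op);
        current_task_is_traced = false;
        op.func(&op, "filtered out");            /* suppressed */
        current_task_is_traced = true;
        op.func(&op, "matching pid");            /* printed    */

        pid_filtering_enabled = false;
        demo_update_pid_func(&op);               /* restore original callback */
        op.func(&op, "filtering off again");     /* printed    */
        return 0;
}
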
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 6260717c18e3..fc347f8b1bca 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
@@ -400,6 +400,17 @@ struct rb_irq_work { | |||
400 | }; | 400 | }; |
401 | 401 | ||
402 | /* | 402 | /* |
403 | * Structure to hold event state and handle nested events. | ||
404 | */ | ||
405 | struct rb_event_info { | ||
406 | u64 ts; | ||
407 | u64 delta; | ||
408 | unsigned long length; | ||
409 | struct buffer_page *tail_page; | ||
410 | int add_timestamp; | ||
411 | }; | ||
412 | |||
413 | /* | ||
403 | * Used for which event context the event is in. | 414 | * Used for which event context the event is in. |
404 | * NMI = 0 | 415 | * NMI = 0 |
405 | * IRQ = 1 | 416 | * IRQ = 1 |
@@ -1876,73 +1887,6 @@ rb_event_index(struct ring_buffer_event *event) | |||
1876 | return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE; | 1887 | return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE; |
1877 | } | 1888 | } |
1878 | 1889 | ||
1879 | static inline int | ||
1880 | rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, | ||
1881 | struct ring_buffer_event *event) | ||
1882 | { | ||
1883 | unsigned long addr = (unsigned long)event; | ||
1884 | unsigned long index; | ||
1885 | |||
1886 | index = rb_event_index(event); | ||
1887 | addr &= PAGE_MASK; | ||
1888 | |||
1889 | return cpu_buffer->commit_page->page == (void *)addr && | ||
1890 | rb_commit_index(cpu_buffer) == index; | ||
1891 | } | ||
1892 | |||
1893 | static void | ||
1894 | rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) | ||
1895 | { | ||
1896 | unsigned long max_count; | ||
1897 | |||
1898 | /* | ||
1899 | * We only race with interrupts and NMIs on this CPU. | ||
1900 | * If we own the commit event, then we can commit | ||
1901 | * all others that interrupted us, since the interruptions | ||
1902 | * are in stack format (they finish before they come | ||
1903 | * back to us). This allows us to do a simple loop to | ||
1904 | * assign the commit to the tail. | ||
1905 | */ | ||
1906 | again: | ||
1907 | max_count = cpu_buffer->nr_pages * 100; | ||
1908 | |||
1909 | while (cpu_buffer->commit_page != cpu_buffer->tail_page) { | ||
1910 | if (RB_WARN_ON(cpu_buffer, !(--max_count))) | ||
1911 | return; | ||
1912 | if (RB_WARN_ON(cpu_buffer, | ||
1913 | rb_is_reader_page(cpu_buffer->tail_page))) | ||
1914 | return; | ||
1915 | local_set(&cpu_buffer->commit_page->page->commit, | ||
1916 | rb_page_write(cpu_buffer->commit_page)); | ||
1917 | rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); | ||
1918 | cpu_buffer->write_stamp = | ||
1919 | cpu_buffer->commit_page->page->time_stamp; | ||
1920 | /* add barrier to keep gcc from optimizing too much */ | ||
1921 | barrier(); | ||
1922 | } | ||
1923 | while (rb_commit_index(cpu_buffer) != | ||
1924 | rb_page_write(cpu_buffer->commit_page)) { | ||
1925 | |||
1926 | local_set(&cpu_buffer->commit_page->page->commit, | ||
1927 | rb_page_write(cpu_buffer->commit_page)); | ||
1928 | RB_WARN_ON(cpu_buffer, | ||
1929 | local_read(&cpu_buffer->commit_page->page->commit) & | ||
1930 | ~RB_WRITE_MASK); | ||
1931 | barrier(); | ||
1932 | } | ||
1933 | |||
1934 | /* again, keep gcc from optimizing */ | ||
1935 | barrier(); | ||
1936 | |||
1937 | /* | ||
1938 | * If an interrupt came in just after the first while loop | ||
1939 | * and pushed the tail page forward, we will be left with | ||
1940 | * a dangling commit that will never go forward. | ||
1941 | */ | ||
1942 | if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page)) | ||
1943 | goto again; | ||
1944 | } | ||
1945 | |||
1946 | static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer) | 1890 | static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer) |
1947 | { | 1891 | { |
1948 | cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp; | 1892 | cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp; |
@@ -1968,64 +1912,6 @@ static void rb_inc_iter(struct ring_buffer_iter *iter) | |||
1968 | iter->head = 0; | 1912 | iter->head = 0; |
1969 | } | 1913 | } |
1970 | 1914 | ||
1971 | /* Slow path, do not inline */ | ||
1972 | static noinline struct ring_buffer_event * | ||
1973 | rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) | ||
1974 | { | ||
1975 | event->type_len = RINGBUF_TYPE_TIME_EXTEND; | ||
1976 | |||
1977 | /* Not the first event on the page? */ | ||
1978 | if (rb_event_index(event)) { | ||
1979 | event->time_delta = delta & TS_MASK; | ||
1980 | event->array[0] = delta >> TS_SHIFT; | ||
1981 | } else { | ||
1982 | /* nope, just zero it */ | ||
1983 | event->time_delta = 0; | ||
1984 | event->array[0] = 0; | ||
1985 | } | ||
1986 | |||
1987 | return skip_time_extend(event); | ||
1988 | } | ||
1989 | |||
1990 | /** | ||
1991 | * rb_update_event - update event type and data | ||
1992 | * @event: the event to update | ||
1993 | * @type: the type of event | ||
1994 | * @length: the size of the event field in the ring buffer | ||
1995 | * | ||
1996 | * Update the type and data fields of the event. The length | ||
1997 | * is the actual size that is written to the ring buffer, | ||
1998 | * and with this, we can determine what to place into the | ||
1999 | * data field. | ||
2000 | */ | ||
2001 | static void | ||
2002 | rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, | ||
2003 | struct ring_buffer_event *event, unsigned length, | ||
2004 | int add_timestamp, u64 delta) | ||
2005 | { | ||
2006 | /* Only a commit updates the timestamp */ | ||
2007 | if (unlikely(!rb_event_is_commit(cpu_buffer, event))) | ||
2008 | delta = 0; | ||
2009 | |||
2010 | /* | ||
2011 | * If we need to add a timestamp, then we | ||
2011 | * add it to the start of the reserved space. | ||
2013 | */ | ||
2014 | if (unlikely(add_timestamp)) { | ||
2015 | event = rb_add_time_stamp(event, delta); | ||
2016 | length -= RB_LEN_TIME_EXTEND; | ||
2017 | delta = 0; | ||
2018 | } | ||
2019 | |||
2020 | event->time_delta = delta; | ||
2021 | length -= RB_EVNT_HDR_SIZE; | ||
2022 | if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) { | ||
2023 | event->type_len = 0; | ||
2024 | event->array[0] = length; | ||
2025 | } else | ||
2026 | event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); | ||
2027 | } | ||
2028 | |||
2029 | /* | 1915 | /* |
2030 | * rb_handle_head_page - writer hit the head page | 1916 | * rb_handle_head_page - writer hit the head page |
2031 | * | 1917 | * |
@@ -2184,29 +2070,13 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer, | |||
2184 | return 0; | 2070 | return 0; |
2185 | } | 2071 | } |
2186 | 2072 | ||
2187 | static unsigned rb_calculate_event_length(unsigned length) | ||
2188 | { | ||
2189 | struct ring_buffer_event event; /* Used only for sizeof array */ | ||
2190 | |||
2191 | /* zero length can cause confusions */ | ||
2192 | if (!length) | ||
2193 | length++; | ||
2194 | |||
2195 | if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) | ||
2196 | length += sizeof(event.array[0]); | ||
2197 | |||
2198 | length += RB_EVNT_HDR_SIZE; | ||
2199 | length = ALIGN(length, RB_ARCH_ALIGNMENT); | ||
2200 | |||
2201 | return length; | ||
2202 | } | ||
2203 | |||
2204 | static inline void | 2073 | static inline void |
2205 | rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, | 2074 | rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, |
2206 | struct buffer_page *tail_page, | 2075 | unsigned long tail, struct rb_event_info *info) |
2207 | unsigned long tail, unsigned long length) | ||
2208 | { | 2076 | { |
2077 | struct buffer_page *tail_page = info->tail_page; | ||
2209 | struct ring_buffer_event *event; | 2078 | struct ring_buffer_event *event; |
2079 | unsigned long length = info->length; | ||
2210 | 2080 | ||
2211 | /* | 2081 | /* |
2212 | * Only the event that crossed the page boundary | 2082 | * Only the event that crossed the page boundary |
@@ -2276,13 +2146,14 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, | |||
2276 | */ | 2146 | */ |
2277 | static noinline struct ring_buffer_event * | 2147 | static noinline struct ring_buffer_event * |
2278 | rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, | 2148 | rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, |
2279 | unsigned long length, unsigned long tail, | 2149 | unsigned long tail, struct rb_event_info *info) |
2280 | struct buffer_page *tail_page, u64 ts) | ||
2281 | { | 2150 | { |
2151 | struct buffer_page *tail_page = info->tail_page; | ||
2282 | struct buffer_page *commit_page = cpu_buffer->commit_page; | 2152 | struct buffer_page *commit_page = cpu_buffer->commit_page; |
2283 | struct ring_buffer *buffer = cpu_buffer->buffer; | 2153 | struct ring_buffer *buffer = cpu_buffer->buffer; |
2284 | struct buffer_page *next_page; | 2154 | struct buffer_page *next_page; |
2285 | int ret; | 2155 | int ret; |
2156 | u64 ts; | ||
2286 | 2157 | ||
2287 | next_page = tail_page; | 2158 | next_page = tail_page; |
2288 | 2159 | ||
@@ -2368,74 +2239,120 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, | |||
2368 | 2239 | ||
2369 | out_again: | 2240 | out_again: |
2370 | 2241 | ||
2371 | rb_reset_tail(cpu_buffer, tail_page, tail, length); | 2242 | rb_reset_tail(cpu_buffer, tail, info); |
2372 | 2243 | ||
2373 | /* fail and let the caller try again */ | 2244 | /* fail and let the caller try again */ |
2374 | return ERR_PTR(-EAGAIN); | 2245 | return ERR_PTR(-EAGAIN); |
2375 | 2246 | ||
2376 | out_reset: | 2247 | out_reset: |
2377 | /* reset write */ | 2248 | /* reset write */ |
2378 | rb_reset_tail(cpu_buffer, tail_page, tail, length); | 2249 | rb_reset_tail(cpu_buffer, tail, info); |
2379 | 2250 | ||
2380 | return NULL; | 2251 | return NULL; |
2381 | } | 2252 | } |
2382 | 2253 | ||
2383 | static struct ring_buffer_event * | 2254 | /* Slow path, do not inline */ |
2384 | __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, | 2255 | static noinline struct ring_buffer_event * |
2385 | unsigned long length, u64 ts, | 2256 | rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) |
2386 | u64 delta, int add_timestamp) | ||
2387 | { | 2257 | { |
2388 | struct buffer_page *tail_page; | 2258 | event->type_len = RINGBUF_TYPE_TIME_EXTEND; |
2389 | struct ring_buffer_event *event; | ||
2390 | unsigned long tail, write; | ||
2391 | 2259 | ||
2392 | /* | 2260 | /* Not the first event on the page? */ |
2393 | * If the time delta since the last event is too big to | 2261 | if (rb_event_index(event)) { |
2394 | * hold in the time field of the event, then we append a | 2262 | event->time_delta = delta & TS_MASK; |
2395 | * TIME EXTEND event ahead of the data event. | 2263 | event->array[0] = delta >> TS_SHIFT; |
2396 | */ | 2264 | } else { |
2397 | if (unlikely(add_timestamp)) | 2265 | /* nope, just zero it */ |
2398 | length += RB_LEN_TIME_EXTEND; | 2266 | event->time_delta = 0; |
2267 | event->array[0] = 0; | ||
2268 | } | ||
2399 | 2269 | ||
2400 | tail_page = cpu_buffer->tail_page; | 2270 | return skip_time_extend(event); |
2401 | write = local_add_return(length, &tail_page->write); | 2271 | } |
2402 | 2272 | ||
2403 | /* set write to only the index of the write */ | 2273 | static inline int rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, |
2404 | write &= RB_WRITE_MASK; | 2274 | struct ring_buffer_event *event); |
2405 | tail = write - length; | 2275 | |
2276 | /** | ||
2277 | * rb_update_event - update event type and data | ||
2278 | * @event: the event to update | ||
2279 | * @type: the type of event | ||
2280 | * @length: the size of the event field in the ring buffer | ||
2281 | * | ||
2282 | * Update the type and data fields of the event. The length | ||
2283 | * is the actual size that is written to the ring buffer, | ||
2284 | * and with this, we can determine what to place into the | ||
2285 | * data field. | ||
2286 | */ | ||
2287 | static void | ||
2288 | rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, | ||
2289 | struct ring_buffer_event *event, | ||
2290 | struct rb_event_info *info) | ||
2291 | { | ||
2292 | unsigned length = info->length; | ||
2293 | u64 delta = info->delta; | ||
2294 | |||
2295 | /* Only a commit updates the timestamp */ | ||
2296 | if (unlikely(!rb_event_is_commit(cpu_buffer, event))) | ||
2297 | delta = 0; | ||
2406 | 2298 | ||
2407 | /* | 2299 | /* |
2408 | * If this is the first commit on the page, then it has the same | 2300 | * If we need to add a timestamp, then we |
2410 | * timestamp as the page itself. | 2301 | * add it to the start of the reserved space. |
2410 | */ | 2302 | */ |
2411 | if (!tail) | 2303 | if (unlikely(info->add_timestamp)) { |
2304 | event = rb_add_time_stamp(event, delta); | ||
2305 | length -= RB_LEN_TIME_EXTEND; | ||
2412 | delta = 0; | 2306 | delta = 0; |
2307 | } | ||
2413 | 2308 | ||
2414 | /* See if we shot pass the end of this buffer page */ | 2309 | event->time_delta = delta; |
2415 | if (unlikely(write > BUF_PAGE_SIZE)) | 2310 | length -= RB_EVNT_HDR_SIZE; |
2416 | return rb_move_tail(cpu_buffer, length, tail, | 2311 | if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) { |
2417 | tail_page, ts); | 2312 | event->type_len = 0; |
2313 | event->array[0] = length; | ||
2314 | } else | ||
2315 | event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); | ||
2316 | } | ||
2418 | 2317 | ||
2419 | /* We reserved something on the buffer */ | 2318 | static unsigned rb_calculate_event_length(unsigned length) |
2319 | { | ||
2320 | struct ring_buffer_event event; /* Used only for sizeof array */ | ||
2420 | 2321 | ||
2421 | event = __rb_page_index(tail_page, tail); | 2322 | /* zero length can cause confusions */ |
2422 | kmemcheck_annotate_bitfield(event, bitfield); | 2323 | if (!length) |
2423 | rb_update_event(cpu_buffer, event, length, add_timestamp, delta); | 2324 | length++; |
2424 | 2325 | ||
2425 | local_inc(&tail_page->entries); | 2326 | if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) |
2327 | length += sizeof(event.array[0]); | ||
2328 | |||
2329 | length += RB_EVNT_HDR_SIZE; | ||
2330 | length = ALIGN(length, RB_ARCH_ALIGNMENT); | ||
2426 | 2331 | ||
2427 | /* | 2332 | /* |
2428 | * If this is the first commit on the page, then update | 2333 | * In case the time delta is larger than the 27 bits for it |
2429 | * its timestamp. | 2334 | * in the header, we need to add a timestamp. If another |
2335 | * event comes in when trying to discard this one to increase | ||
2336 | * the length, then the timestamp will be added in the allocated | ||
2337 | * space of this event. If length is bigger than the size needed | ||
2338 | * for the TIME_EXTEND, then padding has to be used. The events | ||
2339 | * length must be either RB_LEN_TIME_EXTEND, or greater than or equal | ||
2340 | * to RB_LEN_TIME_EXTEND + 8, as 8 is the minimum size for padding. | ||
2341 | * As length is a multiple of 4, we only need to worry if it | ||
2342 | * is 12 (RB_LEN_TIME_EXTEND + 4). | ||
2430 | */ | 2343 | */ |
2431 | if (!tail) | 2344 | if (length == RB_LEN_TIME_EXTEND + RB_ALIGNMENT) |
2432 | tail_page->page->time_stamp = ts; | 2345 | length += RB_ALIGNMENT; |
2433 | 2346 | ||
2434 | /* account for these added bytes */ | 2347 | return length; |
2435 | local_add(length, &cpu_buffer->entries_bytes); | 2348 | } |
2436 | 2349 | ||
2437 | return event; | 2350 | #ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK |
2351 | static inline bool sched_clock_stable(void) | ||
2352 | { | ||
2353 | return true; | ||
2438 | } | 2354 | } |
2355 | #endif | ||
2439 | 2356 | ||
2440 | static inline int | 2357 | static inline int |
2441 | rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, | 2358 | rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, |
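
As the comment added to rb_calculate_event_length() above explains, the reserved size must now also be reusable as a TIME_EXTEND record if the event is later discarded, so a 12-byte result is bumped to 16. A quick worked example of the arithmetic, assuming the usual 32-bit alignment values (RB_EVNT_HDR_SIZE = 4, RB_ALIGNMENT = 4, RB_LEN_TIME_EXTEND = 8) and no forced 8-byte alignment; the demo_* helper is illustrative only:

#include <stdio.h>

#define RB_ALIGNMENT        4u
#define RB_EVNT_HDR_SIZE    4u
#define RB_LEN_TIME_EXTEND  8u
#define RB_MAX_SMALL_DATA   (RB_ALIGNMENT * 28)   /* 112 */
#define ALIGN_UP(x, a)      (((x) + (a) - 1) & ~((a) - 1))

static unsigned demo_event_length(unsigned payload)
{
        unsigned length = payload ? payload : 1;   /* zero length is confusing */

        if (length > RB_MAX_SMALL_DATA)
                length += sizeof(unsigned);        /* length moves to array[0] */

        length += RB_EVNT_HDR_SIZE;
        length = ALIGN_UP(length, RB_ALIGNMENT);

        /*
         * A 12-byte reservation cannot later hold an 8-byte TIME_EXTEND plus
         * a valid padding event (padding needs at least 8 bytes), so the only
         * problematic aligned size, RB_LEN_TIME_EXTEND + 4, is rounded up.
         */
        if (length == RB_LEN_TIME_EXTEND + RB_ALIGNMENT)
                length += RB_ALIGNMENT;

        return length;
}

int main(void)
{
        printf("payload  4 -> reserve %u\n", demo_event_length(4));   /* 8  */
        printf("payload  8 -> reserve %u\n", demo_event_length(8));   /* 16 */
        printf("payload 12 -> reserve %u\n", demo_event_length(12));  /* 16 */
        return 0;
}
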
@@ -2483,6 +2400,59 @@ static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer) | |||
2483 | local_inc(&cpu_buffer->commits); | 2400 | local_inc(&cpu_buffer->commits); |
2484 | } | 2401 | } |
2485 | 2402 | ||
2403 | static void | ||
2404 | rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) | ||
2405 | { | ||
2406 | unsigned long max_count; | ||
2407 | |||
2408 | /* | ||
2409 | * We only race with interrupts and NMIs on this CPU. | ||
2410 | * If we own the commit event, then we can commit | ||
2411 | * all others that interrupted us, since the interruptions | ||
2412 | * are in stack format (they finish before they come | ||
2413 | * back to us). This allows us to do a simple loop to | ||
2414 | * assign the commit to the tail. | ||
2415 | */ | ||
2416 | again: | ||
2417 | max_count = cpu_buffer->nr_pages * 100; | ||
2418 | |||
2419 | while (cpu_buffer->commit_page != cpu_buffer->tail_page) { | ||
2420 | if (RB_WARN_ON(cpu_buffer, !(--max_count))) | ||
2421 | return; | ||
2422 | if (RB_WARN_ON(cpu_buffer, | ||
2423 | rb_is_reader_page(cpu_buffer->tail_page))) | ||
2424 | return; | ||
2425 | local_set(&cpu_buffer->commit_page->page->commit, | ||
2426 | rb_page_write(cpu_buffer->commit_page)); | ||
2427 | rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); | ||
2428 | cpu_buffer->write_stamp = | ||
2429 | cpu_buffer->commit_page->page->time_stamp; | ||
2430 | /* add barrier to keep gcc from optimizing too much */ | ||
2431 | barrier(); | ||
2432 | } | ||
2433 | while (rb_commit_index(cpu_buffer) != | ||
2434 | rb_page_write(cpu_buffer->commit_page)) { | ||
2435 | |||
2436 | local_set(&cpu_buffer->commit_page->page->commit, | ||
2437 | rb_page_write(cpu_buffer->commit_page)); | ||
2438 | RB_WARN_ON(cpu_buffer, | ||
2439 | local_read(&cpu_buffer->commit_page->page->commit) & | ||
2440 | ~RB_WRITE_MASK); | ||
2441 | barrier(); | ||
2442 | } | ||
2443 | |||
2444 | /* again, keep gcc from optimizing */ | ||
2445 | barrier(); | ||
2446 | |||
2447 | /* | ||
2448 | * If an interrupt came in just after the first while loop | ||
2449 | * and pushed the tail page forward, we will be left with | ||
2450 | * a dangling commit that will never go forward. | ||
2451 | */ | ||
2452 | if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page)) | ||
2453 | goto again; | ||
2454 | } | ||
2455 | |||
2486 | static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) | 2456 | static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) |
2487 | { | 2457 | { |
2488 | unsigned long commits; | 2458 | unsigned long commits; |
@@ -2515,91 +2485,94 @@ static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) | |||
2515 | } | 2485 | } |
2516 | } | 2486 | } |
2517 | 2487 | ||
2518 | static struct ring_buffer_event * | 2488 | static inline void rb_event_discard(struct ring_buffer_event *event) |
2519 | rb_reserve_next_event(struct ring_buffer *buffer, | ||
2520 | struct ring_buffer_per_cpu *cpu_buffer, | ||
2521 | unsigned long length) | ||
2522 | { | 2489 | { |
2523 | struct ring_buffer_event *event; | 2490 | if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) |
2524 | u64 ts, delta; | 2491 | event = skip_time_extend(event); |
2525 | int nr_loops = 0; | ||
2526 | int add_timestamp; | ||
2527 | u64 diff; | ||
2528 | 2492 | ||
2529 | rb_start_commit(cpu_buffer); | 2493 | /* array[0] holds the actual length for the discarded event */ |
2494 | event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; | ||
2495 | event->type_len = RINGBUF_TYPE_PADDING; | ||
2496 | /* time delta must be non zero */ | ||
2497 | if (!event->time_delta) | ||
2498 | event->time_delta = 1; | ||
2499 | } | ||
2530 | 2500 | ||
2531 | #ifdef CONFIG_RING_BUFFER_ALLOW_SWAP | 2501 | static inline int |
2532 | /* | 2502 | rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, |
2533 | * Due to the ability to swap a cpu buffer from a buffer | 2503 | struct ring_buffer_event *event) |
2534 | * it is possible it was swapped before we committed. | 2504 | { |
2535 | * (committing stops a swap). We check for it here and | 2505 | unsigned long addr = (unsigned long)event; |
2536 | * if it happened, we have to fail the write. | 2506 | unsigned long index; |
2537 | */ | ||
2538 | barrier(); | ||
2539 | if (unlikely(ACCESS_ONCE(cpu_buffer->buffer) != buffer)) { | ||
2540 | local_dec(&cpu_buffer->committing); | ||
2541 | local_dec(&cpu_buffer->commits); | ||
2542 | return NULL; | ||
2543 | } | ||
2544 | #endif | ||
2545 | 2507 | ||
2546 | length = rb_calculate_event_length(length); | 2508 | index = rb_event_index(event); |
2547 | again: | 2509 | addr &= PAGE_MASK; |
2548 | add_timestamp = 0; | 2510 | |
2549 | delta = 0; | 2511 | return cpu_buffer->commit_page->page == (void *)addr && |
2512 | rb_commit_index(cpu_buffer) == index; | ||
2513 | } | ||
2514 | |||
2515 | static void | ||
2516 | rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, | ||
2517 | struct ring_buffer_event *event) | ||
2518 | { | ||
2519 | u64 delta; | ||
2550 | 2520 | ||
2551 | /* | 2521 | /* |
2552 | * We allow for interrupts to reenter here and do a trace. | 2522 | * The event first in the commit queue updates the |
2553 | * If one does, it will cause this original code to loop | 2523 | * time stamp. |
2554 | * back here. Even with heavy interrupts happening, this | ||
2555 | * should only happen a few times in a row. If this happens | ||
2556 | * 1000 times in a row, there must be either an interrupt | ||
2557 | * storm or we have something buggy. | ||
2558 | * Bail! | ||
2559 | */ | 2524 | */ |
2560 | if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) | 2525 | if (rb_event_is_commit(cpu_buffer, event)) { |
2561 | goto out_fail; | 2526 | /* |
2527 | * A commit event that is first on a page | ||
2528 | * updates the write timestamp with the page stamp | ||
2529 | */ | ||
2530 | if (!rb_event_index(event)) | ||
2531 | cpu_buffer->write_stamp = | ||
2532 | cpu_buffer->commit_page->page->time_stamp; | ||
2533 | else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) { | ||
2534 | delta = event->array[0]; | ||
2535 | delta <<= TS_SHIFT; | ||
2536 | delta += event->time_delta; | ||
2537 | cpu_buffer->write_stamp += delta; | ||
2538 | } else | ||
2539 | cpu_buffer->write_stamp += event->time_delta; | ||
2540 | } | ||
2541 | } | ||
2562 | 2542 | ||
2563 | ts = rb_time_stamp(cpu_buffer->buffer); | 2543 | static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, |
2564 | diff = ts - cpu_buffer->write_stamp; | 2544 | struct ring_buffer_event *event) |
2545 | { | ||
2546 | local_inc(&cpu_buffer->entries); | ||
2547 | rb_update_write_stamp(cpu_buffer, event); | ||
2548 | rb_end_commit(cpu_buffer); | ||
2549 | } | ||
2565 | 2550 | ||
2566 | /* make sure this diff is calculated here */ | 2551 | static __always_inline void |
2567 | barrier(); | 2552 | rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) |
2553 | { | ||
2554 | bool pagebusy; | ||
2568 | 2555 | ||
2569 | /* Did the write stamp get updated already? */ | 2556 | if (buffer->irq_work.waiters_pending) { |
2570 | if (likely(ts >= cpu_buffer->write_stamp)) { | 2557 | buffer->irq_work.waiters_pending = false; |
2571 | delta = diff; | 2558 | /* irq_work_queue() supplies it's own memory barriers */ |
2572 | if (unlikely(test_time_stamp(delta))) { | 2559 | irq_work_queue(&buffer->irq_work.work); |
2573 | int local_clock_stable = 1; | ||
2574 | #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK | ||
2575 | local_clock_stable = sched_clock_stable(); | ||
2576 | #endif | ||
2577 | WARN_ONCE(delta > (1ULL << 59), | ||
2578 | KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", | ||
2579 | (unsigned long long)delta, | ||
2580 | (unsigned long long)ts, | ||
2581 | (unsigned long long)cpu_buffer->write_stamp, | ||
2582 | local_clock_stable ? "" : | ||
2583 | "If you just came from a suspend/resume,\n" | ||
2584 | "please switch to the trace global clock:\n" | ||
2585 | " echo global > /sys/kernel/debug/tracing/trace_clock\n"); | ||
2586 | add_timestamp = 1; | ||
2587 | } | ||
2588 | } | 2560 | } |
2589 | 2561 | ||
2590 | event = __rb_reserve_next(cpu_buffer, length, ts, | 2562 | if (cpu_buffer->irq_work.waiters_pending) { |
2591 | delta, add_timestamp); | 2563 | cpu_buffer->irq_work.waiters_pending = false; |
2592 | if (unlikely(PTR_ERR(event) == -EAGAIN)) | 2564 | /* irq_work_queue() supplies it's own memory barriers */ |
2593 | goto again; | 2565 | irq_work_queue(&cpu_buffer->irq_work.work); |
2594 | 2566 | } | |
2595 | if (!event) | ||
2596 | goto out_fail; | ||
2597 | 2567 | ||
2598 | return event; | 2568 | pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; |
2599 | 2569 | ||
2600 | out_fail: | 2570 | if (!pagebusy && cpu_buffer->irq_work.full_waiters_pending) { |
2601 | rb_end_commit(cpu_buffer); | 2571 | cpu_buffer->irq_work.wakeup_full = true; |
2602 | return NULL; | 2572 | cpu_buffer->irq_work.full_waiters_pending = false; |
2573 | /* irq_work_queue() supplies its own memory barriers */ | ||
2574 | irq_work_queue(&cpu_buffer->irq_work.work); | ||
2575 | } | ||
2603 | } | 2576 | } |
2604 | 2577 | ||
2605 | /* | 2578 | /* |
@@ -2672,6 +2645,178 @@ trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer) | |||
2672 | } | 2645 | } |
2673 | 2646 | ||
2674 | /** | 2647 | /** |
2648 | * ring_buffer_unlock_commit - commit a reserved | ||
2649 | * @buffer: The buffer to commit to | ||
2650 | * @event: The event pointer to commit. | ||
2651 | * | ||
2652 | * This commits the data to the ring buffer, and releases any locks held. | ||
2653 | * | ||
2654 | * Must be paired with ring_buffer_lock_reserve. | ||
2655 | */ | ||
2656 | int ring_buffer_unlock_commit(struct ring_buffer *buffer, | ||
2657 | struct ring_buffer_event *event) | ||
2658 | { | ||
2659 | struct ring_buffer_per_cpu *cpu_buffer; | ||
2660 | int cpu = raw_smp_processor_id(); | ||
2661 | |||
2662 | cpu_buffer = buffer->buffers[cpu]; | ||
2663 | |||
2664 | rb_commit(cpu_buffer, event); | ||
2665 | |||
2666 | rb_wakeups(buffer, cpu_buffer); | ||
2667 | |||
2668 | trace_recursive_unlock(cpu_buffer); | ||
2669 | |||
2670 | preempt_enable_notrace(); | ||
2671 | |||
2672 | return 0; | ||
2673 | } | ||
2674 | EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); | ||
2675 | |||
2676 | static noinline void | ||
2677 | rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer, | ||
2678 | struct rb_event_info *info) | ||
2679 | { | ||
2680 | WARN_ONCE(info->delta > (1ULL << 59), | ||
2681 | KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", | ||
2682 | (unsigned long long)info->delta, | ||
2683 | (unsigned long long)info->ts, | ||
2684 | (unsigned long long)cpu_buffer->write_stamp, | ||
2685 | sched_clock_stable() ? "" : | ||
2686 | "If you just came from a suspend/resume,\n" | ||
2687 | "please switch to the trace global clock:\n" | ||
2688 | " echo global > /sys/kernel/debug/tracing/trace_clock\n"); | ||
2689 | info->add_timestamp = 1; | ||
2690 | } | ||
2691 | |||
2692 | static struct ring_buffer_event * | ||
2693 | __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, | ||
2694 | struct rb_event_info *info) | ||
2695 | { | ||
2696 | struct ring_buffer_event *event; | ||
2697 | struct buffer_page *tail_page; | ||
2698 | unsigned long tail, write; | ||
2699 | |||
2700 | /* | ||
2701 | * If the time delta since the last event is too big to | ||
2702 | * hold in the time field of the event, then we append a | ||
2703 | * TIME EXTEND event ahead of the data event. | ||
2704 | */ | ||
2705 | if (unlikely(info->add_timestamp)) | ||
2706 | info->length += RB_LEN_TIME_EXTEND; | ||
2707 | |||
2708 | tail_page = info->tail_page = cpu_buffer->tail_page; | ||
2709 | write = local_add_return(info->length, &tail_page->write); | ||
2710 | |||
2711 | /* set write to only the index of the write */ | ||
2712 | write &= RB_WRITE_MASK; | ||
2713 | tail = write - info->length; | ||
2714 | |||
2715 | /* | ||
2716 | * If this is the first commit on the page, then it has the same | ||
2717 | * timestamp as the page itself. | ||
2718 | */ | ||
2719 | if (!tail) | ||
2720 | info->delta = 0; | ||
2721 | |||
2722 | /* See if we shot past the end of this buffer page */ | ||
2723 | if (unlikely(write > BUF_PAGE_SIZE)) | ||
2724 | return rb_move_tail(cpu_buffer, tail, info); | ||
2725 | |||
2726 | /* We reserved something on the buffer */ | ||
2727 | |||
2728 | event = __rb_page_index(tail_page, tail); | ||
2729 | kmemcheck_annotate_bitfield(event, bitfield); | ||
2730 | rb_update_event(cpu_buffer, event, info); | ||
2731 | |||
2732 | local_inc(&tail_page->entries); | ||
2733 | |||
2734 | /* | ||
2735 | * If this is the first commit on the page, then update | ||
2736 | * its timestamp. | ||
2737 | */ | ||
2738 | if (!tail) | ||
2739 | tail_page->page->time_stamp = info->ts; | ||
2740 | |||
2741 | /* account for these added bytes */ | ||
2742 | local_add(info->length, &cpu_buffer->entries_bytes); | ||
2743 | |||
2744 | return event; | ||
2745 | } | ||
2746 | |||
2747 | static struct ring_buffer_event * | ||
2748 | rb_reserve_next_event(struct ring_buffer *buffer, | ||
2749 | struct ring_buffer_per_cpu *cpu_buffer, | ||
2750 | unsigned long length) | ||
2751 | { | ||
2752 | struct ring_buffer_event *event; | ||
2753 | struct rb_event_info info; | ||
2754 | int nr_loops = 0; | ||
2755 | u64 diff; | ||
2756 | |||
2757 | rb_start_commit(cpu_buffer); | ||
2758 | |||
2759 | #ifdef CONFIG_RING_BUFFER_ALLOW_SWAP | ||
2760 | /* | ||
2761 | * Due to the ability to swap a cpu buffer from a buffer | ||
2762 | * it is possible it was swapped before we committed. | ||
2763 | * (committing stops a swap). We check for it here and | ||
2764 | * if it happened, we have to fail the write. | ||
2765 | */ | ||
2766 | barrier(); | ||
2767 | if (unlikely(ACCESS_ONCE(cpu_buffer->buffer) != buffer)) { | ||
2768 | local_dec(&cpu_buffer->committing); | ||
2769 | local_dec(&cpu_buffer->commits); | ||
2770 | return NULL; | ||
2771 | } | ||
2772 | #endif | ||
2773 | |||
2774 | info.length = rb_calculate_event_length(length); | ||
2775 | again: | ||
2776 | info.add_timestamp = 0; | ||
2777 | info.delta = 0; | ||
2778 | |||
2779 | /* | ||
2780 | * We allow for interrupts to reenter here and do a trace. | ||
2781 | * If one does, it will cause this original code to loop | ||
2782 | * back here. Even with heavy interrupts happening, this | ||
2783 | * should only happen a few times in a row. If this happens | ||
2784 | * 1000 times in a row, there must be either an interrupt | ||
2785 | * storm or we have something buggy. | ||
2786 | * Bail! | ||
2787 | */ | ||
2788 | if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) | ||
2789 | goto out_fail; | ||
2790 | |||
2791 | info.ts = rb_time_stamp(cpu_buffer->buffer); | ||
2792 | diff = info.ts - cpu_buffer->write_stamp; | ||
2793 | |||
2794 | /* make sure this diff is calculated here */ | ||
2795 | barrier(); | ||
2796 | |||
2797 | /* Did the write stamp get updated already? */ | ||
2798 | if (likely(info.ts >= cpu_buffer->write_stamp)) { | ||
2799 | info.delta = diff; | ||
2800 | if (unlikely(test_time_stamp(info.delta))) | ||
2801 | rb_handle_timestamp(cpu_buffer, &info); | ||
2802 | } | ||
2803 | |||
2804 | event = __rb_reserve_next(cpu_buffer, &info); | ||
2805 | |||
2806 | if (unlikely(PTR_ERR(event) == -EAGAIN)) | ||
2807 | goto again; | ||
2808 | |||
2809 | if (!event) | ||
2810 | goto out_fail; | ||
2811 | |||
2812 | return event; | ||
2813 | |||
2814 | out_fail: | ||
2815 | rb_end_commit(cpu_buffer); | ||
2816 | return NULL; | ||
2817 | } | ||
2818 | |||
2819 | /** | ||
2675 | * ring_buffer_lock_reserve - reserve a part of the buffer | 2820 | * ring_buffer_lock_reserve - reserve a part of the buffer |
2676 | * @buffer: the ring buffer to reserve from | 2821 | * @buffer: the ring buffer to reserve from |
2677 | * @length: the length of the data to reserve (excluding event header) | 2822 | * @length: the length of the data to reserve (excluding event header) |
@@ -2729,111 +2874,6 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) | |||
2729 | } | 2874 | } |
2730 | EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); | 2875 | EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); |
2731 | 2876 | ||
2732 | static void | ||
2733 | rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, | ||
2734 | struct ring_buffer_event *event) | ||
2735 | { | ||
2736 | u64 delta; | ||
2737 | |||
2738 | /* | ||
2739 | * The event first in the commit queue updates the | ||
2740 | * time stamp. | ||
2741 | */ | ||
2742 | if (rb_event_is_commit(cpu_buffer, event)) { | ||
2743 | /* | ||
2744 | * A commit event that is first on a page | ||
2745 | * updates the write timestamp with the page stamp | ||
2746 | */ | ||
2747 | if (!rb_event_index(event)) | ||
2748 | cpu_buffer->write_stamp = | ||
2749 | cpu_buffer->commit_page->page->time_stamp; | ||
2750 | else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) { | ||
2751 | delta = event->array[0]; | ||
2752 | delta <<= TS_SHIFT; | ||
2753 | delta += event->time_delta; | ||
2754 | cpu_buffer->write_stamp += delta; | ||
2755 | } else | ||
2756 | cpu_buffer->write_stamp += event->time_delta; | ||
2757 | } | ||
2758 | } | ||
2759 | |||
2760 | static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, | ||
2761 | struct ring_buffer_event *event) | ||
2762 | { | ||
2763 | local_inc(&cpu_buffer->entries); | ||
2764 | rb_update_write_stamp(cpu_buffer, event); | ||
2765 | rb_end_commit(cpu_buffer); | ||
2766 | } | ||
2767 | |||
2768 | static __always_inline void | ||
2769 | rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) | ||
2770 | { | ||
2771 | bool pagebusy; | ||
2772 | |||
2773 | if (buffer->irq_work.waiters_pending) { | ||
2774 | buffer->irq_work.waiters_pending = false; | ||
2775 | /* irq_work_queue() supplies its own memory barriers */ | ||
2776 | irq_work_queue(&buffer->irq_work.work); | ||
2777 | } | ||
2778 | |||
2779 | if (cpu_buffer->irq_work.waiters_pending) { | ||
2780 | cpu_buffer->irq_work.waiters_pending = false; | ||
2781 | /* irq_work_queue() supplies its own memory barriers */ | ||
2782 | irq_work_queue(&cpu_buffer->irq_work.work); | ||
2783 | } | ||
2784 | |||
2785 | pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; | ||
2786 | |||
2787 | if (!pagebusy && cpu_buffer->irq_work.full_waiters_pending) { | ||
2788 | cpu_buffer->irq_work.wakeup_full = true; | ||
2789 | cpu_buffer->irq_work.full_waiters_pending = false; | ||
2790 | /* irq_work_queue() supplies its own memory barriers */ | ||
2791 | irq_work_queue(&cpu_buffer->irq_work.work); | ||
2792 | } | ||
2793 | } | ||
2794 | |||
2795 | /** | ||
2796 | * ring_buffer_unlock_commit - commit a reserved event | ||
2797 | * @buffer: The buffer to commit to | ||
2798 | * @event: The event pointer to commit. | ||
2799 | * | ||
2800 | * This commits the data to the ring buffer, and releases any locks held. | ||
2801 | * | ||
2802 | * Must be paired with ring_buffer_lock_reserve. | ||
2803 | */ | ||
2804 | int ring_buffer_unlock_commit(struct ring_buffer *buffer, | ||
2805 | struct ring_buffer_event *event) | ||
2806 | { | ||
2807 | struct ring_buffer_per_cpu *cpu_buffer; | ||
2808 | int cpu = raw_smp_processor_id(); | ||
2809 | |||
2810 | cpu_buffer = buffer->buffers[cpu]; | ||
2811 | |||
2812 | rb_commit(cpu_buffer, event); | ||
2813 | |||
2814 | rb_wakeups(buffer, cpu_buffer); | ||
2815 | |||
2816 | trace_recursive_unlock(cpu_buffer); | ||
2817 | |||
2818 | preempt_enable_notrace(); | ||
2819 | |||
2820 | return 0; | ||
2821 | } | ||
2822 | EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); | ||
2823 | |||
2824 | static inline void rb_event_discard(struct ring_buffer_event *event) | ||
2825 | { | ||
2826 | if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) | ||
2827 | event = skip_time_extend(event); | ||
2828 | |||
2829 | /* array[0] holds the actual length for the discarded event */ | ||
2830 | event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; | ||
2831 | event->type_len = RINGBUF_TYPE_PADDING; | ||
2832 | /* time delta must be non zero */ | ||
2833 | if (!event->time_delta) | ||
2834 | event->time_delta = 1; | ||
2835 | } | ||
2836 | |||
2837 | /* | 2877 | /* |
2838 | * Decrement the entries to the page that an event is on. | 2878 | * Decrement the entries to the page that an event is on. |
2839 | * The event does not even need to exist, only the pointer | 2879 | * The event does not even need to exist, only the pointer |
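The ring_buffer.c hunks above fold the old inline timestamp handling into rb_handle_timestamp(), pass the reservation state around in struct rb_event_info, and move the commit-side helpers (rb_commit(), rb_wakeups(), ring_buffer_unlock_commit()) ahead of the reserve path. For orientation, a minimal producer-side sketch of the reserve/commit pairing those helpers serve; write_sample() and its error handling are illustrative, and only the three ring_buffer_* calls come from the API implemented in this file.

#include <linux/ring_buffer.h>

/* Illustrative only: write one small record through the 4.3-era
 * reserve/commit API implemented above. */
static int write_sample(struct ring_buffer *buffer, u32 value)
{
	struct ring_buffer_event *event;
	u32 *payload;

	event = ring_buffer_lock_reserve(buffer, sizeof(*payload));
	if (!event)
		return -EBUSY;		/* buffer full or writers disabled */

	payload = ring_buffer_event_data(event);
	*payload = value;

	/* Pairs with the reserve above; ends up in rb_commit() and
	 * rb_wakeups() from the hunks shown here. */
	return ring_buffer_unlock_commit(buffer, event);
}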
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index abcbf7ff8743..6e79408674aa 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
@@ -3035,7 +3035,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) | |||
3035 | if (!iter) | 3035 | if (!iter) |
3036 | return ERR_PTR(-ENOMEM); | 3036 | return ERR_PTR(-ENOMEM); |
3037 | 3037 | ||
3038 | iter->buffer_iter = kzalloc(sizeof(*iter->buffer_iter) * num_possible_cpus(), | 3038 | iter->buffer_iter = kcalloc(nr_cpu_ids, sizeof(*iter->buffer_iter), |
3039 | GFP_KERNEL); | 3039 | GFP_KERNEL); |
3040 | if (!iter->buffer_iter) | 3040 | if (!iter->buffer_iter) |
3041 | goto release; | 3041 | goto release; |
@@ -6990,7 +6990,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) | |||
6990 | trace_init_global_iter(&iter); | 6990 | trace_init_global_iter(&iter); |
6991 | 6991 | ||
6992 | for_each_tracing_cpu(cpu) { | 6992 | for_each_tracing_cpu(cpu) { |
6993 | atomic_inc(&per_cpu_ptr(iter.tr->trace_buffer.data, cpu)->disabled); | 6993 | atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); |
6994 | } | 6994 | } |
6995 | 6995 | ||
6996 | old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ; | 6996 | old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ; |
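Two small trace.c fixes sit above: the iterator array is now sized by nr_cpu_ids and allocated with kcalloc(), and ftrace_dump() disables the per-cpu data through the iterator's own trace_buffer pointer. The kcalloc() switch is the general overflow-safe allocation pattern; a hedged sketch with an illustrative struct name:

#include <linux/slab.h>

struct item {
	int val;
};

/* kcalloc() zeroes the memory and returns NULL if n * sizeof(struct item)
 * would overflow, where an open-coded kzalloc(n * size, ...) silently
 * wraps around instead. */
static struct item *alloc_items(size_t n)
{
	return kcalloc(n, sizeof(struct item), GFP_KERNEL);
}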
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f060716b02ae..74bde81601a9 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
@@ -444,6 +444,7 @@ enum { | |||
444 | 444 | ||
445 | TRACE_CONTROL_BIT, | 445 | TRACE_CONTROL_BIT, |
446 | 446 | ||
447 | TRACE_BRANCH_BIT, | ||
447 | /* | 448 | /* |
448 | * Abuse of the trace_recursion. | 449 | * Abuse of the trace_recursion. |
449 | * As we need a way to maintain state if we are tracing the function | 450 | * As we need a way to maintain state if we are tracing the function |
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index a87b43f49eb4..e2e12ad3186f 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c | |||
@@ -36,9 +36,12 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) | |||
36 | struct trace_branch *entry; | 36 | struct trace_branch *entry; |
37 | struct ring_buffer *buffer; | 37 | struct ring_buffer *buffer; |
38 | unsigned long flags; | 38 | unsigned long flags; |
39 | int cpu, pc; | 39 | int pc; |
40 | const char *p; | 40 | const char *p; |
41 | 41 | ||
42 | if (current->trace_recursion & TRACE_BRANCH_BIT) | ||
43 | return; | ||
44 | |||
42 | /* | 45 | /* |
43 | * I would love to save just the ftrace_likely_data pointer, but | 46 | * I would love to save just the ftrace_likely_data pointer, but |
44 | * this code can also be used by modules. Ugly things can happen | 47 | * this code can also be used by modules. Ugly things can happen |
@@ -49,10 +52,10 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) | |||
49 | if (unlikely(!tr)) | 52 | if (unlikely(!tr)) |
50 | return; | 53 | return; |
51 | 54 | ||
52 | local_irq_save(flags); | 55 | raw_local_irq_save(flags); |
53 | cpu = raw_smp_processor_id(); | 56 | current->trace_recursion |= TRACE_BRANCH_BIT; |
54 | data = per_cpu_ptr(tr->trace_buffer.data, cpu); | 57 | data = this_cpu_ptr(tr->trace_buffer.data); |
55 | if (atomic_inc_return(&data->disabled) != 1) | 58 | if (atomic_read(&data->disabled)) |
56 | goto out; | 59 | goto out; |
57 | 60 | ||
58 | pc = preempt_count(); | 61 | pc = preempt_count(); |
@@ -81,8 +84,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) | |||
81 | __buffer_unlock_commit(buffer, event); | 84 | __buffer_unlock_commit(buffer, event); |
82 | 85 | ||
83 | out: | 86 | out: |
84 | atomic_dec(&data->disabled); | 87 | current->trace_recursion &= ~TRACE_BRANCH_BIT; |
85 | local_irq_restore(flags); | 88 | raw_local_irq_restore(flags); |
86 | } | 89 | } |
87 | 90 | ||
88 | static inline | 91 | static inline |
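The trace_branch.c change above swaps the per-cpu atomic disable counter for a per-task recursion bit (TRACE_BRANCH_BIT) and raw IRQ save/restore, so a branch probe that itself triggers tracing bails out instead of recursing. A minimal sketch of that guard shape, using a hypothetical flag word in place of current->trace_recursion:

/* Hypothetical re-entrancy guard modelled on the TRACE_BRANCH_BIT use
 * above: the first caller sets the bit, nested callers return early. */
#define MY_BRANCH_BIT	(1U << 0)

static unsigned long my_recursion;	/* per-task in the real code */

static void probe_once(void)
{
	if (my_recursion & MY_BRANCH_BIT)
		return;			/* already inside the probe */

	my_recursion |= MY_BRANCH_BIT;
	/* ... record the event; anything traced from here re-enters
	 * probe_once() and returns immediately ... */
	my_recursion &= ~MY_BRANCH_BIT;
}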
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 404a372ad85a..7ca09cdc20c2 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
@@ -30,6 +30,7 @@ | |||
30 | DEFINE_MUTEX(event_mutex); | 30 | DEFINE_MUTEX(event_mutex); |
31 | 31 | ||
32 | LIST_HEAD(ftrace_events); | 32 | LIST_HEAD(ftrace_events); |
33 | static LIST_HEAD(ftrace_generic_fields); | ||
33 | static LIST_HEAD(ftrace_common_fields); | 34 | static LIST_HEAD(ftrace_common_fields); |
34 | 35 | ||
35 | #define GFP_TRACE (GFP_KERNEL | __GFP_ZERO) | 36 | #define GFP_TRACE (GFP_KERNEL | __GFP_ZERO) |
@@ -94,6 +95,10 @@ trace_find_event_field(struct trace_event_call *call, char *name) | |||
94 | struct ftrace_event_field *field; | 95 | struct ftrace_event_field *field; |
95 | struct list_head *head; | 96 | struct list_head *head; |
96 | 97 | ||
98 | field = __find_event_field(&ftrace_generic_fields, name); | ||
99 | if (field) | ||
100 | return field; | ||
101 | |||
97 | field = __find_event_field(&ftrace_common_fields, name); | 102 | field = __find_event_field(&ftrace_common_fields, name); |
98 | if (field) | 103 | if (field) |
99 | return field; | 104 | return field; |
@@ -144,6 +149,13 @@ int trace_define_field(struct trace_event_call *call, const char *type, | |||
144 | } | 149 | } |
145 | EXPORT_SYMBOL_GPL(trace_define_field); | 150 | EXPORT_SYMBOL_GPL(trace_define_field); |
146 | 151 | ||
152 | #define __generic_field(type, item, filter_type) \ | ||
153 | ret = __trace_define_field(&ftrace_generic_fields, #type, \ | ||
154 | #item, 0, 0, is_signed_type(type), \ | ||
155 | filter_type); \ | ||
156 | if (ret) \ | ||
157 | return ret; | ||
158 | |||
147 | #define __common_field(type, item) \ | 159 | #define __common_field(type, item) \ |
148 | ret = __trace_define_field(&ftrace_common_fields, #type, \ | 160 | ret = __trace_define_field(&ftrace_common_fields, #type, \ |
149 | "common_" #item, \ | 161 | "common_" #item, \ |
@@ -153,6 +165,16 @@ EXPORT_SYMBOL_GPL(trace_define_field); | |||
153 | if (ret) \ | 165 | if (ret) \ |
154 | return ret; | 166 | return ret; |
155 | 167 | ||
168 | static int trace_define_generic_fields(void) | ||
169 | { | ||
170 | int ret; | ||
171 | |||
172 | __generic_field(int, cpu, FILTER_OTHER); | ||
173 | __generic_field(char *, comm, FILTER_PTR_STRING); | ||
174 | |||
175 | return ret; | ||
176 | } | ||
177 | |||
156 | static int trace_define_common_fields(void) | 178 | static int trace_define_common_fields(void) |
157 | { | 179 | { |
158 | int ret; | 180 | int ret; |
@@ -2671,6 +2693,9 @@ static __init int event_trace_init(void) | |||
2671 | if (!entry) | 2693 | if (!entry) |
2672 | pr_warn("Could not create tracefs 'available_events' entry\n"); | 2694 | pr_warn("Could not create tracefs 'available_events' entry\n"); |
2673 | 2695 | ||
2696 | if (trace_define_generic_fields()) | ||
2697 | pr_warn("tracing: Failed to allocated generic fields"); | ||
2698 | |||
2674 | if (trace_define_common_fields()) | 2699 | if (trace_define_common_fields()) |
2675 | pr_warn("tracing: Failed to allocate common fields"); | 2700 | pr_warn("tracing: Failed to allocate common fields"); |
2676 | 2701 | ||
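The new ftrace_generic_fields list above registers pseudo-fields ("cpu" and "comm") that carry no storage in any event record but can still be referenced by filters. Expanded by hand, the first __generic_field() invocation is roughly the call below; this is just the stringified macro body from the hunk, not new API:

/* Approximate expansion of __generic_field(int, cpu, FILTER_OTHER):
 * a zero-size, zero-offset entry on the generic list, so a filter can
 * name "cpu" even though no event struct stores it. */
ret = __trace_define_field(&ftrace_generic_fields, "int", "cpu",
			   0, 0, is_signed_type(int), FILTER_OTHER);
if (ret)
	return ret;

Together with the predicate changes further down, this lets a tracefs filter such as cpu == 1 && comm == "bash" be accepted for any event.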
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index d81d6f302b14..bd1bf184c5c9 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c | |||
@@ -252,6 +252,50 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event) | |||
252 | return match; | 252 | return match; |
253 | } | 253 | } |
254 | 254 | ||
255 | /* Filter predicate for CPUs. */ | ||
256 | static int filter_pred_cpu(struct filter_pred *pred, void *event) | ||
257 | { | ||
258 | int cpu, cmp; | ||
259 | int match = 0; | ||
260 | |||
261 | cpu = raw_smp_processor_id(); | ||
262 | cmp = pred->val; | ||
263 | |||
264 | switch (pred->op) { | ||
265 | case OP_EQ: | ||
266 | match = cpu == cmp; | ||
267 | break; | ||
268 | case OP_LT: | ||
269 | match = cpu < cmp; | ||
270 | break; | ||
271 | case OP_LE: | ||
272 | match = cpu <= cmp; | ||
273 | break; | ||
274 | case OP_GT: | ||
275 | match = cpu > cmp; | ||
276 | break; | ||
277 | case OP_GE: | ||
278 | match = cpu >= cmp; | ||
279 | break; | ||
280 | default: | ||
281 | break; | ||
282 | } | ||
283 | |||
284 | return !!match == !pred->not; | ||
285 | } | ||
286 | |||
287 | /* Filter predicate for COMM. */ | ||
288 | static int filter_pred_comm(struct filter_pred *pred, void *event) | ||
289 | { | ||
290 | int cmp, match; | ||
291 | |||
292 | cmp = pred->regex.match(current->comm, &pred->regex, | ||
293 | pred->regex.field_len); | ||
294 | match = cmp ^ pred->not; | ||
295 | |||
296 | return match; | ||
297 | } | ||
298 | |||
255 | static int filter_pred_none(struct filter_pred *pred, void *event) | 299 | static int filter_pred_none(struct filter_pred *pred, void *event) |
256 | { | 300 | { |
257 | return 0; | 301 | return 0; |
@@ -1002,7 +1046,10 @@ static int init_pred(struct filter_parse_state *ps, | |||
1002 | if (is_string_field(field)) { | 1046 | if (is_string_field(field)) { |
1003 | filter_build_regex(pred); | 1047 | filter_build_regex(pred); |
1004 | 1048 | ||
1005 | if (field->filter_type == FILTER_STATIC_STRING) { | 1049 | if (!strcmp(field->name, "comm")) { |
1050 | fn = filter_pred_comm; | ||
1051 | pred->regex.field_len = TASK_COMM_LEN; | ||
1052 | } else if (field->filter_type == FILTER_STATIC_STRING) { | ||
1006 | fn = filter_pred_string; | 1053 | fn = filter_pred_string; |
1007 | pred->regex.field_len = field->size; | 1054 | pred->regex.field_len = field->size; |
1008 | } else if (field->filter_type == FILTER_DYN_STRING) | 1055 | } else if (field->filter_type == FILTER_DYN_STRING) |
@@ -1025,7 +1072,10 @@ static int init_pred(struct filter_parse_state *ps, | |||
1025 | } | 1072 | } |
1026 | pred->val = val; | 1073 | pred->val = val; |
1027 | 1074 | ||
1028 | fn = select_comparison_fn(pred->op, field->size, | 1075 | if (!strcmp(field->name, "cpu")) |
1076 | fn = filter_pred_cpu; | ||
1077 | else | ||
1078 | fn = select_comparison_fn(pred->op, field->size, | ||
1029 | field->is_signed); | 1079 | field->is_signed); |
1030 | if (!fn) { | 1080 | if (!fn) { |
1031 | parse_error(ps, FILT_ERR_INVALID_OP, 0); | 1081 | parse_error(ps, FILT_ERR_INVALID_OP, 0); |
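filter_pred_cpu() above compares against the CPU executing the event rather than any recorded field, and its return expression "!!match == !pred->not" folds the optional negation into the result. A tiny, self-contained demonstration of that expression (userspace C, purely illustrative):

#include <stdio.h>

/* Mirrors the return expression of filter_pred_cpu(): normalize the
 * comparison result to 0/1 and compare it with the negated sense. */
static int pred_result(int match, int negated)
{
	return !!match == !negated;
}

int main(void)
{
	printf("%d %d %d %d\n",
	       pred_result(1, 0),	/* matched, plain    -> 1 */
	       pred_result(0, 0),	/* no match, plain   -> 0 */
	       pred_result(1, 1),	/* matched, negated  -> 0 */
	       pred_result(0, 1));	/* no match, negated -> 1 */
	return 0;
}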
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 8968bf720c12..ca98445782ac 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c | |||
@@ -715,13 +715,13 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) | |||
715 | 715 | ||
716 | snprintf(nsecs_str, slen, "%03lu", nsecs_rem); | 716 | snprintf(nsecs_str, slen, "%03lu", nsecs_rem); |
717 | trace_seq_printf(s, ".%s", nsecs_str); | 717 | trace_seq_printf(s, ".%s", nsecs_str); |
718 | len += strlen(nsecs_str); | 718 | len += strlen(nsecs_str) + 1; |
719 | } | 719 | } |
720 | 720 | ||
721 | trace_seq_puts(s, " us "); | 721 | trace_seq_puts(s, " us "); |
722 | 722 | ||
723 | /* Print remaining spaces to fit the row's width */ | 723 | /* Print remaining spaces to fit the row's width */ |
724 | for (i = len; i < 7; i++) | 724 | for (i = len; i < 8; i++) |
725 | trace_seq_putc(s, ' '); | 725 | trace_seq_putc(s, ' '); |
726 | } | 726 | } |
727 | 727 | ||
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index b7d0cdd9906c..c9956440d0e6 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
@@ -165,11 +165,9 @@ DEFINE_BASIC_FETCH_FUNCS(memory) | |||
165 | static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, | 165 | static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, |
166 | void *addr, void *dest) | 166 | void *addr, void *dest) |
167 | { | 167 | { |
168 | long ret; | ||
169 | int maxlen = get_rloc_len(*(u32 *)dest); | 168 | int maxlen = get_rloc_len(*(u32 *)dest); |
170 | u8 *dst = get_rloc_data(dest); | 169 | u8 *dst = get_rloc_data(dest); |
171 | u8 *src = addr; | 170 | long ret; |
172 | mm_segment_t old_fs = get_fs(); | ||
173 | 171 | ||
174 | if (!maxlen) | 172 | if (!maxlen) |
175 | return; | 173 | return; |
@@ -178,23 +176,13 @@ static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, | |||
178 | * Try to get string again, since the string can be changed while | 176 | * Try to get string again, since the string can be changed while |
179 | * probing. | 177 | * probing. |
180 | */ | 178 | */ |
181 | set_fs(KERNEL_DS); | 179 | ret = strncpy_from_unsafe(dst, addr, maxlen); |
182 | pagefault_disable(); | ||
183 | |||
184 | do | ||
185 | ret = __copy_from_user_inatomic(dst++, src++, 1); | ||
186 | while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen); | ||
187 | |||
188 | dst[-1] = '\0'; | ||
189 | pagefault_enable(); | ||
190 | set_fs(old_fs); | ||
191 | 180 | ||
192 | if (ret < 0) { /* Failed to fetch string */ | 181 | if (ret < 0) { /* Failed to fetch string */ |
193 | ((u8 *)get_rloc_data(dest))[0] = '\0'; | 182 | dst[0] = '\0'; |
194 | *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest)); | 183 | *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest)); |
195 | } else { | 184 | } else { |
196 | *(u32 *)dest = make_data_rloc(src - (u8 *)addr, | 185 | *(u32 *)dest = make_data_rloc(ret, get_rloc_offs(*(u32 *)dest)); |
197 | get_rloc_offs(*(u32 *)dest)); | ||
198 | } | 186 | } |
199 | } | 187 | } |
200 | NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string)); | 188 | NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string)); |
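The kprobe string fetcher above drops the open-coded set_fs()/__copy_from_user_inatomic() loop in favour of a single strncpy_from_unsafe() call; judging by how the result feeds make_data_rloc(), a non-negative return is the number of bytes stored including the terminating NUL, and a negative return means the access faulted. A hedged sketch of the call pattern, with a hypothetical helper and buffer:

#include <linux/uaccess.h>

/* Illustrative helper: fetch a kernel string that may fault, the way
 * the fetcher above now does.  'addr' is whatever the probe resolved. */
static long fetch_string(void *addr, char *dst, long maxlen)
{
	long ret;

	ret = strncpy_from_unsafe(dst, addr, maxlen);
	if (ret < 0)		/* faulted: report an empty string */
		dst[0] = '\0';

	return ret;
}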
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index dfab253727dc..8e481a84aeea 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c | |||
@@ -496,6 +496,8 @@ static const struct trace_mark { | |||
496 | char sym; | 496 | char sym; |
497 | } mark[] = { | 497 | } mark[] = { |
498 | MARK(1000000000ULL , '$'), /* 1 sec */ | 498 | MARK(1000000000ULL , '$'), /* 1 sec */ |
499 | MARK(100000000ULL , '@'), /* 100 msec */ | ||
500 | MARK(10000000ULL , '*'), /* 10 msec */ | ||
499 | MARK(1000000ULL , '#'), /* 1000 usecs */ | 501 | MARK(1000000ULL , '#'), /* 1000 usecs */ |
500 | MARK(100000ULL , '!'), /* 100 usecs */ | 502 | MARK(100000ULL , '!'), /* 100 usecs */ |
501 | MARK(10000ULL , '+'), /* 10 usecs */ | 503 | MARK(10000ULL , '+'), /* 10 usecs */ |
@@ -508,7 +510,7 @@ char trace_find_mark(unsigned long long d) | |||
508 | int size = ARRAY_SIZE(mark); | 510 | int size = ARRAY_SIZE(mark); |
509 | 511 | ||
510 | for (i = 0; i < size; i++) { | 512 | for (i = 0; i < size; i++) { |
511 | if (d >= mark[i].val) | 513 | if (d > mark[i].val) |
512 | break; | 514 | break; |
513 | } | 515 | } |
514 | 516 | ||
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 419ca37e72c9..f270088e9929 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c | |||
@@ -26,7 +26,7 @@ probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *n | |||
26 | } | 26 | } |
27 | 27 | ||
28 | static void | 28 | static void |
29 | probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success) | 29 | probe_sched_wakeup(void *ignore, struct task_struct *wakee) |
30 | { | 30 | { |
31 | if (unlikely(!sched_ref)) | 31 | if (unlikely(!sched_ref)) |
32 | return; | 32 | return; |
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 9b33dd117f3f..12cbe77b4136 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c | |||
@@ -514,7 +514,7 @@ static void wakeup_reset(struct trace_array *tr) | |||
514 | } | 514 | } |
515 | 515 | ||
516 | static void | 516 | static void |
517 | probe_wakeup(void *ignore, struct task_struct *p, int success) | 517 | probe_wakeup(void *ignore, struct task_struct *p) |
518 | { | 518 | { |
519 | struct trace_array_cpu *data; | 519 | struct trace_array_cpu *data; |
520 | int cpu = smp_processor_id(); | 520 | int cpu = smp_processor_id(); |
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 3f34496244e9..b746399ab59c 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c | |||
@@ -18,12 +18,6 @@ | |||
18 | 18 | ||
19 | #define STACK_TRACE_ENTRIES 500 | 19 | #define STACK_TRACE_ENTRIES 500 |
20 | 20 | ||
21 | #ifdef CC_USING_FENTRY | ||
22 | # define fentry 1 | ||
23 | #else | ||
24 | # define fentry 0 | ||
25 | #endif | ||
26 | |||
27 | static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] = | 21 | static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] = |
28 | { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX }; | 22 | { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX }; |
29 | static unsigned stack_dump_index[STACK_TRACE_ENTRIES]; | 23 | static unsigned stack_dump_index[STACK_TRACE_ENTRIES]; |
@@ -35,7 +29,7 @@ static unsigned stack_dump_index[STACK_TRACE_ENTRIES]; | |||
35 | */ | 29 | */ |
36 | static struct stack_trace max_stack_trace = { | 30 | static struct stack_trace max_stack_trace = { |
37 | .max_entries = STACK_TRACE_ENTRIES - 1, | 31 | .max_entries = STACK_TRACE_ENTRIES - 1, |
38 | .entries = &stack_dump_trace[1], | 32 | .entries = &stack_dump_trace[0], |
39 | }; | 33 | }; |
40 | 34 | ||
41 | static unsigned long max_stack_size; | 35 | static unsigned long max_stack_size; |
@@ -55,7 +49,7 @@ static inline void print_max_stack(void) | |||
55 | 49 | ||
56 | pr_emerg(" Depth Size Location (%d entries)\n" | 50 | pr_emerg(" Depth Size Location (%d entries)\n" |
57 | " ----- ---- --------\n", | 51 | " ----- ---- --------\n", |
58 | max_stack_trace.nr_entries - 1); | 52 | max_stack_trace.nr_entries); |
59 | 53 | ||
60 | for (i = 0; i < max_stack_trace.nr_entries; i++) { | 54 | for (i = 0; i < max_stack_trace.nr_entries; i++) { |
61 | if (stack_dump_trace[i] == ULONG_MAX) | 55 | if (stack_dump_trace[i] == ULONG_MAX) |
@@ -77,7 +71,7 @@ check_stack(unsigned long ip, unsigned long *stack) | |||
77 | unsigned long this_size, flags; unsigned long *p, *top, *start; | 71 | unsigned long this_size, flags; unsigned long *p, *top, *start; |
78 | static int tracer_frame; | 72 | static int tracer_frame; |
79 | int frame_size = ACCESS_ONCE(tracer_frame); | 73 | int frame_size = ACCESS_ONCE(tracer_frame); |
80 | int i; | 74 | int i, x; |
81 | 75 | ||
82 | this_size = ((unsigned long)stack) & (THREAD_SIZE-1); | 76 | this_size = ((unsigned long)stack) & (THREAD_SIZE-1); |
83 | this_size = THREAD_SIZE - this_size; | 77 | this_size = THREAD_SIZE - this_size; |
@@ -105,26 +99,20 @@ check_stack(unsigned long ip, unsigned long *stack) | |||
105 | max_stack_size = this_size; | 99 | max_stack_size = this_size; |
106 | 100 | ||
107 | max_stack_trace.nr_entries = 0; | 101 | max_stack_trace.nr_entries = 0; |
108 | 102 | max_stack_trace.skip = 3; | |
109 | if (using_ftrace_ops_list_func()) | ||
110 | max_stack_trace.skip = 4; | ||
111 | else | ||
112 | max_stack_trace.skip = 3; | ||
113 | 103 | ||
114 | save_stack_trace(&max_stack_trace); | 104 | save_stack_trace(&max_stack_trace); |
115 | 105 | ||
116 | /* | 106 | /* Skip over the overhead of the stack tracer itself */ |
117 | * Add the passed in ip from the function tracer. | 107 | for (i = 0; i < max_stack_trace.nr_entries; i++) { |
118 | * Searching for this on the stack will skip over | 108 | if (stack_dump_trace[i] == ip) |
119 | * most of the overhead from the stack tracer itself. | 109 | break; |
120 | */ | 110 | } |
121 | stack_dump_trace[0] = ip; | ||
122 | max_stack_trace.nr_entries++; | ||
123 | 111 | ||
124 | /* | 112 | /* |
125 | * Now find where in the stack these are. | 113 | * Now find where in the stack these are. |
126 | */ | 114 | */ |
127 | i = 0; | 115 | x = 0; |
128 | start = stack; | 116 | start = stack; |
129 | top = (unsigned long *) | 117 | top = (unsigned long *) |
130 | (((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE); | 118 | (((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE); |
@@ -139,12 +127,15 @@ check_stack(unsigned long ip, unsigned long *stack) | |||
139 | while (i < max_stack_trace.nr_entries) { | 127 | while (i < max_stack_trace.nr_entries) { |
140 | int found = 0; | 128 | int found = 0; |
141 | 129 | ||
142 | stack_dump_index[i] = this_size; | 130 | stack_dump_index[x] = this_size; |
143 | p = start; | 131 | p = start; |
144 | 132 | ||
145 | for (; p < top && i < max_stack_trace.nr_entries; p++) { | 133 | for (; p < top && i < max_stack_trace.nr_entries; p++) { |
134 | if (stack_dump_trace[i] == ULONG_MAX) | ||
135 | break; | ||
146 | if (*p == stack_dump_trace[i]) { | 136 | if (*p == stack_dump_trace[i]) { |
147 | this_size = stack_dump_index[i++] = | 137 | stack_dump_trace[x] = stack_dump_trace[i++]; |
138 | this_size = stack_dump_index[x++] = | ||
148 | (top - p) * sizeof(unsigned long); | 139 | (top - p) * sizeof(unsigned long); |
149 | found = 1; | 140 | found = 1; |
150 | /* Start the search from here */ | 141 | /* Start the search from here */ |
@@ -156,7 +147,7 @@ check_stack(unsigned long ip, unsigned long *stack) | |||
156 | * out what that is, then figure it out | 147 | * out what that is, then figure it out |
157 | * now. | 148 | * now. |
158 | */ | 149 | */ |
159 | if (unlikely(!tracer_frame) && i == 1) { | 150 | if (unlikely(!tracer_frame)) { |
160 | tracer_frame = (p - stack) * | 151 | tracer_frame = (p - stack) * |
161 | sizeof(unsigned long); | 152 | sizeof(unsigned long); |
162 | max_stack_size -= tracer_frame; | 153 | max_stack_size -= tracer_frame; |
@@ -168,6 +159,10 @@ check_stack(unsigned long ip, unsigned long *stack) | |||
168 | i++; | 159 | i++; |
169 | } | 160 | } |
170 | 161 | ||
162 | max_stack_trace.nr_entries = x; | ||
163 | for (; x < i; x++) | ||
164 | stack_dump_trace[x] = ULONG_MAX; | ||
165 | |||
171 | if (task_stack_end_corrupted(current)) { | 166 | if (task_stack_end_corrupted(current)) { |
172 | print_max_stack(); | 167 | print_max_stack(); |
173 | BUG(); | 168 | BUG(); |
@@ -192,24 +187,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip, | |||
192 | if (per_cpu(trace_active, cpu)++ != 0) | 187 | if (per_cpu(trace_active, cpu)++ != 0) |
193 | goto out; | 188 | goto out; |
194 | 189 | ||
195 | /* | 190 | ip += MCOUNT_INSN_SIZE; |
196 | * When fentry is used, the traced function does not get | ||
197 | * its stack frame set up, and we lose the parent. | ||
198 | * The ip is pretty useless because the function tracer | ||
199 | * was called before that function set up its stack frame. | ||
200 | * In this case, we use the parent ip. | ||
201 | * | ||
202 | * By adding the return address of either the parent ip | ||
203 | * or the current ip we can disregard most of the stack usage | ||
204 | * caused by the stack tracer itself. | ||
205 | * | ||
206 | * The function tracer always reports the address of where the | ||
207 | * mcount call was, but the stack will hold the return address. | ||
208 | */ | ||
209 | if (fentry) | ||
210 | ip = parent_ip; | ||
211 | else | ||
212 | ip += MCOUNT_INSN_SIZE; | ||
213 | 191 | ||
214 | check_stack(ip, &stack); | 192 | check_stack(ip, &stack); |
215 | 193 | ||
@@ -284,7 +262,7 @@ __next(struct seq_file *m, loff_t *pos) | |||
284 | { | 262 | { |
285 | long n = *pos - 1; | 263 | long n = *pos - 1; |
286 | 264 | ||
287 | if (n >= max_stack_trace.nr_entries || stack_dump_trace[n] == ULONG_MAX) | 265 | if (n > max_stack_trace.nr_entries || stack_dump_trace[n] == ULONG_MAX) |
288 | return NULL; | 266 | return NULL; |
289 | 267 | ||
290 | m->private = (void *)n; | 268 | m->private = (void *)n; |
@@ -354,7 +332,7 @@ static int t_show(struct seq_file *m, void *v) | |||
354 | seq_printf(m, " Depth Size Location" | 332 | seq_printf(m, " Depth Size Location" |
355 | " (%d entries)\n" | 333 | " (%d entries)\n" |
356 | " ----- ---- --------\n", | 334 | " ----- ---- --------\n", |
357 | max_stack_trace.nr_entries - 1); | 335 | max_stack_trace.nr_entries); |
358 | 336 | ||
359 | if (!stack_tracer_enabled && !max_stack_size) | 337 | if (!stack_tracer_enabled && !max_stack_size) |
360 | print_disabled(m); | 338 | print_disabled(m); |
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index aa1ea7b36fa8..d2f6d0be3503 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c | |||
@@ -601,7 +601,22 @@ static int probes_seq_show(struct seq_file *m, void *v) | |||
601 | 601 | ||
602 | seq_printf(m, "%c:%s/%s", c, tu->tp.call.class->system, | 602 | seq_printf(m, "%c:%s/%s", c, tu->tp.call.class->system, |
603 | trace_event_name(&tu->tp.call)); | 603 | trace_event_name(&tu->tp.call)); |
604 | seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset); | 604 | seq_printf(m, " %s:", tu->filename); |
605 | |||
606 | /* Don't print "0x (null)" when offset is 0 */ | ||
607 | if (tu->offset) { | ||
608 | seq_printf(m, "0x%p", (void *)tu->offset); | ||
609 | } else { | ||
610 | switch (sizeof(void *)) { | ||
611 | case 4: | ||
612 | seq_printf(m, "0x00000000"); | ||
613 | break; | ||
614 | case 8: | ||
615 | default: | ||
616 | seq_printf(m, "0x0000000000000000"); | ||
617 | break; | ||
618 | } | ||
619 | } | ||
605 | 620 | ||
606 | for (i = 0; i < tu->tp.nr_args; i++) | 621 | for (i = 0; i < tu->tp.nr_args; i++) |
607 | seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm); | 622 | seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm); |
@@ -1095,11 +1110,15 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, | |||
1095 | { | 1110 | { |
1096 | struct trace_event_call *call = &tu->tp.call; | 1111 | struct trace_event_call *call = &tu->tp.call; |
1097 | struct uprobe_trace_entry_head *entry; | 1112 | struct uprobe_trace_entry_head *entry; |
1113 | struct bpf_prog *prog = call->prog; | ||
1098 | struct hlist_head *head; | 1114 | struct hlist_head *head; |
1099 | void *data; | 1115 | void *data; |
1100 | int size, esize; | 1116 | int size, esize; |
1101 | int rctx; | 1117 | int rctx; |
1102 | 1118 | ||
1119 | if (prog && !trace_call_bpf(prog, regs)) | ||
1120 | return; | ||
1121 | |||
1103 | esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); | 1122 | esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); |
1104 | 1123 | ||
1105 | size = esize + tu->tp.size + dsize; | 1124 | size = esize + tu->tp.size + dsize; |
@@ -1289,6 +1308,7 @@ static int register_uprobe_event(struct trace_uprobe *tu) | |||
1289 | return -ENODEV; | 1308 | return -ENODEV; |
1290 | } | 1309 | } |
1291 | 1310 | ||
1311 | call->flags = TRACE_EVENT_FL_UPROBE; | ||
1292 | call->class->reg = trace_uprobe_register; | 1312 | call->class->reg = trace_uprobe_register; |
1293 | call->data = tu; | 1313 | call->data = tu; |
1294 | ret = trace_add_event_call(call); | 1314 | ret = trace_add_event_call(call); |
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 4109f8320684..88fefa68c516 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c | |||
@@ -39,6 +39,7 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) | |||
39 | cred->cap_inheritable = CAP_EMPTY_SET; | 39 | cred->cap_inheritable = CAP_EMPTY_SET; |
40 | cred->cap_permitted = CAP_FULL_SET; | 40 | cred->cap_permitted = CAP_FULL_SET; |
41 | cred->cap_effective = CAP_FULL_SET; | 41 | cred->cap_effective = CAP_FULL_SET; |
42 | cred->cap_ambient = CAP_EMPTY_SET; | ||
42 | cred->cap_bset = CAP_FULL_SET; | 43 | cred->cap_bset = CAP_FULL_SET; |
43 | #ifdef CONFIG_KEYS | 44 | #ifdef CONFIG_KEYS |
44 | key_put(cred->request_key_auth); | 45 | key_put(cred->request_key_auth); |
@@ -976,8 +977,8 @@ static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns) | |||
976 | if (user_ns == current_user_ns()) | 977 | if (user_ns == current_user_ns()) |
977 | return -EINVAL; | 978 | return -EINVAL; |
978 | 979 | ||
979 | /* Threaded processes may not enter a different user namespace */ | 980 | /* Tasks that share a thread group must share a user namespace */ |
980 | if (atomic_read(¤t->mm->mm_users) > 1) | 981 | if (!thread_group_empty(current)) |
981 | return -EINVAL; | 982 | return -EINVAL; |
982 | 983 | ||
983 | if (current->fs->users != 1) | 984 | if (current->fs->users != 1) |
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index a6ffa43f2993..64ed1c37bd1f 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
@@ -24,6 +24,7 @@ | |||
24 | #include <asm/irq_regs.h> | 24 | #include <asm/irq_regs.h> |
25 | #include <linux/kvm_para.h> | 25 | #include <linux/kvm_para.h> |
26 | #include <linux/perf_event.h> | 26 | #include <linux/perf_event.h> |
27 | #include <linux/kthread.h> | ||
27 | 28 | ||
28 | /* | 29 | /* |
29 | * The run state of the lockup detectors is controlled by the content of the | 30 | * The run state of the lockup detectors is controlled by the content of the |
@@ -66,7 +67,26 @@ unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); | |||
66 | #define for_each_watchdog_cpu(cpu) \ | 67 | #define for_each_watchdog_cpu(cpu) \ |
67 | for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask) | 68 | for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask) |
68 | 69 | ||
70 | /* | ||
71 | * The 'watchdog_running' variable is set to 1 when the watchdog threads | ||
72 | * are registered/started and is set to 0 when the watchdog threads are | ||
73 | * unregistered/stopped, so it is an indicator of whether the threads exist. | ||
74 | */ | ||
69 | static int __read_mostly watchdog_running; | 75 | static int __read_mostly watchdog_running; |
76 | /* | ||
77 | * If a subsystem has a need to deactivate the watchdog temporarily, it | ||
78 | * can use the suspend/resume interface to achieve this. The content of | ||
79 | * the 'watchdog_suspended' variable reflects this state. Existing threads | ||
80 | * are parked/unparked by the lockup_detector_{suspend|resume} functions | ||
81 | * (see comment blocks pertaining to those functions for further details). | ||
82 | * | ||
83 | * 'watchdog_suspended' also prevents threads from being registered/started | ||
84 | * or unregistered/stopped via parameters in /proc/sys/kernel, so the state | ||
85 | * of 'watchdog_running' cannot change while the watchdog is deactivated | ||
86 | * temporarily (see related code in 'proc' handlers). | ||
87 | */ | ||
88 | static int __read_mostly watchdog_suspended; | ||
89 | |||
70 | static u64 __read_mostly sample_period; | 90 | static u64 __read_mostly sample_period; |
71 | 91 | ||
72 | static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); | 92 | static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); |
@@ -613,46 +633,9 @@ static void watchdog_nmi_disable(unsigned int cpu) | |||
613 | } | 633 | } |
614 | } | 634 | } |
615 | 635 | ||
616 | void watchdog_nmi_enable_all(void) | ||
617 | { | ||
618 | int cpu; | ||
619 | |||
620 | mutex_lock(&watchdog_proc_mutex); | ||
621 | |||
622 | if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) | ||
623 | goto unlock; | ||
624 | |||
625 | get_online_cpus(); | ||
626 | for_each_watchdog_cpu(cpu) | ||
627 | watchdog_nmi_enable(cpu); | ||
628 | put_online_cpus(); | ||
629 | |||
630 | unlock: | ||
631 | mutex_unlock(&watchdog_proc_mutex); | ||
632 | } | ||
633 | |||
634 | void watchdog_nmi_disable_all(void) | ||
635 | { | ||
636 | int cpu; | ||
637 | |||
638 | mutex_lock(&watchdog_proc_mutex); | ||
639 | |||
640 | if (!watchdog_running) | ||
641 | goto unlock; | ||
642 | |||
643 | get_online_cpus(); | ||
644 | for_each_watchdog_cpu(cpu) | ||
645 | watchdog_nmi_disable(cpu); | ||
646 | put_online_cpus(); | ||
647 | |||
648 | unlock: | ||
649 | mutex_unlock(&watchdog_proc_mutex); | ||
650 | } | ||
651 | #else | 636 | #else |
652 | static int watchdog_nmi_enable(unsigned int cpu) { return 0; } | 637 | static int watchdog_nmi_enable(unsigned int cpu) { return 0; } |
653 | static void watchdog_nmi_disable(unsigned int cpu) { return; } | 638 | static void watchdog_nmi_disable(unsigned int cpu) { return; } |
654 | void watchdog_nmi_enable_all(void) {} | ||
655 | void watchdog_nmi_disable_all(void) {} | ||
656 | #endif /* CONFIG_HARDLOCKUP_DETECTOR */ | 639 | #endif /* CONFIG_HARDLOCKUP_DETECTOR */ |
657 | 640 | ||
658 | static struct smp_hotplug_thread watchdog_threads = { | 641 | static struct smp_hotplug_thread watchdog_threads = { |
@@ -666,46 +649,89 @@ static struct smp_hotplug_thread watchdog_threads = { | |||
666 | .unpark = watchdog_enable, | 649 | .unpark = watchdog_enable, |
667 | }; | 650 | }; |
668 | 651 | ||
669 | static void restart_watchdog_hrtimer(void *info) | 652 | /* |
653 | * park all watchdog threads that are specified in 'watchdog_cpumask' | ||
654 | */ | ||
655 | static int watchdog_park_threads(void) | ||
670 | { | 656 | { |
671 | struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer); | 657 | int cpu, ret = 0; |
672 | int ret; | ||
673 | 658 | ||
659 | get_online_cpus(); | ||
660 | for_each_watchdog_cpu(cpu) { | ||
661 | ret = kthread_park(per_cpu(softlockup_watchdog, cpu)); | ||
662 | if (ret) | ||
663 | break; | ||
664 | } | ||
665 | if (ret) { | ||
666 | for_each_watchdog_cpu(cpu) | ||
667 | kthread_unpark(per_cpu(softlockup_watchdog, cpu)); | ||
668 | } | ||
669 | put_online_cpus(); | ||
670 | |||
671 | return ret; | ||
672 | } | ||
673 | |||
674 | /* | ||
675 | * unpark all watchdog threads that are specified in 'watchdog_cpumask' | ||
676 | */ | ||
677 | static void watchdog_unpark_threads(void) | ||
678 | { | ||
679 | int cpu; | ||
680 | |||
681 | get_online_cpus(); | ||
682 | for_each_watchdog_cpu(cpu) | ||
683 | kthread_unpark(per_cpu(softlockup_watchdog, cpu)); | ||
684 | put_online_cpus(); | ||
685 | } | ||
686 | |||
687 | /* | ||
688 | * Suspend the hard and soft lockup detector by parking the watchdog threads. | ||
689 | */ | ||
690 | int lockup_detector_suspend(void) | ||
691 | { | ||
692 | int ret = 0; | ||
693 | |||
694 | mutex_lock(&watchdog_proc_mutex); | ||
674 | /* | 695 | /* |
675 | * No need to cancel and restart hrtimer if it is currently executing | 696 | * Multiple suspend requests can be active in parallel (counted by |
676 | * because it will reprogram itself with the new period now. | 697 | * the 'watchdog_suspended' variable). If the watchdog threads are |
677 | * We should never see it unqueued here because we are running per-cpu | 698 | * running, the first caller takes care that they will be parked. |
678 | * with interrupts disabled. | 699 | * The state of 'watchdog_running' cannot change while a suspend |
700 | * request is active (see related code in 'proc' handlers). | ||
679 | */ | 701 | */ |
680 | ret = hrtimer_try_to_cancel(hrtimer); | 702 | if (watchdog_running && !watchdog_suspended) |
681 | if (ret == 1) | 703 | ret = watchdog_park_threads(); |
682 | hrtimer_start(hrtimer, ns_to_ktime(sample_period), | 704 | |
683 | HRTIMER_MODE_REL_PINNED); | 705 | if (ret == 0) |
706 | watchdog_suspended++; | ||
707 | |||
708 | mutex_unlock(&watchdog_proc_mutex); | ||
709 | |||
710 | return ret; | ||
684 | } | 711 | } |
685 | 712 | ||
686 | static void update_watchdog(int cpu) | 713 | /* |
714 | * Resume the hard and soft lockup detector by unparking the watchdog threads. | ||
715 | */ | ||
716 | void lockup_detector_resume(void) | ||
687 | { | 717 | { |
718 | mutex_lock(&watchdog_proc_mutex); | ||
719 | |||
720 | watchdog_suspended--; | ||
688 | /* | 721 | /* |
689 | * Make sure that perf event counter will adopt to a new | 722 | * The watchdog threads are unparked if they were previously running |
690 | * sampling period. Updating the sampling period directly would | 723 | * and if there is no more active suspend request. |
691 | * be much nicer but we do not have an API for that now so | ||
692 | * let's use a big hammer. | ||
693 | * Hrtimer will adopt the new period on the next tick but this | ||
694 | * might be late already so we have to restart the timer as well. | ||
695 | */ | 724 | */ |
696 | watchdog_nmi_disable(cpu); | 725 | if (watchdog_running && !watchdog_suspended) |
697 | smp_call_function_single(cpu, restart_watchdog_hrtimer, NULL, 1); | 726 | watchdog_unpark_threads(); |
698 | watchdog_nmi_enable(cpu); | 727 | |
728 | mutex_unlock(&watchdog_proc_mutex); | ||
699 | } | 729 | } |
700 | 730 | ||
701 | static void update_watchdog_all_cpus(void) | 731 | static void update_watchdog_all_cpus(void) |
702 | { | 732 | { |
703 | int cpu; | 733 | watchdog_park_threads(); |
704 | 734 | watchdog_unpark_threads(); | |
705 | get_online_cpus(); | ||
706 | for_each_watchdog_cpu(cpu) | ||
707 | update_watchdog(cpu); | ||
708 | put_online_cpus(); | ||
709 | } | 735 | } |
710 | 736 | ||
711 | static int watchdog_enable_all_cpus(void) | 737 | static int watchdog_enable_all_cpus(void) |
@@ -713,15 +739,12 @@ static int watchdog_enable_all_cpus(void) | |||
713 | int err = 0; | 739 | int err = 0; |
714 | 740 | ||
715 | if (!watchdog_running) { | 741 | if (!watchdog_running) { |
716 | err = smpboot_register_percpu_thread(&watchdog_threads); | 742 | err = smpboot_register_percpu_thread_cpumask(&watchdog_threads, |
743 | &watchdog_cpumask); | ||
717 | if (err) | 744 | if (err) |
718 | pr_err("Failed to create watchdog threads, disabled\n"); | 745 | pr_err("Failed to create watchdog threads, disabled\n"); |
719 | else { | 746 | else |
720 | if (smpboot_update_cpumask_percpu_thread( | ||
721 | &watchdog_threads, &watchdog_cpumask)) | ||
722 | pr_err("Failed to set cpumask for watchdog threads\n"); | ||
723 | watchdog_running = 1; | 747 | watchdog_running = 1; |
724 | } | ||
725 | } else { | 748 | } else { |
726 | /* | 749 | /* |
727 | * Enable/disable the lockup detectors or | 750 | * Enable/disable the lockup detectors or |
@@ -787,6 +810,12 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, | |||
787 | 810 | ||
788 | mutex_lock(&watchdog_proc_mutex); | 811 | mutex_lock(&watchdog_proc_mutex); |
789 | 812 | ||
813 | if (watchdog_suspended) { | ||
814 | /* no parameter changes allowed while watchdog is suspended */ | ||
815 | err = -EAGAIN; | ||
816 | goto out; | ||
817 | } | ||
818 | |||
790 | /* | 819 | /* |
791 | * If the parameter is being read return the state of the corresponding | 820 | * If the parameter is being read return the state of the corresponding |
792 | * bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the | 821 | * bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the |
@@ -872,6 +901,12 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, | |||
872 | 901 | ||
873 | mutex_lock(&watchdog_proc_mutex); | 902 | mutex_lock(&watchdog_proc_mutex); |
874 | 903 | ||
904 | if (watchdog_suspended) { | ||
905 | /* no parameter changes allowed while watchdog is suspended */ | ||
906 | err = -EAGAIN; | ||
907 | goto out; | ||
908 | } | ||
909 | |||
875 | old = ACCESS_ONCE(watchdog_thresh); | 910 | old = ACCESS_ONCE(watchdog_thresh); |
876 | err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); | 911 | err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); |
877 | 912 | ||
@@ -903,6 +938,13 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, | |||
903 | int err; | 938 | int err; |
904 | 939 | ||
905 | mutex_lock(&watchdog_proc_mutex); | 940 | mutex_lock(&watchdog_proc_mutex); |
941 | |||
942 | if (watchdog_suspended) { | ||
943 | /* no parameter changes allowed while watchdog is suspended */ | ||
944 | err = -EAGAIN; | ||
945 | goto out; | ||
946 | } | ||
947 | |||
906 | err = proc_do_large_bitmap(table, write, buffer, lenp, ppos); | 948 | err = proc_do_large_bitmap(table, write, buffer, lenp, ppos); |
907 | if (!err && write) { | 949 | if (!err && write) { |
908 | /* Remove impossible cpus to keep sysctl output cleaner. */ | 950 | /* Remove impossible cpus to keep sysctl output cleaner. */ |
@@ -920,6 +962,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, | |||
920 | pr_err("cpumask update failed\n"); | 962 | pr_err("cpumask update failed\n"); |
921 | } | 963 | } |
922 | } | 964 | } |
965 | out: | ||
923 | mutex_unlock(&watchdog_proc_mutex); | 966 | mutex_unlock(&watchdog_proc_mutex); |
924 | return err; | 967 | return err; |
925 | } | 968 | } |
@@ -932,10 +975,8 @@ void __init lockup_detector_init(void) | |||
932 | 975 | ||
933 | #ifdef CONFIG_NO_HZ_FULL | 976 | #ifdef CONFIG_NO_HZ_FULL |
934 | if (tick_nohz_full_enabled()) { | 977 | if (tick_nohz_full_enabled()) { |
935 | if (!cpumask_empty(tick_nohz_full_mask)) | 978 | pr_info("Disabling watchdog on nohz_full cores by default\n"); |
936 | pr_info("Disabling watchdog on nohz_full cores by default\n"); | 979 | cpumask_copy(&watchdog_cpumask, housekeeping_mask); |
937 | cpumask_andnot(&watchdog_cpumask, cpu_possible_mask, | ||
938 | tick_nohz_full_mask); | ||
939 | } else | 980 | } else |
940 | cpumask_copy(&watchdog_cpumask, cpu_possible_mask); | 981 | cpumask_copy(&watchdog_cpumask, cpu_possible_mask); |
941 | #else | 982 | #else |
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4c4f06176f74..ca71582fcfab 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -338,20 +338,20 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq); | |||
338 | #include <trace/events/workqueue.h> | 338 | #include <trace/events/workqueue.h> |
339 | 339 | ||
340 | #define assert_rcu_or_pool_mutex() \ | 340 | #define assert_rcu_or_pool_mutex() \ |
341 | rcu_lockdep_assert(rcu_read_lock_sched_held() || \ | 341 | RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \ |
342 | lockdep_is_held(&wq_pool_mutex), \ | 342 | !lockdep_is_held(&wq_pool_mutex), \ |
343 | "sched RCU or wq_pool_mutex should be held") | 343 | "sched RCU or wq_pool_mutex should be held") |
344 | 344 | ||
345 | #define assert_rcu_or_wq_mutex(wq) \ | 345 | #define assert_rcu_or_wq_mutex(wq) \ |
346 | rcu_lockdep_assert(rcu_read_lock_sched_held() || \ | 346 | RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \ |
347 | lockdep_is_held(&wq->mutex), \ | 347 | !lockdep_is_held(&wq->mutex), \ |
348 | "sched RCU or wq->mutex should be held") | 348 | "sched RCU or wq->mutex should be held") |
349 | 349 | ||
350 | #define assert_rcu_or_wq_mutex_or_pool_mutex(wq) \ | 350 | #define assert_rcu_or_wq_mutex_or_pool_mutex(wq) \ |
351 | rcu_lockdep_assert(rcu_read_lock_sched_held() || \ | 351 | RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \ |
352 | lockdep_is_held(&wq->mutex) || \ | 352 | !lockdep_is_held(&wq->mutex) && \ |
353 | lockdep_is_held(&wq_pool_mutex), \ | 353 | !lockdep_is_held(&wq_pool_mutex), \ |
354 | "sched RCU, wq->mutex or wq_pool_mutex should be held") | 354 | "sched RCU, wq->mutex or wq_pool_mutex should be held") |
355 | 355 | ||
356 | #define for_each_cpu_worker_pool(pool, cpu) \ | 356 | #define for_each_cpu_worker_pool(pool, cpu) \ |
357 | for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \ | 357 | for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \ |
@@ -1714,9 +1714,7 @@ static struct worker *create_worker(struct worker_pool *pool) | |||
1714 | goto fail; | 1714 | goto fail; |
1715 | 1715 | ||
1716 | set_user_nice(worker->task, pool->attrs->nice); | 1716 | set_user_nice(worker->task, pool->attrs->nice); |
1717 | 1717 | kthread_bind_mask(worker->task, pool->attrs->cpumask); | |
1718 | /* prevent userland from meddling with cpumask of workqueue workers */ | ||
1719 | worker->task->flags |= PF_NO_SETAFFINITY; | ||
1720 | 1718 | ||
1721 | /* successful, attach the worker to the pool */ | 1719 | /* successful, attach the worker to the pool */ |
1722 | worker_attach_to_pool(worker, pool); | 1720 | worker_attach_to_pool(worker, pool); |
@@ -2614,7 +2612,7 @@ void flush_workqueue(struct workqueue_struct *wq) | |||
2614 | out_unlock: | 2612 | out_unlock: |
2615 | mutex_unlock(&wq->mutex); | 2613 | mutex_unlock(&wq->mutex); |
2616 | } | 2614 | } |
2617 | EXPORT_SYMBOL_GPL(flush_workqueue); | 2615 | EXPORT_SYMBOL(flush_workqueue); |
2618 | 2616 | ||
2619 | /** | 2617 | /** |
2620 | * drain_workqueue - drain a workqueue | 2618 | * drain_workqueue - drain a workqueue |
@@ -3856,7 +3854,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, | |||
3856 | } | 3854 | } |
3857 | 3855 | ||
3858 | wq->rescuer = rescuer; | 3856 | wq->rescuer = rescuer; |
3859 | rescuer->task->flags |= PF_NO_SETAFFINITY; | 3857 | kthread_bind_mask(rescuer->task, cpu_possible_mask); |
3860 | wake_up_process(rescuer->task); | 3858 | wake_up_process(rescuer->task); |
3861 | } | 3859 | } |
3862 | 3860 | ||
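Two independent workqueue.c cleanups close out the diff: worker and rescuer threads are now bound with kthread_bind_mask() instead of manually setting PF_NO_SETAFFINITY, and the lockdep assertions move from rcu_lockdep_assert() to RCU_LOCKDEP_WARN(). The latter flips polarity: the old macro took the condition that must hold, the new one takes the condition that signals a problem, so "A || B" becomes "!A && !B" by De Morgan. A minimal sketch of the same transformation; my_mutex is a hypothetical example lock:

/* Old style: assert the condition that must be true. */
rcu_lockdep_assert(rcu_read_lock_sched_held() ||
		   lockdep_is_held(&my_mutex),
		   "sched RCU or my_mutex should be held");

/* New style: warn when the failure condition is true -- the De Morgan
 * negation of the line above. */
RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&
		 !lockdep_is_held(&my_mutex),
		 "sched RCU or my_mutex should be held");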